diff --git a/.github/maven-settings.xml b/.github/maven-settings.xml new file mode 100644 index 000000000..9c8a6c405 --- /dev/null +++ b/.github/maven-settings.xml @@ -0,0 +1,24 @@ + + + + + sonatype-snapshots + + + sonatype-nexus-snapshots + Sonatype Nexus Snapshots + https://oss.sonatype.org/content/repositories/snapshots + + false + + + true + + + + + + + sonatype-snapshots + + \ No newline at end of file diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml new file mode 100644 index 000000000..1d065c4e8 --- /dev/null +++ b/.github/workflows/maven.yml @@ -0,0 +1,33 @@ +# This workflow will build a Java project with Maven +# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven + +name: Java CI with Maven + +on: + push: + branches: [ master, v2.6 ] + pull_request: + branches: [ master, v2.6 ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install python dependencies + run: | + python -m pip install --upgrade pip + pip install flair==0.11.3 + - uses: actions/checkout@v2 + - name: Set up JDK 11 + uses: actions/setup-java@v2 + with: + java-version: '11' + distribution: 'adopt' + - name: Build with Maven + run: mvn -B package --file pom.xml --settings .github/maven-settings.xml diff --git a/.gitignore b/.gitignore index 247d87c61..6da01ef44 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ target **/*.iml /julie-xml-tools.jar +/jcore-pmc-db-reader/src/test/resources/hiddenConfig diff --git a/.travis.yml b/.travis.yml index 208b0219a..bce762cc1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,8 +6,8 @@ addons: sources: - deadsnakes packages: - - python3.6 - - python3.6-dev + - python3.7 + - python3.7-dev env: global: @@ -19,7 +19,7 @@ env: - # GPG_KEY_NAME - secure: 
pxYxmA/9xS/9DO6rUAhlbAtYQMmG633jSwG8OIVCnnoQSXS4UILJgNl7Q6dQsAuT27tk+/fin0kXTnxWqCe0URb3c3XgNQwfGAuz1JIYVPHvezoDQLLRQA6LRgqd7GuvBDsyXJvBANozGKJYJVfoeT9gqFosFuMdRZ88eQm+ltX7zVKyMiz2rqKYPoSFInNxDGMOaIQ+RZdf8ai8rLY3E11PxsMC0LgypEDbuC7d9Q+Tu89YfUeuRly0hAuxmW++RrMgeeAs/7BndmZqcHVpkrcX6Drq8nZ2cj0ev4IDJelV/Nd17Vjfg7HgfJ4/d9S+PCg4KhvOY/y9Xad8geIIzXLFD9ZgcaK7MT9+BFGYXj7ExizFSc+Ico5Q822RJA1XZWfc/EgnY+7jEZCCMz/ceHx8oSh0ce1VbPl7c+O+jMXUMQC69Gpys57XC48rdPn0bbjc4/jpSOq46Xv7YdcGuA2BcWEEeQ0WAbi9IDcevpCXiZ7kng5hHTCpfaYVhn63KAIAMKf7tu6C78wFZR63F8Gf4x/jKE37QqvHV3uOzD7ar6nTAuy/ukZK0p4zyeIYe25PnS9K4kpolT1I12i7/l/7MO9NPFdB0aOCBHUNPBEkifwceltX6RP4PDIKdtCEQ4vcqrRNvhtAhO9Vo1udkyaeFx5swbY3j11CjzcfrBE= - # GPG_PASSPHRASE - - PYTHON=/usr/bin/python3.6 + - PYTHON=/usr/bin/python3.7 before_install: - | @@ -31,11 +31,11 @@ before_install: if ! find "$HOME/pip-cache" -mindepth 1 -print -quit 2>/dev/null | grep -q .; then $PYTHON -m pip download --destination-directory="$HOME/pip-cache" flair fi - sudo -H $PYTHON -m pip install --find-links="$HOME/pip-cache" flair==0.4.5 + sudo -H $PYTHON -m pip install --find-links="$HOME/pip-cache" flair==0.6.1 torch==1.7.1 - #./travis-deployment/install-flair-nightly.sh - export BOTO_CONFIG=/dev/null install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V -script: mvn -T 1C test -B +script: mvn -T 2C test -B cache: directories: @@ -51,4 +51,4 @@ deploy: skip_cleanup: true on: all_branches: true - condition: $TRAVIS_BRANCH =~ ^v2.5|master$ \ No newline at end of file + condition: $TRAVIS_BRANCH =~ ^v2.6|master$ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..7e93520be --- /dev/null +++ b/LICENSE @@ -0,0 +1,25 @@ +BSD 2-Clause License + +Copyright (c) 2022, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 9035ccbb1..79c1fbc99 100644 --- a/README.md +++ b/README.md @@ -12,24 +12,29 @@ In order to automate the builds of complex NLP pipelines and properly represent A description for each individual component can be found in their respective `README.md`. ### Requirements & Dependencies -In order to use our components you need at least [JDK 11](https://www.oracle.com/technetwork/java/javase/downloads/jdk11-downloads-5066655.html) (Java SE Development Kit 11), [UIMA 2.10](https://uima.apache.org/index.html) & [Maven 3](https://maven.apache.org/). We develop with the [Eclipse IDE for Java Developers](http://www.eclipse.org/downloads/) and [IntelliJ IDEA](https://www.jetbrains.com/idea/) Java IDEs. 
If course you're free to try it with different versions or tools than those mentioned, but we can't make promises for a flawless functioning of our components in these cases. +In order to use our components you need at least [JDK 11](https://www.oracle.com/technetwork/java/javase/downloads/jdk11-downloads-5066655.html) (Java SE Development Kit 11), [UIMA 2.x](https://uima.apache.org/index.html) & [Maven 3](https://maven.apache.org/). We develop with the [Eclipse IDE for Java Developers](http://www.eclipse.org/downloads/) and [IntelliJ IDEA](https://www.jetbrains.com/idea/) Java IDEs. Of course you're free to try it with different versions or tools than those mentioned, but we can't make promises for a flawless functioning of our components in these cases. ### UIMA's Collection Processing Engine (CPE) -UIMA features a relatively easy way to combine UIMA components together in order to analyze a collection of artifacts. If you're not firm or willing to deal with Java Code, the usage of a CPE might be the right choice. +UIMA offers a relatively easy way to combine UIMA components together in order to analyze a collection of artifacts. If you're not firm or willing to deal with Java Code, the usage of a CPE might be the right choice. For more detailed information see [UIMA's CPE Documentation](https://uima.apache.org/downloads/releaseDocs/2.1.0-incubating/docs/html/tutorials_and_users_guides/tutorials_and_users_guides.html#ugr.tug.cpe). -We're also working on a simple [Python script](https://github.com/JULIELab/jcore-misc/tree/master/jcore-cpe-builder) that builds rudimentary and preconfigured CPEs of your choice. It's working but still work in progress so please bear with us and post issues. +A newer alternative is [UIMA AS](https://uima.apache.org/doc-uimaas-what.html). It is today's officially recommended way to use and scale UIMA pipelines. Our existing CPE infrastructure serves us well, however, so we mostly stick to those for the time being. 
+ +### JCoRe UIMA Pipeline Builder + +Most CPE configurations consisting of JCoRe components can be easily built using the [JCoRe UIMA Pipeline Builder](https://github.com/JULIELab/jcore-pipeline-modules). +This is a Java program that offers a simple command line interface for the creation of CPEs. There is also support for UIMA AS. ### Maven Artifacts If not stated otherwise, all the components found in this project are at least in their latest release version also available as Maven artifacts: ``` de.julielab - #COMPONENT-NAME + COMPONENT-NAME ${jcore-version} ``` -Where `#COMPONENT-NAME` is exactly the same as the name on GitHub. +Where `COMPONENT-NAME` is exactly the same as the name on GitHub. For instance, to get the Acronym Resolver, include this in your Maven dependencies: ``` diff --git a/jcore-ace-reader/component.meta b/jcore-ace-reader/component.meta index 65d83f33b..ac1392e63 100644 --- a/jcore-ace-reader/component.meta +++ b/jcore-ace-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-ace-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe ACE Reader" } diff --git a/jcore-ace-reader/pom.xml b/jcore-ace-reader/pom.xml index fad4ca485..c4fa13273 100644 --- a/jcore-ace-reader/pom.xml +++ b/jcore-ace-reader/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -32,8 +32,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab diff --git a/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml b/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml index 6d7d29ff9..576236d5c 100644 --- a/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml +++ b/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml @@ -5,7 +5,7 @@ AceReader Descriptor automatically generated by 
uimaFIT - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java b/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java index 465a384f7..b6bd606e4 100644 --- a/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java +++ b/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java @@ -21,7 +21,6 @@ import de.julielab.jcore.types.ArgumentMention; import de.julielab.jcore.types.EntityMention; import de.julielab.jcore.types.ace.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.metadata.AnalysisEngineMetaData; import org.apache.uima.cas.CAS; @@ -38,6 +37,8 @@ import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; import org.apache.uima.util.XMLSerializer; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.w3c.dom.Node; import org.xml.sax.SAXException; @@ -50,7 +51,9 @@ import java.util.ArrayList; import java.util.Iterator; -public class AceReaderTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class AceReaderTest { /** * Path to the MedlineReader descriptor */ @@ -65,47 +68,46 @@ public class AceReaderTest extends TestCase { /** * Object to be tested */ - private CollectionReader aceReader; + private static CollectionReader aceReader; /** * Auxiliary collection reader */ - private CollectionReader testReader; + private static CollectionReader testReader; /** * CAS array list with CAS objects that where processed by the aceReader */ - private ArrayList casArrayList = new ArrayList(); + private static ArrayList casArrayList = new ArrayList(); /** * Auxiliary CAS objects */ - private CAS aceReaderCas; + private static CAS aceReaderCas; - private CAS testReaderCas; + private static CAS testReaderCas; - private JCas aceReaderJCas; + private 
static JCas aceReaderJCas; - private JCas testReaderJCas; + private static JCas testReaderJCas; - LOC entity1_1; + static LOC entity1_1; - LOC entity1_2; + static LOC entity1_2; - GPE entity2_1; + static GPE entity2_1; - GPE entity2_2; + static GPE entity2_2; - GPE entity2_3; + static GPE entity2_3; - GPE entity2_4; + static GPE entity2_4; /*----------------------------------------------------------------------------------------------*/ - @Override - protected void setUp() throws Exception { + @BeforeAll + protected static void setUp() throws Exception { aceReader = getCollectionReader(ACE_READER_DESCRIPTOR); processAllCases(); - super.setUp(); System.out.println("ALL CASes were processed"); } // of setUp @@ -118,7 +120,7 @@ protected void setUp() throws Exception { * @throws SAXException * @throws ParserConfigurationException */ - private void processAllCases() throws CASException, SAXException, ParserConfigurationException { + private static void processAllCases() throws CASException, SAXException, ParserConfigurationException { try { while (aceReader.hasNext()) { @@ -157,13 +159,13 @@ private void processAllCases() throws CASException, SAXException, ParserConfigur } // of processAllCases /*----------------------------------------------------------------------------------------------*/ - private void compareCASes() { - assertTrue("Invalid source file attributes!", checkSourceFile()); - assertTrue("Invalid generated Jules Components!", checkGeneratedJulesComponents()); + private static void compareCASes() { + assertTrue(checkSourceFile(), "Invalid source file attributes!"); + assertTrue(checkGeneratedJulesComponents(), "Invalid generated Jules Components!"); } // compareCASes /*----------------------------------------------------------------------------------------------*/ - private boolean checkGeneratedJulesComponents() { + private static boolean checkGeneratedJulesComponents() { System.out.println("CALL checkGeneratedJulesComponents()"); boolean 
julesComponentsEqual = true; @@ -185,7 +187,7 @@ private boolean checkGeneratedJulesComponents() { } // checkGeneratedJulesComponents /*----------------------------------------------------------------------------------------------*/ - private boolean checkJulesEntities() { + private static boolean checkJulesEntities() { System.out.println("CALL checkJulesEntities()"); boolean julesEntityEqual = true; @@ -237,7 +239,7 @@ private boolean checkJulesEntities() { } // of checkJulesEntities /*----------------------------------------------------------------------------------------------*/ - private boolean checkJulesRelations() { + private static boolean checkJulesRelations() { System.out.println("CALL checkJulesRelations()"); boolean juleRelationEqual = true; @@ -286,8 +288,8 @@ private boolean checkJulesRelations() { } // of checkJulesRelations /*----------------------------------------------------------------------------------------------*/ - private boolean checkJulesRelationArguments(de.julielab.jcore.types.RelationMention aceReaderRelation, - de.julielab.jcore.types.RelationMention testReaderRelation) { + private static boolean checkJulesRelationArguments(de.julielab.jcore.types.RelationMention aceReaderRelation, + de.julielab.jcore.types.RelationMention testReaderRelation) { System.out.println("CALL checkJulesRelationArguments()"); boolean julesRelationArgumentEqual = true; @@ -449,7 +451,7 @@ private boolean checkJulesEventArguments(de.julielab.jcore.types.EventMention ac } // of checkJulesEventArguments /*----------------------------------------------------------------------------------------------*/ - private boolean checkSourceFile() { + private static boolean checkSourceFile() { boolean sourceFileEqual = true; Iterator aceReaderIterator = getTypeIterator(aceReaderCas, de.julielab.jcore.types.ace.SourceFile.type); @@ -499,7 +501,7 @@ private boolean checkSourceFile() { } // checkSourceFile 
/*----------------------------------------------------------------------------------------------*/ - private boolean checkDocument() { + private static boolean checkDocument() { boolean documentEqual = true; Iterator aceReaderIterator = getTypeIterator(aceReaderCas, de.julielab.jcore.types.ace.Document.type); @@ -568,7 +570,7 @@ private boolean checkDocument() { } // of checkDocument /*----------------------------------------------------------------------------------------------*/ - private boolean checkEvents(Document aceReaderDocument, Document testReaderDocument) { + private static boolean checkEvents(Document aceReaderDocument, Document testReaderDocument) { System.out.println("CALL checkEvents()"); boolean eventEqual = true; @@ -641,7 +643,7 @@ private boolean checkEvents(Document aceReaderDocument, Document testReaderDocum } // of checkEvents /*----------------------------------------------------------------------------------------------*/ - private boolean checkEventMentions(Event aceReaderEvent, Event testReaderEvent) { + private static boolean checkEventMentions(Event aceReaderEvent, Event testReaderEvent) { boolean eventMentionEqual = true; FSArray aceReaderEventMentionFSArray = aceReaderEvent.getMentions(); @@ -703,7 +705,7 @@ private boolean checkEventMentions(Event aceReaderEvent, Event testReaderEvent) } // checkEventMentions /*----------------------------------------------------------------------------------------------*/ - private boolean checkEventMentionArguments(EventMention aceReaderEventMention, EventMention testReaderEventMention) { + private static boolean checkEventMentionArguments(EventMention aceReaderEventMention, EventMention testReaderEventMention) { boolean eventMentionArgumentEqual = true; FSArray aceReaderEventMentionArgumentFSArray = aceReaderEventMention.getArguments(); @@ -740,7 +742,7 @@ private boolean checkEventMentionArguments(EventMention aceReaderEventMention, E } // of checkEventMentionArguments 
/*----------------------------------------------------------------------------------------------*/ - private boolean checkEventArguments(Event aceReaderEvent, Event testReaderEvent) { + private static boolean checkEventArguments(Event aceReaderEvent, Event testReaderEvent) { boolean eventArgumentEqual = true; FSArray aceReaderEventArgumentFSArray = aceReaderEvent.getArguments(); @@ -767,7 +769,7 @@ private boolean checkEventArguments(Event aceReaderEvent, Event testReaderEvent) } // of checkEventArguments /*----------------------------------------------------------------------------------------------*/ - private boolean checkRelations(Document aceReaderDocument, Document testReaderDocument) { + private static boolean checkRelations(Document aceReaderDocument, Document testReaderDocument) { boolean relationEqual = true; FSArray aceReaderRelationFSArray = aceReaderDocument.getRelations(); @@ -830,7 +832,7 @@ private boolean checkRelations(Document aceReaderDocument, Document testReaderDo } // of checkRelations /*----------------------------------------------------------------------------------------------*/ - private boolean checkRelationMentions(Relation aceReaderRelation, Relation testReaderRelation) { + private static boolean checkRelationMentions(Relation aceReaderRelation, Relation testReaderRelation) { boolean relationMentionEqual = true; FSArray aceReaderRelationMentionFSArray = aceReaderRelation.getMentions(); @@ -885,8 +887,8 @@ private boolean checkRelationMentions(Relation aceReaderRelation, Relation testR } // checkRelationMentions /*----------------------------------------------------------------------------------------------*/ - private boolean checkRelationMentionArguments(RelationMention aceReaderRelationMention, - RelationMention testReaderRelationMention) { + private static boolean checkRelationMentionArguments(RelationMention aceReaderRelationMention, + RelationMention testReaderRelationMention) { boolean relationMentionArgumentEqual = true; 
FSArray aceReaderRelationMentionArgumentFSArray = aceReaderRelationMention.getArguments(); @@ -925,7 +927,7 @@ private boolean checkRelationMentionArguments(RelationMention aceReaderRelationM } /*----------------------------------------------------------------------------------------------*/ - private boolean checkRelationArguments(Relation aceReaderRelation, Relation testReaderRelation) { + private static boolean checkRelationArguments(Relation aceReaderRelation, Relation testReaderRelation) { boolean relationArgumentEqual = true; FSArray aceReaderRelationArgumentFSArray = aceReaderRelation.getArguments(); @@ -952,7 +954,7 @@ private boolean checkRelationArguments(Relation aceReaderRelation, Relation test } // checkRelationArguments /*----------------------------------------------------------------------------------------------*/ - private boolean checkTimex2(Document aceReaderDocument, Document testReaderDocument) { + private static boolean checkTimex2(Document aceReaderDocument, Document testReaderDocument) { boolean timex2Equal = true; FSArray aceReaderTimex2FSArray = aceReaderDocument.getTimex2(); @@ -985,7 +987,7 @@ private boolean checkTimex2(Document aceReaderDocument, Document testReaderDocum } // checkTimex2 /*----------------------------------------------------------------------------------------------*/ - private boolean checkTimex2Mentions(Timex2 aceReaderTimex2, Timex2 testReaderTimex2) { + private static boolean checkTimex2Mentions(Timex2 aceReaderTimex2, Timex2 testReaderTimex2) { boolean timex2MentionEqual = true; FSArray aceReaderTimex2MentionFSArray = aceReaderTimex2.getMentions(); @@ -1017,7 +1019,7 @@ private boolean checkTimex2Mentions(Timex2 aceReaderTimex2, Timex2 testReaderTim } // of checkTimex2Mentions /*----------------------------------------------------------------------------------------------*/ - private boolean checkValues(Document aceReaderDocument, Document testReaderDocument) { + private static boolean checkValues(Document 
aceReaderDocument, Document testReaderDocument) { boolean valueEqual = true; FSArray aceReaderValueFSArray = aceReaderDocument.getValues(); @@ -1060,7 +1062,7 @@ private boolean checkValues(Document aceReaderDocument, Document testReaderDocum } // of checkValues /*----------------------------------------------------------------------------------------------*/ - private boolean checkValueMentions(Value aceReaderValue, Value testReaderValue) { + private static boolean checkValueMentions(Value aceReaderValue, Value testReaderValue) { boolean valueMentionEqual = true; FSArray aceReaderValueMentionFSArray = aceReaderValue.getMentions(); @@ -1093,7 +1095,7 @@ private boolean checkValueMentions(Value aceReaderValue, Value testReaderValue) } // of checkValueMentions /*----------------------------------------------------------------------------------------------*/ - private boolean checkEntities() { + private static boolean checkEntities() { boolean entityEqual = true; Iterator aceReaderIterator = getTypeIterator(aceReaderCas, de.julielab.jcore.types.ace.Entity.type); @@ -1176,7 +1178,7 @@ private boolean checkEntities() { } // checkEntities /*----------------------------------------------------------------------------------------------*/ - private boolean checkEntityAttributes(Entity aceReaderEntity, Entity testReaderEntity) { + private static boolean checkEntityAttributes(Entity aceReaderEntity, Entity testReaderEntity) { boolean entityAttributeEqual = true; FSArray aceReaderEntityAttributeFSArray = aceReaderEntity.getEntity_attributes(); FSArray testReaderEntityAttributeFSArray = testReaderEntity.getEntity_attributes(); @@ -1208,8 +1210,8 @@ private boolean checkEntityAttributes(Entity aceReaderEntity, Entity testReaderE } // of checkEntityAttributes /*----------------------------------------------------------------------------------------------*/ - private boolean checkEntityAttributesNames(EntityAttribute aceReaderEntityAttribute, - EntityAttribute 
testReaderEntityAttribute) { + private static boolean checkEntityAttributesNames(EntityAttribute aceReaderEntityAttribute, + EntityAttribute testReaderEntityAttribute) { boolean entityAttributesNamesEqual = true; FSArray aceReaderEntityAttributesNamesFSArray = aceReaderEntityAttribute.getNames(); FSArray testReaderEntityAttributesNamesFSArray = testReaderEntityAttribute.getNames(); @@ -1241,7 +1243,7 @@ private boolean checkEntityAttributesNames(EntityAttribute aceReaderEntityAttrib } // checkEntityAttributesNames /*----------------------------------------------------------------------------------------------*/ - private boolean checkEntityMentions(Entity aceReaderEntity, Entity testReaderEntity) { + private static boolean checkEntityMentions(Entity aceReaderEntity, Entity testReaderEntity) { boolean entityMentionEqual = true; FSArray aceReaderEntityMentionFSArray = aceReaderEntity.getEntity_mentions(); FSArray testReaderEntityMentionFSArray = testReaderEntity.getEntity_mentions(); @@ -1309,7 +1311,7 @@ private boolean checkEntityMentions(Entity aceReaderEntity, Entity testReaderEnt } // of checkEntityMentions /*----------------------------------------------------------------------------------------------*/ - private void buildSourceFile(JCas jcas) throws SAXException, IOException, ParserConfigurationException { + private static void buildSourceFile(JCas jcas) throws SAXException, IOException, ParserConfigurationException { de.julielab.jcore.types.ace.SourceFile sourceFile = new de.julielab.jcore.types.ace.SourceFile(jcas); sourceFile.setUri("XIN_ENG_20030624.0085.sgm"); @@ -1329,14 +1331,14 @@ private void buildSourceFile(JCas jcas) throws SAXException, IOException, Parser } // buildSourceFile /*----------------------------------------------------------------------------------------------*/ - private void setDocumentText(CAS testReaderCas2, org.w3c.dom.Document sgmDomDocument) { + private static void setDocumentText(CAS testReaderCas2, org.w3c.dom.Document 
sgmDomDocument) { Node documentNode = sgmDomDocument.getDocumentElement(); String documentText = documentNode.getTextContent(); testReaderCas2.setDocumentText(documentText); } // of setDocumentText /*----------------------------------------------------------------------------------------------*/ - private void buildDocument(JCas jcas, SourceFile sourceFile) { + private static void buildDocument(JCas jcas, SourceFile sourceFile) { de.julielab.jcore.types.ace.Document document = new de.julielab.jcore.types.ace.Document(jcas); document.setDocid("XIN_ENG_20030624.0085"); buildEntities(jcas, document); @@ -1401,7 +1403,7 @@ private void buildJulesEventArgs(JCas jcas, Transaction event1) { } // buildJulesEventArgs /*----------------------------------------------------------------------------------------------*/ - private void buildJulesRelations(JCas jcas, Document document) { + private static void buildJulesRelations(JCas jcas, Document document) { System.out.println("CALL buildJulesRelations()"); PART_WHOLE relation1_1 = new PART_WHOLE(jcas); relation1_1.setBegin(543); @@ -1490,7 +1492,7 @@ private void buildJulesRelations(JCas jcas, Document document) { } // of buildJulesRelations /*----------------------------------------------------------------------------------------------*/ - private void buildJulesEntities(JCas jcas, Document document) { + private static void buildJulesEntities(JCas jcas, Document document) { System.out.println("CALL buildJulesEntities()"); entity1_1 = new LOC(jcas); @@ -1562,7 +1564,7 @@ private void buildJulesEntities(JCas jcas, Document document) { } // of buildJulesEntities /*----------------------------------------------------------------------------------------------*/ - private void buildEvents(JCas jcas, Document document) { + private static void buildEvents(JCas jcas, Document document) { de.julielab.jcore.types.ace.Event event = new de.julielab.jcore.types.ace.Event(jcas); event.setGenericity("Specific"); @@ -1583,7 +1585,7 @@ private 
void buildEvents(JCas jcas, Document document) { } // of buildEvents /*----------------------------------------------------------------------------------------------*/ - private void buildEventMentions(JCas jcas, Event event) { + private static void buildEventMentions(JCas jcas, Event event) { de.julielab.jcore.types.ace.EventMention eventMention = new de.julielab.jcore.types.ace.EventMention(jcas); eventMention.setId("XIN_ENG_20030405.0080-EV2-1"); eventMention.setBegin(625); @@ -1612,7 +1614,7 @@ private void buildEventMentions(JCas jcas, Event event) { } // of buildEventMentions /*----------------------------------------------------------------------------------------------*/ - private void buildEventMentionArguments(JCas jcas, EventMention eventMention) { + private static void buildEventMentionArguments(JCas jcas, EventMention eventMention) { de.julielab.jcore.types.ace.EventMentionArgument eventMentionArgument1 = new de.julielab.jcore.types.ace.EventMentionArgument( jcas); eventMentionArgument1.setAce_role("Recipient"); @@ -1637,7 +1639,7 @@ private void buildEventMentionArguments(JCas jcas, EventMention eventMention) { } // of buildEventMentionArguments /*----------------------------------------------------------------------------------------------*/ - private void buildEventArguments(JCas jcas, Event event) { + private static void buildEventArguments(JCas jcas, Event event) { de.julielab.jcore.types.ace.EventArgument eventArgument1 = new de.julielab.jcore.types.ace.EventArgument(jcas); eventArgument1.setAce_role("Recipient"); eventArgument1.setRefid("XIN_ENG_20030405.0080-E1"); @@ -1656,7 +1658,7 @@ private void buildEventArguments(JCas jcas, Event event) { } // of buildEventArguments /*----------------------------------------------------------------------------------------------*/ - private void buildRelations(JCas jcas, Document document) { + private static void buildRelations(JCas jcas, Document document) { de.julielab.jcore.types.ace.Relation relation1 = 
new de.julielab.jcore.types.ace.Relation(jcas); relation1.setModality("Asserted"); relation1.setTense("Unspecified"); @@ -1685,7 +1687,7 @@ private void buildRelations(JCas jcas, Document document) { } // of buildRelations /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentions2(JCas jcas, Relation relation2) { + private static void buildRelationMentions2(JCas jcas, Relation relation2) { de.julielab.jcore.types.ace.RelationMention relationMention2_1 = new de.julielab.jcore.types.ace.RelationMention( jcas); relationMention2_1.setLexical_condition("Preposition"); @@ -1714,7 +1716,7 @@ private void buildRelationMentions2(JCas jcas, Relation relation2) { } // of buildRelationMentions2 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentionArgument2_2(JCas jcas, RelationMention relationMention2_2) { + private static void buildRelationMentionArgument2_2(JCas jcas, RelationMention relationMention2_2) { de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument( jcas); argument1.setAce_role("Arg-2"); @@ -1739,7 +1741,7 @@ private void buildRelationMentionArgument2_2(JCas jcas, RelationMention relation } // of buildRelationMentionArgument2_2 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentionArguments2_1(JCas jcas, RelationMention relationMention1) { + private static void buildRelationMentionArguments2_1(JCas jcas, RelationMention relationMention1) { de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument( jcas); argument1.setAce_role("Arg-2"); @@ -1764,7 +1766,7 @@ private void buildRelationMentionArguments2_1(JCas jcas, RelationMention relatio } // of buildRelationMentionArguments2_1 
/*----------------------------------------------------------------------------------------------*/ - private void buildRelationArguments2(JCas jcas, Relation relation2) { + private static void buildRelationArguments2(JCas jcas, Relation relation2) { de.julielab.jcore.types.ace.RelationArgument argument1 = new de.julielab.jcore.types.ace.RelationArgument(jcas); argument1.setAce_role("Arg-2"); argument1.setRefid("XIN_ENG_20030624.0085-E1"); @@ -1782,7 +1784,7 @@ private void buildRelationArguments2(JCas jcas, Relation relation2) { } // of buildRelationArguments2 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentions1(JCas jcas, Relation relation) { + private static void buildRelationMentions1(JCas jcas, Relation relation) { de.julielab.jcore.types.ace.RelationMention relationMention1 = new de.julielab.jcore.types.ace.RelationMention( jcas); relationMention1.setLexical_condition("Preposition"); @@ -1811,7 +1813,7 @@ private void buildRelationMentions1(JCas jcas, Relation relation) { } // buildRelationMentions /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentionArguments1_2(JCas jcas, RelationMention relationMention2) { + private static void buildRelationMentionArguments1_2(JCas jcas, RelationMention relationMention2) { de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument( jcas); argument1.setAce_role("Arg-1"); @@ -1836,7 +1838,7 @@ private void buildRelationMentionArguments1_2(JCas jcas, RelationMention relatio } // buildRelationMentionArguments2 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentionArguments1_1(JCas jcas, RelationMention relationMention1) { + private static void buildRelationMentionArguments1_1(JCas jcas, RelationMention relationMention1) { 
de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument( jcas); argument1.setAce_role("Arg-1"); @@ -1861,7 +1863,7 @@ private void buildRelationMentionArguments1_1(JCas jcas, RelationMention relatio } // buildRelationMentionArguments1 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationAgruments1(JCas jcas, Relation relation) { + private static void buildRelationAgruments1(JCas jcas, Relation relation) { de.julielab.jcore.types.ace.RelationArgument argument1 = new de.julielab.jcore.types.ace.RelationArgument(jcas); argument1.setAce_role("Arg-1"); argument1.setRefid("XIN_ENG_20030624.0085-E1"); @@ -1880,7 +1882,7 @@ private void buildRelationAgruments1(JCas jcas, Relation relation) { } // buildRelationAgruments /*----------------------------------------------------------------------------------------------*/ - private void buildTimex2(JCas jcas, Document document) { + private static void buildTimex2(JCas jcas, Document document) { de.julielab.jcore.types.ace.Timex2 timex2_1 = new de.julielab.jcore.types.ace.Timex2(jcas); timex2_1.setId("XIN_ENG_20030624.0085-T4"); buildTimex2Mentions1(jcas, timex2_1); @@ -1897,7 +1899,7 @@ private void buildTimex2(JCas jcas, Document document) { } // buildTimex2 /*----------------------------------------------------------------------------------------------*/ - private void buildTimex2Mentions2(JCas jcas, Timex2 timex2_2) { + private static void buildTimex2Mentions2(JCas jcas, Timex2 timex2_2) { de.julielab.jcore.types.ace.Timex2Mention timex2Mention = new de.julielab.jcore.types.ace.Timex2Mention(jcas); timex2Mention.setId("XIN_ENG_20030624.0085-T8-1"); timex2Mention.setBegin(1327); @@ -1911,7 +1913,7 @@ private void buildTimex2Mentions2(JCas jcas, Timex2 timex2_2) { } // buildTimex2Mentions2 /*----------------------------------------------------------------------------------------------*/ - 
private void buildTimex2Mentions1(JCas jcas, Timex2 timex2_1) { + private static void buildTimex2Mentions1(JCas jcas, Timex2 timex2_1) { de.julielab.jcore.types.ace.Timex2Mention timex2Mention = new de.julielab.jcore.types.ace.Timex2Mention(jcas); timex2Mention.setId("XIN_ENG_20030624.0085-T4-1"); timex2Mention.setBegin(327); @@ -1925,7 +1927,7 @@ private void buildTimex2Mentions1(JCas jcas, Timex2 timex2_1) { } // buildTimex2Mentions1 /*----------------------------------------------------------------------------------------------*/ - private void buildValues(JCas jcas, Document document) { + private static void buildValues(JCas jcas, Document document) { de.julielab.jcore.types.ace.Value value1 = new de.julielab.jcore.types.ace.Value(jcas); value1.setAce_type("Numeric"); value1.setAce_subtype("Money"); @@ -1948,7 +1950,7 @@ private void buildValues(JCas jcas, Document document) { } // buildValues /*----------------------------------------------------------------------------------------------*/ - private void buildValueMentuions2(JCas jcas, Value value2) { + private static void buildValueMentuions2(JCas jcas, Value value2) { de.julielab.jcore.types.ace.ValueMention valueMention = new de.julielab.jcore.types.ace.ValueMention(jcas); valueMention.setId("XIN_ENG_20030624.0085-V3-1"); valueMention.setBegin(1079); @@ -1962,7 +1964,7 @@ private void buildValueMentuions2(JCas jcas, Value value2) { } // buildValueMentuions2 /*----------------------------------------------------------------------------------------------*/ - private void buildValueMentions1(JCas jcas, Value value1) { + private static void buildValueMentions1(JCas jcas, Value value1) { de.julielab.jcore.types.ace.ValueMention valueMention = new de.julielab.jcore.types.ace.ValueMention(jcas); valueMention.setId("XIN_ENG_20030624.0085-V2-1"); valueMention.setBegin(826); @@ -1976,7 +1978,7 @@ private void buildValueMentions1(JCas jcas, Value value1) { } // buildValueMentions1 
/*----------------------------------------------------------------------------------------------*/ - private void buildEntities(JCas jcas, de.julielab.jcore.types.ace.Document document) { + private static void buildEntities(JCas jcas, de.julielab.jcore.types.ace.Document document) { Entity entity1 = new Entity(jcas); entity1.setAce_class("USP"); entity1.setAce_type("LOC"); @@ -2003,14 +2005,14 @@ private void buildEntities(JCas jcas, de.julielab.jcore.types.ace.Document docum } // of buildEntities /*----------------------------------------------------------------------------------------------*/ - private void buildEntityAttributes1(JCas jcas, Entity entity1) { + private static void buildEntityAttributes1(JCas jcas, Entity entity1) { FSArray entityAttributeFSArray = new FSArray(jcas, 0); entityAttributeFSArray.addToIndexes(); entity1.setEntity_attributes(entityAttributeFSArray); } // buildEntityAttributes1 /*----------------------------------------------------------------------------------------------*/ - private void buildEntityAttributes2(JCas jcas, Entity entity2) { + private static void buildEntityAttributes2(JCas jcas, Entity entity2) { de.julielab.jcore.types.ace.EntityAttribute entityAttribute = new de.julielab.jcore.types.ace.EntityAttribute( jcas); @@ -2024,7 +2026,7 @@ private void buildEntityAttributes2(JCas jcas, Entity entity2) { } // ofbuildEntityAttributes2 /*----------------------------------------------------------------------------------------------*/ - private void buildEntityAttributeNames(JCas jcas, de.julielab.jcore.types.ace.EntityAttribute entityAttribute) { + private static void buildEntityAttributeNames(JCas jcas, de.julielab.jcore.types.ace.EntityAttribute entityAttribute) { FSArray nameFSArray = new FSArray(jcas, 4); de.julielab.jcore.types.ace.Name entityAttributeName1 = new de.julielab.jcore.types.ace.Name(jcas); @@ -2060,7 +2062,7 @@ private void buildEntityAttributeNames(JCas jcas, de.julielab.jcore.types.ace.En } // 
buildEntityAttributeNames /*----------------------------------------------------------------------------------------------*/ - private void buildEntityMentions1(JCas jcas, Entity entity) { + private static void buildEntityMentions1(JCas jcas, Entity entity) { de.julielab.jcore.types.ace.EntityMention entityMention1 = new de.julielab.jcore.types.ace.EntityMention(jcas); entityMention1.setMention_ldctype("PTV"); entityMention1.setMention_type("PRO"); @@ -2101,7 +2103,7 @@ private void buildEntityMentions1(JCas jcas, Entity entity) { } // of buildEntityMentions /*----------------------------------------------------------------------------------------------*/ - private void buildEntityMentions2(JCas jcas, Entity entity2) { + private static void buildEntityMentions2(JCas jcas, Entity entity2) { de.julielab.jcore.types.ace.EntityMention entityMention1 = new de.julielab.jcore.types.ace.EntityMention(jcas); entityMention1.setLdcatr("FALSE"); entityMention1.setAce_role("LOC"); @@ -2180,6 +2182,7 @@ private void buildEntityMentions2(JCas jcas, Entity entity2) { /** * Test if method getNextCas() has done its job */ + @Test public void testGetNextCas() { System.out.println("CALL testGetNextCas"); checkDocumentText(); @@ -2195,7 +2198,7 @@ public void checkDocumentText() { for (int i = 0; i < casArrayList.size(); i++) { String text = casArrayList.get(i).getDocumentText(); - assertTrue(((text == null) ? "null" : text), (text != null) && (!text.equals(""))); + assertTrue((text != null) && (!text.equals("")), ((text == null) ? 
"null" : text)); } // of for } // of checkDocumentText @@ -2209,7 +2212,7 @@ public void checkDocumentText() { * the type * @return the iterator */ - private Iterator getTypeIterator(CAS cas, int type) { + private static Iterator getTypeIterator(CAS cas, int type) { Iterator iterator = null; try { @@ -2221,7 +2224,7 @@ private Iterator getTypeIterator(CAS cas, int type) { } // getTypeIterator /*----------------------------------------------------------------------------------------------*/ - private void writeCasToXMI(CAS cas, int docs) throws CASException, IOException, SAXException { + private static void writeCasToXMI(CAS cas, int docs) throws CASException, IOException, SAXException { JFSIndexRepository indexes = cas.getJCas().getJFSIndexRepository(); Iterator documentIter = indexes.getAnnotationIndex(Document.type).iterator(); diff --git a/jcore-ace-reader/src/test/resources/de/julielab/jcore/reader/ace/data/out/PLACEHOLDER b/jcore-ace-reader/src/test/resources/de/julielab/jcore/reader/ace/data/out/PLACEHOLDER new file mode 100644 index 000000000..e69de29bb diff --git a/jcore-acronym-ae/component.meta b/jcore-acronym-ae/component.meta index 4ccd014c0..b7c013133 100644 --- a/jcore-acronym-ae/component.meta +++ b/jcore-acronym-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-acronym-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Acronym Tagger" } diff --git a/jcore-acronym-ae/pom.xml b/jcore-acronym-ae/pom.xml index df40261b4..b5e1c0d89 100644 --- a/jcore-acronym-ae/pom.xml +++ b/jcore-acronym-ae/pom.xml @@ -14,7 +14,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -38,8 +38,8 @@ ${jcore-utilities-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java b/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java index 
ad7877e80..a8e588af9 100644 --- a/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java +++ b/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java @@ -158,12 +158,9 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept LOGGER.info(" done"); - } catch (AnnotatorContextException e) { - throw new ResourceInitializationException(); - } catch (AnnotatorConfigurationException e) { - throw new ResourceInitializationException(); - } catch (ResourceProcessException e) { - throw new ResourceInitializationException(); + } catch (AnnotatorContextException | AnnotatorConfigurationException| ResourceProcessException e) { + LOGGER.error("Could not initialize acronym annotator", e); + throw new ResourceInitializationException(e); } } @@ -237,19 +234,21 @@ public void process(JCas aJCas) { annotate(sentenceText, aJCas, sentence.getBegin()); } - // if extra annotation is whished, do so :-) + // if extra annotation is wished, do so :-) if (consistencyAnno) { ConsistencyAnnotator ca = new ConsistencyAnnotator(); ca.consistencyAnnotate(aJCas); } - + if (postprocessing) { Postprocessing.doPostprocessing(aJCas); } - + } catch (StringIndexOutOfBoundsException e) { LOGGER.error("typical Error in AcronymAnnotator.process() : StringIndexOutOfBounds"); + } catch (Throwable t) { + LOGGER.error("Acronym resolution error: ", t); } } @@ -557,10 +556,6 @@ private int findFullformStart(String potFF, String acro) { /** * looks for the 'best' position in the sentence to start looking for a fullform * - * @param sentence - * @param acroStart - * @param maxTokens - * @return */ private int getPotFullformStart(String sentence, int acroStart, int acroLength) { diff --git a/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml b/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml index f31cada2f..df6b3d9cc 
100755 --- a/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml +++ b/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml @@ -6,7 +6,7 @@ JCoRe AcronymAnnotator - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java b/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java index c2c74ba6e..3721ee562 100644 --- a/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java +++ b/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java @@ -18,7 +18,6 @@ import de.julielab.jcore.types.Abbreviation; import de.julielab.jcore.types.AbbreviationLongform; import de.julielab.jcore.types.Sentence; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.CAS; @@ -35,7 +34,7 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,12 +44,14 @@ import java.util.ArrayList; import java.util.Collection; +import static org.junit.jupiter.api.Assertions.*; + /** * The AcronymAnnotatorTest class * * @author jwermter */ -public class AcronymAnnotatorTest extends TestCase { +public class AcronymAnnotatorTest { private static final String DOCUMENT_TEXT = "[TAZ]Die Firma Kohl-kopf (FK-K) hat für die Straßenverkehrsordnung (StVO) " + "in der Bundesrepublik Deutschland(BRD) einen hochintelligenten Manager für die Chefetage " @@ -73,6 +74,7 @@ public class AcronymAnnotatorTest extends TestCase { private static final String ALL_TYPES_NAME = "de.julielab.jcore.types.jcore-all-types"; + 
@Test public void testProcess() throws ResourceInitializationException, InvalidXMLException, IOException, CASException { CAS cas = CasCreationUtils.createCas( diff --git a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml index 9aa0a7e09..05179e6b2 100644 --- a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml +++ b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml @@ -6,7 +6,7 @@ JulesToolsDescriptor - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml index 8e179d4c3..f9a981135 100755 --- a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml +++ b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml @@ -6,7 +6,7 @@ JCoRe AcronymAnnotator - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml index fd197d12f..d918bfcba 100644 --- a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml +++ b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml @@ -2,7 +2,7 @@ StemNetSemanticsTypeSystem -2.5.1-SNAPSHOT +2.6.0 http://www.julielab.de diff --git a/jcore-acronym-writer/component.meta b/jcore-acronym-writer/component.meta index b0999bc38..7cdcd3451 100644 --- 
a/jcore-acronym-writer/component.meta +++ b/jcore-acronym-writer/component.meta @@ -2,7 +2,7 @@ "categories": [ "consumer" ], - "description": "Writes acronyms annotations from the CAS to a text file format.", + "description": "Writes acronym annotations from the CAS to a text file format.", "descriptors": [ { "category": "consumer", @@ -13,8 +13,8 @@ "group": "general", "maven-artifact": { "artifactId": "jcore-acronym-writer", - "groupId": "de.julielab.jcore.consumer.acronyms", - "version": "2.5.0-SNAPSHOT" + "groupId": "de.julielab", + "version": "2.6.0" }, "name": "JCoRe Acronym Writer" } diff --git a/jcore-acronym-writer/pom.xml b/jcore-acronym-writer/pom.xml index e01349996..287448025 100644 --- a/jcore-acronym-writer/pom.xml +++ b/jcore-acronym-writer/pom.xml @@ -5,12 +5,11 @@ 4.0.0 jcore-acronym-writer jar - de.julielab.jcore.consumer.acronyms de.julielab jcore-base - 2.5.0-SNAPSHOT + 2.6.0 @@ -39,11 +38,11 @@ de.julielab jcore-types - ${jcore-version} + ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Acronym Writer @@ -58,5 +57,5 @@ https://github.com/JULIELab/jcore-base/tree/master/jcore-acronym-writer - Writes acronyms annotations from the CAS to a text file format. + Writes acronym annotations from the CAS to a text file format. 
diff --git a/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java b/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java index b1aabca29..ddc1ba416 100644 --- a/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java +++ b/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java @@ -15,6 +15,8 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; @@ -24,7 +26,7 @@ @ResourceMetaData(name = "JCoRe Acronym Writer", description = "Writes acronym annotation to a text file.") public class AcronymWriter extends JCasAnnotator_ImplBase { - +private final static Logger log = LoggerFactory.getLogger(AcronymWriter.class); public static final String PARAM_OUTPUTFILE = "OutputFile"; @ConfigurationParameter(name = PARAM_OUTPUTFILE) @@ -38,12 +40,15 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept try { os = FileUtilities.getOutputStreamToFile(new File(outputFile)); } catch (IOException e) { + log.error("Could not initialize acronym writer", e); throw new ResourceInitializationException(e); } + log.trace("AcronymWriter successfully initialized."); } @Override public void process(JCas jcas) throws AnalysisEngineProcessException { + log.trace("Processing with AcronymWriter"); try { String pubmedId = JCoReTools.getDocId(jcas); FSIterator it = jcas.getAnnotationIndex(Abbreviation.type).iterator(); @@ -70,7 +75,10 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { ++abbrCount; } } catch (CASRuntimeException | IOException e) { + log.error("Exception while writing acronyms", e); throw new AnalysisEngineProcessException(e); + } catch (Throwable t) { + log.error("Exception while writing acronyms", 
t); } } diff --git a/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml b/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml index 5f3073b02..31ce7af9a 100644 --- a/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml +++ b/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml @@ -2,11 +2,11 @@ org.apache.uima.java true - de.julielab.jcore.consumer.acronyms.AcronymWriter + de.julielab.jcore.consumer.coreference.AcronymWriter JCoRe Acronym Writer Writes acronym annotation to a text file. - 2.5.1-SNAPSHOT + 2.6.0 OutputFile diff --git a/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java b/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java index 243f4481a..c63bfd442 100644 --- a/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java +++ b/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java @@ -1,5 +1,5 @@ -package de.julielab.jcore.consumer.acronyms; +package de.julielab.jcore.consumer.coreference; /** * Unit tests for jcore-acronym-writer. diff --git a/jcore-annotation-adder-ae/README.md b/jcore-annotation-adder-ae/README.md index bf3d32b2c..c3e1f9fe0 100644 --- a/jcore-annotation-adder-ae/README.md +++ b/jcore-annotation-adder-ae/README.md @@ -28,11 +28,11 @@ For document class annotations, no offset mode is required, obviously. Whether t **3. External Resource Dependencies** -This component requires an external resource given with the `AnnotationSource` key. This dependency definition is present in the provided default descriptor. +This component requires an external resource given with the `AnnotationSource` key. 
This dependency definition is pre-configured in the provided default descriptor and must be adapted to point to the correct annotation source. The external dependency may currently be a file which is read completely into an in-memory map by the `de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileTextAnnotationProvider` class for textual annotations with offsets or by the `de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileDocumentClassAnnotationProvider` class for document classes. Both provider classes implement the required external resource interface `de.julielab.jcore.ae.annotationadder.annotationsources.AnnotationProvider`. -Other approaches, that are possible easier on the resources - might be implemented if necessary. +Other approaches that are possibly easier on the resources might be implemented if necessary. Currently, the external resource definition looks as follows: diff --git a/jcore-annotation-adder-ae/component.meta b/jcore-annotation-adder-ae/component.meta index 500127938..fe12dbf50 100644 --- a/jcore-annotation-adder-ae/component.meta +++ b/jcore-annotation-adder-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-annotation-adder-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Annotation Adder" } diff --git a/jcore-annotation-adder-ae/pom.xml b/jcore-annotation-adder-ae/pom.xml index 1473a562b..7cdc4c465 100644 --- a/jcore-annotation-adder-ae/pom.xml +++ b/jcore-annotation-adder-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -41,8 +41,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab @@ -53,6 +53,11 @@ commons-codec 1.13 + + com.h2database + h2 + 2.1.214 + JCoRe Annotation Adder diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java 
b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java index b31fc7d05..ceaac7535 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java @@ -6,6 +6,7 @@ import de.julielab.jcore.utility.JCoReTools; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.fit.descriptor.ResourceMetaData; @@ -39,6 +40,7 @@ public enum OffsetMode {CHARACTER, TOKEN} @ConfigurationParameter(name = PARAM_PREVENT_PROCESSED_MARK, mandatory = false, description = "This setting is only in effect if an input format is used that contains document text SHA256 digests while also writing the annotation results into a JeDIS database. If then a CAS document text, to which annotations should be added, does not match the digest given by an annotation, this CAS will not marked as being finished processing by DBCheckpointAE that may follow in the pipeline. The idea is that the mismatched documents require a reprocessing of the original annotation creation algorithm because their text has been changed relative to the annotation on file. 
By not setting the document as being finished processed, it is straightforward to process only those documents again that failed to add one or multiple annotations.") private boolean preventProcessedOnDigestMismatch; + private List annotationAdders = Arrays.asList(new TextAnnotationListAdder(), new DocumentClassAnnotationAdder()); /** @@ -49,6 +51,7 @@ public enum OffsetMode {CHARACTER, TOKEN} public void initialize(final UimaContext aContext) throws ResourceInitializationException { offsetMode = OffsetMode.valueOf(Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_OFFSET_MODE)).orElse(OffsetMode.CHARACTER.name())); defaultUimaType = (String) aContext.getConfigParameterValue(PARAM_DEFAULT_UIMA_TYPE); + preventProcessedOnDigestMismatch = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_PREVENT_PROCESSED_MARK)).orElse(false); try { annotationProvider = (AnnotationProvider) aContext.getResourceObject(KEY_ANNOTATION_SOURCE); } catch (ResourceAccessException e) { @@ -65,23 +68,29 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization * is where the actual work happens. */ @Override - public void process(final JCas aJCas) { - final String docId = JCoReTools.getDocId(aJCas); - if (docId == null) - log.error("The current document does not have a header. 
Cannot add external annotations."); - final AnnotationData annotations = annotationProvider.getAnnotations(docId); - final AnnotationAdderHelper helper = new AnnotationAdderHelper(); - if (annotations != null) { - boolean success = false; - int adderNum = 0; - // We are now iterating through the available annotation adders for the one that handles the obtained annotation data - while (adderNum < annotationAdders.size() && !(success = annotationAdders.get(adderNum).addAnnotations(annotations, helper, adderConfiguration, aJCas, preventProcessedOnDigestMismatch))) { - ++adderNum; + public void process(final JCas aJCas) throws AnalysisEngineProcessException { + try { + final String docId = JCoReTools.getDocId(aJCas); + if (docId == null) + log.error("The current document does not have a header. Cannot add external annotations."); + final AnnotationData annotations = annotationProvider.getAnnotations(docId); + final AnnotationAdderHelper helper = new AnnotationAdderHelper(); + if (annotations != null) { + log.trace("Found annotations for document ID {}.", docId); + boolean success = false; + int adderNum = 0; + // We are now iterating through the available annotation adders for the one that handles the obtained annotation data + while (adderNum < annotationAdders.size() && !(success = annotationAdders.get(adderNum).addAnnotations(annotations, helper, adderConfiguration, aJCas, preventProcessedOnDigestMismatch))) { + ++adderNum; + } + if (!success) + throw new IllegalArgumentException("There was no annotation adder to handle the annotation data of class " + annotations.getClass().getCanonicalName()); + } else { + log.debug("No external annotations were delivered for document ID {}", docId); } - if (!success) - throw new IllegalArgumentException("There was no annotation adder to handle the annotation data of class " + annotations.getClass().getCanonicalName()); - } else { - log.debug("No external annotations were delivered for document ID {}", docId); + } catch (Throwable 
t) { + log.error("Could not add annotations due to exception.", t); + throw new AnalysisEngineProcessException(t); } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java index 831ecb280..219d4d286 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java @@ -1,31 +1,41 @@ package de.julielab.jcore.ae.annotationadder; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; import de.julielab.jcore.ae.annotationadder.annotationrepresentations.TextAnnotation; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; +import org.apache.commons.lang3.StringUtils; import org.apache.uima.cas.CASException; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; /** * Caches information for the current document. 
*/ public class AnnotationAdderHelper { + private final static Logger log = LoggerFactory.getLogger(AnnotationAdderHelper.class); // Required for token-offsets private List tokenList; private Map> tokensBySentences; private Matcher wsFinder = Pattern.compile("\\s").matcher(""); private Matcher nonWsMatcher = Pattern.compile("[^\\s]+").matcher(""); - + /** + * Caches methods for feature + */ + private Map featureSetters; public void setAnnotationOffsetsRelativeToDocument(Annotation annotation, TextAnnotation a, AnnotationAdderConfiguration configuration) throws CASException, AnnotationOffsetException { if (configuration.getOffsetMode() == AnnotationAdderAnnotator.OffsetMode.CHARACTER) { @@ -68,8 +78,10 @@ public void setAnnotationOffsetsRelativeToSentence(Sentence sentence, Annotation List tokenList = tokensBySentences.get(sentence); int startTokenNum = a.getStart(); int endTokenNum = a.getEnd(); - if (startTokenNum < 1 || startTokenNum > tokenList.size()) + if (startTokenNum < 1 || startTokenNum > tokenList.size()) { + log.error("Cannot create entity because of a token offset mismatch. The entity should start at token {} and end at {}. 
But there are only {} tokens available: {}", startTokenNum, endTokenNum, tokenList.size(), tokenList.stream().map(Annotation::getCoveredText).collect(Collectors.joining(" "))); throw new AnnotationOffsetException("The current annotation to add to the CAS starts at token " + startTokenNum + " which does not fit to the range of tokens in the sentence with ID " + sentence.getId() + " which is 1 - " + tokenList.size()); + } if (endTokenNum < 1 || endTokenNum > tokenList.size()) throw new AnnotationOffsetException("The current annotation to add to the CAS ends at token " + endTokenNum + " which does not fit to the range of tokens in the sentence with ID " + sentence.getId() + " which is 1 - " + tokenList.size()); if (endTokenNum < startTokenNum) @@ -134,4 +146,47 @@ public List createTokenList(JCas jCas, AnnotationAdderConfiguration confi } return tokenList; } + + public void setAnnotationPayloadsToFeatures(Annotation annotation, ExternalTextAnnotation a) { + final TypeSystem ts = annotation.getCAS().getTypeSystem(); + Collection keys = a.getPayloadKeys(); + if (!keys.isEmpty()) + featureSetters = new HashMap<>(); + try { + for (String key : keys) { + Object value = a.getPayload(key); + Method setter = featureSetters.get(key); + if (setter == null) { + Class valueClass = convertUimaTypeToJavaType(ts.getType(annotation.getClass().getCanonicalName()).getFeatureByBaseName(key).getRange()); + setter = annotation.getClass().getMethod("set" + StringUtils.capitalize(key), valueClass); + featureSetters.put(key, setter); + } + // We do this because it is possible a string feature could have values there are actually numbers. + // The automatic type detection of some formats will read those as numbers so we might need to + // convert here. 
+ if (setter.getParameterTypes()[0].equals(String.class)) + value = String.valueOf(value); + setter.invoke(annotation, value); + } + } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { + e.printStackTrace(); + } + } + + private Class convertUimaTypeToJavaType(Type type) { + switch (type.getName()) { + case "uima.cas.String": + return String.class; + case "uima.cas.Integer": + return int.class; + case "uima.cas.Double": + return double.class; + case "uima.cas.Boolean": + return boolean.class; + case "uima.cas.Long": + return long.class; + default: + throw new IllegalArgumentException("Unsupported type for arbitrary feature-based input columns: " + type); + } + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java index d249cf906..7626dce18 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java @@ -5,7 +5,6 @@ import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; import de.julielab.jcore.types.ext.DBProcessingMetaData; import de.julielab.jcore.utility.JCoReAnnotationTools; -import de.julielab.jcore.utility.JCoReTools; import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.digest.DigestUtils; import org.apache.uima.cas.CASException; @@ -36,14 +35,18 @@ public boolean addAnnotations(AnnotationData data, AnnotationAdderHelper helper, String jCasDocTextSha = null; boolean shaMismatchWasReported = false; for (ExternalTextAnnotation a : annotationList) { - String uimaType = a.getUimaType() == null ? 
configuration.getDefaultUimaType() : a.getUimaType(); - if (uimaType == null) + String uimaType; + if (a.getUimaType() != null && jCas.getTypeSystem().getType(a.getUimaType()) != null) + uimaType = a.getUimaType(); + else if (configuration.getDefaultUimaType() != null) + uimaType = configuration.getDefaultUimaType(); + else throw new IllegalArgumentException("Missing annotation type: Neither the annotation of document " + a.getDocumentId() + " with offsets " + a.getStart() + "-" + a.getEnd() + " provides a type nor is the default type set."); if (jCas.getTypeSystem().getType(uimaType) == null) throw new IllegalArgumentException("The entity annotation type " + uimaType + " does not exist in the type system."); try { // The sha check is supposed to compare the document text on which the annotation was made with the - // document text the current CAS has. If the differ, the annotations will most likely have + // document text the current CAS has. If they differ, the annotations will most likely have // offset discrepancies which is why they won't be added and a warning will be issued. final String shaFromAnnotation = (String) a.getPayload("sha"); boolean shaMatches = true; @@ -59,15 +62,18 @@ public boolean addAnnotations(AnnotationData data, AnnotationAdderHelper helper, // that the SHA was the same as it was at time of the original entity tagging. 
if (a.getStart() >= 0) { final Annotation annotation = JCoReAnnotationTools.getAnnotationByClassName(jCas, uimaType); + if (annotation instanceof de.julielab.jcore.types.Annotation) + ((de.julielab.jcore.types.Annotation)annotation).setComponentId(AnnotationAdderAnnotator.class.getSimpleName()); helper.setAnnotationOffsetsRelativeToDocument(annotation, a, configuration); + helper.setAnnotationPayloadsToFeatures(annotation, a); + log.trace("Adding annotation of type {} with offsets {}-{} to document with ID {}", uimaType, annotation.getBegin(), annotation.getEnd(), annotationList.getDocId()); annotation.addToIndexes(); } else { log.trace("ExternalAnnotation for document {} has no entity offsets or offsets < 0, not adding anything to the CAS.", a.getDocumentId()); } } else { if (!shaMismatchWasReported) { - final String docId = JCoReTools.getDocId(jCas); - log.warn("The document with ID '{}' has a differing document text hash from a given annotation. The annotation will not be added to the document. Annotation hash: {}, current document text hash: {}", docId, shaFromAnnotation, jCasDocTextSha); + log.warn("The document with ID '{}' has a differing document text hash from a given annotation. The annotation will not be added to the document. 
Annotation hash: {}, current document text hash: {}", annotationList.getDocId(), shaFromAnnotation, jCasDocTextSha); shaMismatchWasReported = true; if (preventProcessedOnDigestMismatch) { try { diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java index cb28d7d9f..d6d791256 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java @@ -2,6 +2,37 @@ import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData; +import java.util.List; + public interface AnnotationFormat { T parse(String data); + + void hasHeader(boolean withHeader); + + String[] getHeader(); + + List> getColumnDataTypes(); + + void setColumnNames(String[] header); + + int getDocumentIdColumnIndex(); + + default Class determineDataType(String value) { + Class dataType = String.class; + try { + Integer.parseInt(value); + dataType = Integer.class; + } catch (NumberFormatException e) { + try { + Double.parseDouble(value); + dataType = Double.class; + } catch (NumberFormatException e2) { + if (value.equalsIgnoreCase("false") || value.equalsIgnoreCase("true")) { + dataType = Boolean.class; + } + } + } + return dataType; + } + } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java index 6376e803d..115d8de94 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java +++ 
b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java @@ -2,6 +2,10 @@ import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalDocumentClassAnnotation; +import java.util.List; + +import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.COL_DOC_ID; + public class DocumentClassAnnotationFormat implements AnnotationFormat { @Override public ExternalDocumentClassAnnotation parse(String data) { @@ -14,7 +18,32 @@ public ExternalDocumentClassAnnotation parse(String data) { String docId = record[1]; String documentClass = record[2].intern(); String componentId = record[3].intern(); - String type = null; return new ExternalDocumentClassAnnotation(docId, documentClass, confidence, componentId); } + + @Override + public void hasHeader(boolean withHeader) { + // does nothing right now + } + + @Override + public String[] getHeader() { + return new String[]{"confidence", COL_DOC_ID, "documentClass", "componentId"}; + } + + @Override + public List> getColumnDataTypes() { + return List.of(Double.class, String.class, String.class, String.class); + } + + @Override + public void setColumnNames(String[] header) { + // does nothing right now + } + + @Override + public int getDocumentIdColumnIndex() { + return 1; + } + } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormat.java new file mode 100644 index 000000000..1e83dc73d --- /dev/null +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormat.java @@ -0,0 +1,104 @@ +package de.julielab.jcore.ae.annotationadder.annotationformat; + +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.COL_UIMA_TYPE; + +public class FeatureBasedTSVFormat implements AnnotationFormat { + private final static Logger log = LoggerFactory.getLogger(FeatureBasedTSVFormat.class); + private String[] header; + private boolean withHeader; + private Integer uimaTypeIndex; + private List> columnDataTypes; + + @Override + public ExternalTextAnnotation parse(String data) { + if (data == null || data.startsWith("#")) + return null; + final String[] record = data.split("\t"); + if (record.length < 3) + throw new IllegalArgumentException("Expected at least 3 column format providing document ID, begin and end offset for the annotation but got " + record.length + " columns: " + data); + if (withHeader && header == null) { + header = record; + return null; + } + if (columnDataTypes == null) + columnDataTypes = new ArrayList<>(header.length); + if (uimaTypeIndex == null) { + uimaTypeIndex = -1; + for (int i = 0; i < header.length; i++) { + if (header[i].equals(COL_UIMA_TYPE)) + uimaTypeIndex = i; + } + if (uimaTypeIndex == 0) + throw new IllegalArgumentException("Found the uima_type column at index 0. However, the first column is reserved for the document ID."); + } + if (columnDataTypes.isEmpty()) + determineColumnDataTypes(record); + String docId = record[0]; + String type = uimaTypeIndex >= 0 ? 
record[uimaTypeIndex] : null; + ExternalTextAnnotation externalTextAnnotation = new ExternalTextAnnotation(docId, 0, 0, type); + externalTextAnnotation.setPayloadFeatureValues(true); + for (int i = 1; i < Math.min(header.length, record.length); i++) { + String featureName = header[i]; + String columnValue = record[i]; + if (!featureName.equals(COL_UIMA_TYPE)) + externalTextAnnotation.addPayload(featureName, convertValueToFieldDataType(columnValue, i)); + } + + return externalTextAnnotation; + } + + private Object convertValueToFieldDataType(String columnValue, int columnIndex) { + final Class columnDataType = columnDataTypes.get(columnIndex); + if (columnDataType.equals(Integer.class)) + return Integer.parseInt(columnValue); + else if (columnDataType.equals(Double.class)) + return Double.parseDouble(columnValue); + else if (columnDataType.equals(Boolean.class)) + return Boolean.parseBoolean(columnValue); + return columnValue.intern(); + } + + private void determineColumnDataTypes(String[] record) { + for (int i = 0; i < record.length; i++) { + String value = record[i]; + Class dataType = determineDataType(value); + columnDataTypes.add(dataType); + } + log.info("Identified the data types of columns {} as {}", header, columnDataTypes); + } + + + @Override + public void hasHeader(boolean withHeader) { + this.withHeader = withHeader; + } + + @Override + public String[] getHeader() { + return header; + } + + @Override + public List> getColumnDataTypes() { + if (columnDataTypes == null) + throw new IllegalStateException("The column data types are not yet set. 
This call must come after the first line of data has been read."); + return columnDataTypes; + } + + @Override + public void setColumnNames(String[] header) { + this.header = header; + } + + @Override + public int getDocumentIdColumnIndex() { + return 0; + } +} diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java index b35e4f26c..a47bc5d55 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java @@ -2,7 +2,17 @@ import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.*; + public class SimpleTSVEntityAnnotationFormat implements AnnotationFormat { + private String[] header; + private boolean withHeader; + private List> columnDataTypes; + @Override public ExternalTextAnnotation parse(String data) { if (data == null || data.startsWith("#")) @@ -10,12 +20,63 @@ public ExternalTextAnnotation parse(String data) { final String[] record = data.split("\t"); if (record.length < 3) throw new IllegalArgumentException("Expected a 3 or 4-column format providing document ID, begin, end and UIMA type (optional if the default type is set to the AnnotationAdderAnnotator) for the annotation but got " + record.length + " columns: " + data); + if (withHeader && header == null) { + header = record; + return null; + } + boolean columnDataTypesWasNull = columnDataTypes == null; + if (columnDataTypesWasNull) { + columnDataTypes 
= Stream.of(String.class, Integer.class, Integer.class).collect(Collectors.toList()); + } String docId = record[0]; int begin = Integer.parseInt(record[1]); int end = Integer.parseInt(record[2]); String type = null; - if (record.length > 3) + if (record.length > 3) { type = record[3]; - return new ExternalTextAnnotation(docId, begin, end, type); + if (columnDataTypesWasNull) + columnDataTypes.add(String.class); + } + if (header == null && record.length <= 3) + header = new String[]{COL_DOC_ID, COL_BEGIN, COL_END, COL_UIMA_TYPE}; + ExternalTextAnnotation externalTextAnnotation = new ExternalTextAnnotation(docId, begin, end, type); + if (record.length > 4) { + if (header != null) { + for (int i = 4; i < record.length; i++) { + externalTextAnnotation.addPayload(header[i], record[i]); + if (columnDataTypesWasNull) { + columnDataTypes.add(determineDataType(record[i])); + } + } + } + } + return externalTextAnnotation; + } + + @Override + public void hasHeader(boolean withHeader) { + this.withHeader = withHeader; + } + + @Override + public String[] getHeader() { + return header; + } + + @Override + public List> getColumnDataTypes() { + if (columnDataTypes == null) + throw new IllegalStateException("The column data types are not yet set. 
This call must come after the first line of data has been read."); + return columnDataTypes; + } + + @Override + public void setColumnNames(String[] header) { + this.header = header; + } + + @Override + public int getDocumentIdColumnIndex() { + return 0; } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java index f46893595..39bdf0016 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java @@ -2,7 +2,12 @@ import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import java.util.List; + +import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.*; + public class SimpleTSVEntityWithDocumentTextShaAnnotationFormat implements AnnotationFormat { + private List> columnDataTypes; @Override public ExternalTextAnnotation parse(String data) { if (data == null || data.startsWith("#")) @@ -17,8 +22,37 @@ public ExternalTextAnnotation parse(String data) { String type = null; if (record.length > 4) type = record[4].intern(); + if (columnDataTypes==null) + columnDataTypes = List.of(String.class, Integer.class, Integer.class, String.class, String.class); final ExternalTextAnnotation externalTextAnnotation = new ExternalTextAnnotation(docId, begin, end, type); externalTextAnnotation.addPayload("sha", sha); return externalTextAnnotation; } + + @Override + public void hasHeader(boolean withHeader) { + // does nothing right now + } + + @Override + public String[] getHeader() { + return new 
String[]{COL_DOC_ID, COL_BEGIN, COL_END, "sha", COL_UIMA_TYPE}; + } + + @Override + public List> getColumnDataTypes() { + if (columnDataTypes == null) + throw new IllegalStateException("The column data types are not yet set. This call must come after the first line of data has been read."); + return columnDataTypes; + } + + @Override + public void setColumnNames(String[] header) { + // does nothing right now + } + + @Override + public int getDocumentIdColumnIndex() { + return 0; + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java index afa5e074d..44da0c57c 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java @@ -1,8 +1,34 @@ package de.julielab.jcore.ae.annotationadder.annotationrepresentations; import java.util.ArrayList; +import java.util.Collection; public class AnnotationList extends ArrayList implements AnnotationData { + @Override + public boolean add(T t) { + setDocId(t.getDocumentId()); + return super.add(t); + } + + @Override + public void add(int index, T element) { + setDocId(element.getDocumentId()); + super.add(index, element); + } + + @Override + public boolean addAll(Collection c) { + if (c != null) + c.stream().findAny().ifPresent(annotation -> setDocId(annotation.getDocumentId())); + return super.addAll(c); + } + + @Override + public boolean addAll(int index, Collection c) { + if (c != null) + c.stream().findAny().ifPresent(annotation -> setDocId(annotation.getDocumentId())); + return super.addAll(index, c); + } private String docId; @@ -11,11 +37,12 @@ public String getDocId() { } public void setDocId(String docId) { + if (docId 
!= null && this.docId != null && !docId.equals(this.docId)) + throw new IllegalArgumentException("This annotation list already contains annotations for document with ID " + this.docId + " but the document ID should now be set to " + docId + "."); this.docId = docId; } @Override - public String getDocumentId() { return docId; } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java index bd1408f47..cd43296f0 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java @@ -1,13 +1,18 @@ package de.julielab.jcore.ae.annotationadder.annotationrepresentations; +import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.Map; +import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.*; + public class ExternalTextAnnotation implements TextAnnotation { private String documentId; private int start; private int end; private String uimaType; + private boolean payloadFeatureValues; private Map payload; public ExternalTextAnnotation(String documentId, int start, int end, String uimaType) { @@ -56,7 +61,30 @@ public void addPayload(String key, Object value) { payload.put(key, value); } + public Map getAllFieldValuesAsMap() { + final Map values = new HashMap<>(); + values.put(COL_BEGIN, start); + values.put(COL_END, end); + values.put(COL_UIMA_TYPE, uimaType); + values.put(COL_DOC_ID, documentId); + if (payload != null) + values.putAll(payload); + return values; + } + public Object getPayload(String key) { return payload != null ? 
payload.get(key) : null; } + + public Collection getPayloadKeys() { + return payload != null ? payload.keySet() : Collections.emptySet(); + } + + public boolean isPayloadFeatureValues() { + return payloadFeatureValues; + } + + public void setPayloadFeatureValues(boolean payloadFeatureValues) { + this.payloadFeatureValues = payloadFeatureValues; + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java index d7a1daad9..5a18be30e 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java @@ -3,7 +3,12 @@ import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData; import org.apache.uima.resource.DataResource; +import java.io.IOException; +import java.net.URI; + public interface AnnotationSource { - void initialize(DataResource dataResource); + void loadAnnotations(URI annotationUri) throws IOException; + + void initialize(DataResource dataResource) throws IOException; T getAnnotations(String id); } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java deleted file mode 100644 index 4e6ba0a88..000000000 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java +++ /dev/null @@ -1,44 +0,0 @@ -package de.julielab.jcore.ae.annotationadder.annotationsources; - -import de.julielab.java.utilities.FileUtilities; -import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat; -import 
de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData; -import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList; -import org.apache.uima.resource.DataResource; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.util.Map; -import java.util.stream.Collectors; - -public class FileAnnotationSource implements AnnotationSource> { - private final static Logger log = LoggerFactory.getLogger(FileAnnotationSource.class); - private AnnotationFormat format; - private Map> entitiesByDocId; - - public FileAnnotationSource(AnnotationFormat format) { - this.format = format; - } - - public void loadAnnotations(File annotationfile) { - try (BufferedReader br = FileUtilities.getReaderFromFile(annotationfile)) { - entitiesByDocId = br.lines().map(format::parse).collect(Collectors.groupingBy(AnnotationData::getDocumentId, Collectors.toCollection(AnnotationList::new))); - } catch (IOException e) { - e.printStackTrace(); - } - } - - @Override - public void initialize(DataResource dataResource) { - log.info("Loading entity annotations from {}", dataResource.getUri()); - loadAnnotations(new File(dataResource.getUri())); - } - - @Override - public AnnotationList getAnnotations(String id) { - return entitiesByDocId.get(id); - } -} diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2AnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2AnnotationSource.java new file mode 100644 index 000000000..326c7746c --- /dev/null +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2AnnotationSource.java @@ -0,0 +1,229 @@ +package de.julielab.jcore.ae.annotationadder.annotationsources; + +import de.julielab.java.utilities.UriUtilities; +import 
de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalDocumentClassAnnotation; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.NotImplementedException; +import org.apache.uima.resource.DataResource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.*; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.*; + +public class H2AnnotationSource implements AnnotationSource> { + private final static Logger log = LoggerFactory.getLogger(H2AnnotationSource.class); + private AnnotationFormat format; + private Path h2DbPath; + private Statement queryStmt; + private Class annotationDataClass; + + public H2AnnotationSource(AnnotationFormat format) { + this.format = format; + if (format.getHeader() == null) + throw new IllegalArgumentException("To use the H2AnnotationSource, the input format must define the column headers. The employed format " + format + " does not specify them itself. 
Thus, the header must be specified in the component descriptor external resource definition."); + try { + Class.forName("org.h2.Driver"); + } catch (ClassNotFoundException e) { + log.error("Could not load the h2 Driver through 'Class.forName(\"org.h2.Driver\")'."); + throw new IllegalStateException(e); + } + } + + @Override + public void loadAnnotations(URI annotationUri) throws IOException { + final Path annotationFilePath = annotationUri.toString().contains("file:"+File.separator) ? Path.of(annotationUri) : Path.of(annotationUri.toString().replace("file:", "")); + h2DbPath = annotationFilePath.isAbsolute() ? Path.of(annotationFilePath + ".h2") : Path.of("."+ File.separator+annotationFilePath+".h2"); + if (!Files.exists(h2DbPath) || Files.getLastModifiedTime(annotationFilePath).toMillis() > Files.getLastModifiedTime(h2DbPath).toMillis()) { + log.info("Source annotation file {} is newer than database file {}. Creating a new database.", annotationFilePath, h2DbPath); + Files.list(h2DbPath.getParent()).filter(p -> p.toString().startsWith(h2DbPath.toString())).forEach(p -> FileUtils.deleteQuietly(p.toFile())); + try (Connection conn = DriverManager.
+ getConnection("jdbc:h2:" + h2DbPath, "sa", "")) { + conn.setAutoCommit(false); + PreparedStatement ps = null; + Map columnIndexes = new HashMap<>(); + try (BufferedReader br = UriUtilities.getReaderFromUri(annotationUri)) { + final Iterator iterator = br.lines().map(format::parse).filter(Objects::nonNull).iterator(); + boolean firstDataItem = true; + int psSize = 0; + int linesRead = 0; + while (iterator.hasNext()) { + ++linesRead; + T annotationData = iterator.next(); + // We need to create the table after the retrieval of the first annotation item because the + // format parsers derive the data types from the data + if (firstDataItem) { + for (int i = 0; i < format.getHeader().length; i++) { + if (format.getHeader()[i].equals("begin")) + format.getHeader()[i] = COL_BEGIN; + else if (format.getHeader()[i].equals("end")) + format.getHeader()[i] = COL_END; + } + IntStream.range(0, format.getHeader().length).forEach(i -> columnIndexes.put(format.getHeader()[i], i)); + annotationDataClass = annotationData.getClass(); + createAnnotationTable(conn, annotationData); + String insertionSql = "INSERT INTO annotations VALUES (" + IntStream.range(0, format.getHeader().length).mapToObj(i -> "?").collect(Collectors.joining(",")) + ")"; + ps = conn.prepareStatement(insertionSql); + firstDataItem = false; + } + if (annotationData instanceof ExternalDocumentClassAnnotation) + throw new NotImplementedException("ExternalDocumentClassAnnotation data is currently not supported by the H2AnnotationSource."); + ExternalTextAnnotation textAnnotation = (ExternalTextAnnotation) annotationData; + final Map fieldValues = textAnnotation.getAllFieldValuesAsMap(); + for (String columnName : format.getHeader()) { + ps.setObject(columnIndexes.get(columnName) + 1, fieldValues.get(columnName)); + } + ps.addBatch(); + ++psSize; + if (psSize % 50 == 0) { + ps.executeBatch(); + } + if (psSize % 10000 == 0 && log.isTraceEnabled()) { + int numRows = getCount(conn, "SELECT count(*) FROM annotations"); +
int numDocIds = getCount(conn, "SELECT count(DISTINCT docId) FROM annotations"); + log.trace("Loaded {} entity annotations for {} document IDs.", numRows, numDocIds); + } + if (linesRead % 10000 == 0 && log.isTraceEnabled()) { + log.trace("Read {} lines from input {}", linesRead, annotationUri); + } + } + if (psSize > 0) + ps.executeBatch(); + } + if (log.isTraceEnabled()) { + int numRows = getCount(conn, "SELECT count(*) FROM annotations"); + int numDocIds = getCount(conn, "SELECT count(DISTINCT docId) FROM annotations"); + log.trace("Loaded {} entity annotations for {} document IDs.", numRows, numDocIds); + } + conn.commit(); + } catch (SQLException e) { + log.error("Could not create H2 database at {}", h2DbPath); + throw new IllegalStateException(e); + } + } + } + + private int getCount(Connection conn, String sql) { + try { + final ResultSet rs = conn.createStatement().executeQuery(sql); + if (rs.next()) + return rs.getInt(1); + } catch (SQLException e) { + log.error("Could not count rows via SQL query {}", sql, e); + throw new IllegalStateException(e); + } + return 0; + } + + private void createAnnotationTable(Connection conn, T annotationData) throws SQLException { + final Statement stmt = conn.createStatement(); + String tableCreationSql = getTableCreationSql(format.getHeader(), format.getColumnDataTypes(), annotationData); + try { + stmt.execute(tableCreationSql); + } catch (SQLException e) { + log.error("Could not create the annotation SQL table with command {}", tableCreationSql, e); + throw new IllegalStateException(e); + } + final String indexCreationSql = "CREATE INDEX annotations_doc_id_idx ON annotations (" + format.getHeader()[format.getDocumentIdColumnIndex()] + ")"; + try { + stmt.execute(indexCreationSql); + } catch (SQLException e) { + log.error("Could not create index on document ID column which should be found at index {} of the header {} with SQL {}.", format.getDocumentIdColumnIndex(), format.getHeader(), indexCreationSql, e); + throw new 
IllegalStateException(e); + } + } + + private String getTableCreationSql(String[] header, List> columnDataTypes, T annotationData) { + StringBuilder sb = new StringBuilder(); + sb.append("CREATE TABLE annotations ("); + for (int i = 0; i < header.length; i++) { + String columnName = header[i]; + Class dataType = columnDataTypes.get(i); + String dbDataType = getDbDataType(dataType); + sb.append(columnName).append(" ").append(dbDataType); + if (i < header.length - 1) + sb.append(","); + } + sb.append(")"); + return sb.toString(); + } + + private String getDbDataType(Class dataType) { + if (dataType.equals(Integer.class)) + return "INT"; + else if (dataType.equals(Double.class)) + return "DOUBLE"; + else if (dataType.equals(Boolean.class)) + return "BOOL"; + return "VARCHAR"; + } + + @Override + public void initialize(DataResource dataResource) throws IOException { + log.info("Loading entity annotations from {}", dataResource.getUri()); + loadAnnotations(dataResource.getUri()); + } + + @Override + public AnnotationList getAnnotations(String id) { + try { + if (queryStmt == null) { + Connection queryConn = DriverManager. 
+ getConnection("jdbc:h2:" + h2DbPath, "sa", ""); + queryStmt = queryConn.createStatement(); + } + } catch (SQLException e) { + log.error("Could not connect to database at {}", h2DbPath, e); + throw new IllegalStateException(e); + } + final String sql = "SELECT * FROM annotations WHERE docId='" + id + "'"; + try { + final ResultSet rs = queryStmt.executeQuery(sql); + final AnnotationList annotationList = new AnnotationList<>(); + while (rs.next()) { + T textAnnotation = null; + if (annotationDataClass == null) + throw new IllegalStateException("The annotation data class should have been recorded when data was read from file but it is null."); + try { + if (annotationDataClass.equals(ExternalTextAnnotation.class)) + textAnnotation = (T) annotationDataClass.getConstructor(String.class, int.class, int.class, String.class).newInstance(rs.getString(COL_DOC_ID), rs.getInt(COL_BEGIN), rs.getInt(COL_END), rs.getString(COL_UIMA_TYPE)); + else + throw new NotImplementedException("The annotation class " + annotationDataClass + " is currently not supported by the H2AnnotationSource."); + } catch (Exception e) { + log.error("Could not create instance of annotation data class {}", annotationDataClass, e); + } + for (String columnName : format.getHeader()) { + final Object value = rs.getObject(columnName); + if (value != null && textAnnotation instanceof ExternalTextAnnotation && !columnName.equals(COL_UIMA_TYPE) && !columnName.equals(COL_DOC_ID)) { + ExternalTextAnnotation a = (ExternalTextAnnotation) textAnnotation; + String payLoadKey = columnName; + if(payLoadKey.equals(COL_BEGIN)) + payLoadKey = "begin"; + else if (payLoadKey.equals(COL_END)) + payLoadKey = "end"; + a.addPayload(payLoadKey, value); + } + } + annotationList.add(textAnnotation); + } + return annotationList; + } catch (SQLException e) { + log.error("Could not retrieve annotation values from the H2 database via SQL query '{}'", sql); + throw new IllegalStateException(e); + } + } +} diff --git 
a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2TextAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2TextAnnotationProvider.java new file mode 100644 index 000000000..a70c3af5f --- /dev/null +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2TextAnnotationProvider.java @@ -0,0 +1,17 @@ +package de.julielab.jcore.ae.annotationadder.annotationsources; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class H2TextAnnotationProvider extends TextAnnotationProvider { + private final static Logger log = LoggerFactory.getLogger(H2TextAnnotationProvider.class); + @Override + void initializeAnnotationSource() { + annotationSource = new H2AnnotationSource<>(format); + } + + @Override + Logger getLogger() { + return log; + } +} diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryAnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryAnnotationSource.java new file mode 100644 index 000000000..f82929792 --- /dev/null +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryAnnotationSource.java @@ -0,0 +1,46 @@ +package de.julielab.jcore.ae.annotationadder.annotationsources; + +import de.julielab.java.utilities.UriUtilities; +import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList; +import org.apache.uima.resource.DataResource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.net.URI; +import java.util.Map; +import 
java.util.Objects; +import java.util.stream.Collectors; + +public class InMemoryAnnotationSource implements AnnotationSource> { + private final static Logger log = LoggerFactory.getLogger(InMemoryAnnotationSource.class); + private AnnotationFormat format; + private Map> entitiesByDocId; + + public InMemoryAnnotationSource(AnnotationFormat format) { + this.format = format; + } + + @Override + public void loadAnnotations(URI annotationUri) throws IOException { + try (BufferedReader br = UriUtilities.getReaderFromUri(annotationUri)) { + entitiesByDocId = br.lines().map(format::parse).filter(Objects::nonNull).collect(Collectors.groupingBy(AnnotationData::getDocumentId, Collectors.toCollection(AnnotationList::new))); + } + if (log.isTraceEnabled()) + log.trace("Loaded {} entity annotations for {} document IDs.", entitiesByDocId.values().stream().flatMap(AnnotationList::stream).count(), entitiesByDocId.size()); + } + + @Override + public void initialize(DataResource dataResource) throws IOException { + log.info("Loading entity annotations from {}", dataResource.getUri()); + loadAnnotations(dataResource.getUri()); + } + + @Override + public AnnotationList getAnnotations(String id) { + return entitiesByDocId.get(id); + } +} diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java index ab95d5759..69e91f14a 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java @@ -6,6 +6,8 @@ import org.apache.uima.resource.DataResource; import org.apache.uima.resource.ResourceInitializationException; 
+import java.io.IOException; + public class InMemoryFileDocumentClassAnnotationProvider implements AnnotationProvider { private AnnotationSource> annotationSource; @@ -17,8 +19,12 @@ public AnnotationList getAnnotations(String id) @Override public void load(DataResource dataResource) throws ResourceInitializationException { // This logic could be made configurable if required so in the future. - annotationSource = new FileAnnotationSource(new DocumentClassAnnotationFormat()); - annotationSource.initialize(dataResource); + annotationSource = new InMemoryAnnotationSource(new DocumentClassAnnotationFormat()); + try { + annotationSource.initialize(dataResource); + } catch (IOException e) { + throw new ResourceInitializationException(e); + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java index 6de11f4d3..950069570 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java @@ -1,42 +1,17 @@ package de.julielab.jcore.ae.annotationadder.annotationsources; -import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat; -import de.julielab.jcore.ae.annotationadder.annotationformat.SimpleTSVEntityAnnotationFormat; -import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList; -import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; -import org.apache.uima.resource.DataResource; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.metadata.ConfigurationParameterSettings; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.lang.reflect.InvocationTargetException; -import java.util.Optional; - -public class InMemoryFileTextAnnotationProvider implements AnnotationProvider { - public static final String PARAM_ANNOTATION_FORMAT = "AnnotationFormatClass"; +public class InMemoryFileTextAnnotationProvider extends TextAnnotationProvider { private final static Logger log = LoggerFactory.getLogger(InMemoryFileTextAnnotationProvider.class); - private AnnotationSource annotationSource; - @Override - public AnnotationList getAnnotations(String id) { - return annotationSource.getAnnotations(id); + void initializeAnnotationSource() { + annotationSource = new InMemoryAnnotationSource<>(format); } @Override - public void load(DataResource dataResource) throws ResourceInitializationException { - final ConfigurationParameterSettings parameterSettings = dataResource.getMetaData().getConfigurationParameterSettings(); - final String formatClassName = (String) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_ANNOTATION_FORMAT)).orElse(SimpleTSVEntityAnnotationFormat.class.getCanonicalName()); - AnnotationFormat format; - try { - format = (AnnotationFormat) Class.forName(formatClassName).getDeclaredConstructor().newInstance(); - } catch (NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException | ClassNotFoundException e) { - log.error("Could not instantiate class {}", formatClassName); - throw new ResourceInitializationException(e); - } - annotationSource = new FileAnnotationSource(format); - annotationSource.initialize(dataResource); + Logger getLogger() { + return log; } - - } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/TextAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/TextAnnotationProvider.java new file mode 100644 index 000000000..007ac0bae --- 
/dev/null +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/TextAnnotationProvider.java @@ -0,0 +1,58 @@ +package de.julielab.jcore.ae.annotationadder.annotationsources; + +import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat; +import de.julielab.jcore.ae.annotationadder.annotationformat.SimpleTSVEntityAnnotationFormat; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import org.apache.uima.resource.DataResource; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.metadata.ConfigurationParameterSettings; +import org.slf4j.Logger; + +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.util.Optional; + +public abstract class TextAnnotationProvider implements AnnotationProvider { + public static final String PARAM_ANNOTATION_FORMAT = "AnnotationFormatClass"; + public static final String PARAM_INPUT_HAS_HEADER = "InputHasHeader"; + public static final String PARAM_COLUMN_NAMES = "ColumnNames"; + public static final String COL_DOC_ID = "docId"; + public static final String COL_BEGIN = "beginOffset"; + public static final String COL_END = "endOffset"; + public static final String COL_UIMA_TYPE = "uimaType"; + protected Logger log; + protected AnnotationSource> annotationSource; + protected AnnotationFormat format; + + @Override + public AnnotationList getAnnotations(String id) { + return annotationSource.getAnnotations(id); + } + + abstract void initializeAnnotationSource(); + + abstract Logger getLogger(); + + @Override + public void load(DataResource dataResource) throws ResourceInitializationException { + final ConfigurationParameterSettings parameterSettings = dataResource.getMetaData().getConfigurationParameterSettings(); + final String formatClassName = (String) 
Optional.ofNullable(parameterSettings.getParameterValue(PARAM_ANNOTATION_FORMAT)).orElse(SimpleTSVEntityAnnotationFormat.class.getCanonicalName()); + final boolean hasHeader = (boolean) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_INPUT_HAS_HEADER)).orElse(false); + final String[] columnNames = (String[])parameterSettings.getParameterValue(PARAM_COLUMN_NAMES); + try { + format = (AnnotationFormat) Class.forName(formatClassName).getDeclaredConstructor().newInstance(); + format.hasHeader(hasHeader); + format.setColumnNames(columnNames); + } catch (NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException | ClassNotFoundException e) { + getLogger().error("Could not instantiate class {}", formatClassName); + throw new ResourceInitializationException(e); + } + initializeAnnotationSource(); + try { + annotationSource.initialize(dataResource); + } catch (IOException e) { + throw new ResourceInitializationException(e); + } + } +} diff --git a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml index fcd2c1d27..585e4eeb7 100644 --- a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml +++ b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml @@ -6,7 +6,7 @@ JCoRe Annotation Adder This component helps to import annotations made on the exact CAS document text by an external process back into the CAS. To this end, the component is prepared to read several data formats. Currently, simple offset-based annotations are supported with configurable UIMA types. The component supports character and token based offsets. 
- 2.5.1-SNAPSHOT + 2.6.0 OffsetMode @@ -79,6 +79,20 @@ false false + + InputHasHeader + Indicates whether the input TSV file has a header line. + Boolean + false + false + + + ColumnNames + For column formats without a header. Required when the columns should be mapped to annotation type features. Then, the headers must correspond to the feature names and are case sensitive. When specified, the number of elements for this parameter must equal the number of columns in the input file. Then, the i-th parameter value will be set as the name of the i-th column. + String + true + false + diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java index 65c0de306..d0be14929 100644 --- a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java @@ -1,9 +1,12 @@ package de.julielab.jcore.ae.annotationadder; +import de.julielab.jcore.ae.annotationadder.annotationsources.H2TextAnnotationProvider; import de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileDocumentClassAnnotationProvider; import de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileTextAnnotationProvider; +import de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider; import de.julielab.jcore.types.*; +import org.apache.commons.io.FileUtils; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.AnalysisEngineFactory; @@ -12,10 +15,13 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ExternalResourceDescription; +import 
org.apache.uima.resource.SharedResourceObject; import org.assertj.core.data.Offset; -import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; import java.io.File; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; @@ -25,10 +31,27 @@ * */ public class AnnotationAdderAnnotatorTest{ + + @AfterEach + public void cleanup() { + Path h2DbPath = Path.of("src", "test", "resources", "geneannotations_character_offsets.tsv.h2.mv.db"); + FileUtils.deleteQuietly(h2DbPath.toFile()); + } + + @Test + public void testCharacterOffsetsInMemory() throws Exception { + testCharacterOffsets(InMemoryFileTextAnnotationProvider.class); + } + @Test - public void testCharacterOffsets() throws Exception { + public void testCharacterOffsetsH2DB() throws Exception { + testCharacterOffsets(H2TextAnnotationProvider.class); + } + + + public void testCharacterOffsets(Class annotationProviderClass) throws Exception { final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); - final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets.tsv")); + final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(annotationProviderClass, new File("src/test/resources/geneannotations_character_offsets.tsv"), TextAnnotationProvider.PARAM_COLUMN_NAMES, new String[]{"docId", "begin", "end", "uimaType", "confidence", "specificType"}); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription); // Test doc1 (two gene annotations) jCas.setDocumentText("BRCA PRKII are 
the genes of this sentence."); @@ -47,7 +70,100 @@ public void testCharacterOffsets() throws Exception { assertThat(genes.get(1).getBegin()).isEqualTo(5); assertThat(genes.get(1).getEnd()).isEqualTo(10); - // Test doc2 (no gene annotations) + // Test doc2 (no gene annotations, there will be a warning on DEBUG level) + jCas.reset(); + jCas.setDocumentText("There are no gene mentions in here"); + Header h2 = new Header(jCas); + h2.setDocId("doc2"); + h2.addToIndexes(); + engine.process(jCas); + assertThat(JCasUtil.exists(jCas, Gene.class)).isFalse(); + + // Test doc3 (one gene annotation) + jCas.reset(); + jCas.setDocumentText("PRKAVI does not exist, I think. But this is just a test so it doesn't matter."); + Header h3 = new Header(jCas); + h3.setDocId("doc3"); + h3.addToIndexes(); + engine.process(jCas); + final Gene gene = JCasUtil.selectSingle(jCas, Gene.class); + assertThat(gene.getBegin()).isEqualTo(0); + assertThat(gene.getEnd()).isEqualTo(6); + } + + @Test + public void testPayload() throws Exception { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets_payload.tsv"), InMemoryFileTextAnnotationProvider.PARAM_INPUT_HAS_HEADER, true); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription); + // Test doc1 (two gene annotations) + jCas.setDocumentText("BRCA PRKII are the genes of this sentence."); + final Header h = new Header(jCas); + h.setDocId("doc1"); + h.addToIndexes(); + + engine.process(jCas); + + final List genes = new ArrayList<>(JCasUtil.select(jCas, 
Gene.class)); + assertThat(genes).hasSize(2); + + assertThat(genes.get(0).getBegin()).isEqualTo(0); + assertThat(genes.get(0).getEnd()).isEqualTo(4); + assertThat(genes.get(0).getSpecificType()).isEqualTo("protein"); + + assertThat(genes.get(1).getBegin()).isEqualTo(5); + assertThat(genes.get(1).getEnd()).isEqualTo(10); + assertThat(genes.get(1).getSpecificType()).isEqualTo("dna"); + + // Test doc2 (no gene annotations, there will be a warning on DEBUG level) + jCas.reset(); + jCas.setDocumentText("There are no gene mentions in here"); + Header h2 = new Header(jCas); + h2.setDocId("doc2"); + h2.addToIndexes(); + engine.process(jCas); + assertThat(JCasUtil.exists(jCas, Gene.class)).isFalse(); + + // Test doc3 (one gene annotation) + jCas.reset(); + jCas.setDocumentText("PRKAVI does not exist, I think. But this is just a test so it doesn't matter."); + Header h3 = new Header(jCas); + h3.setDocId("doc3"); + h3.addToIndexes(); + engine.process(jCas); + final Gene gene = JCasUtil.selectSingle(jCas, Gene.class); + assertThat(gene.getBegin()).isEqualTo(0); + assertThat(gene.getEnd()).isEqualTo(6); + assertThat(gene.getComponentId()).isEqualTo("GoldData"); + } + + @Test + public void testHeaderParameter() throws Exception { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets.tsv"), InMemoryFileTextAnnotationProvider.PARAM_COLUMN_NAMES, new String[]{"docId", "begin", "end", "uimaType", "specificType", "componentId"}); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription); + // 
Test doc1 (two gene annotations) + jCas.setDocumentText("BRCA PRKII are the genes of this sentence."); + final Header h = new Header(jCas); + h.setDocId("doc1"); + h.addToIndexes(); + + engine.process(jCas); + + final List genes = new ArrayList<>(JCasUtil.select(jCas, Gene.class)); + assertThat(genes).hasSize(2); + + assertThat(genes.get(0).getBegin()).isEqualTo(0); + assertThat(genes.get(0).getEnd()).isEqualTo(4); + assertThat(genes.get(0).getSpecificType()).isEqualTo("0.1234"); + assertThat(genes.get(0).getComponentId()).isEqualTo("additionalColumn2"); + + assertThat(genes.get(1).getBegin()).isEqualTo(5); + assertThat(genes.get(1).getEnd()).isEqualTo(10); + assertThat(genes.get(1).getSpecificType()).isEqualTo("0.1234"); + assertThat(genes.get(1).getComponentId()).isEqualTo("additionalColumn2"); + + // Test doc2 (no gene annotations, there will be a warning on DEBUG level) jCas.reset(); jCas.setDocumentText("There are no gene mentions in here"); Header h2 = new Header(jCas); @@ -66,6 +182,8 @@ public void testCharacterOffsets() throws Exception { final Gene gene = JCasUtil.selectSingle(jCas, Gene.class); assertThat(gene.getBegin()).isEqualTo(0); assertThat(gene.getEnd()).isEqualTo(6); + assertThat(gene.getSpecificType()).isEqualTo("0.1234"); + assertThat(gene.getComponentId()).isEqualTo("additionalColumn2"); } @Test diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java new file mode 100644 index 000000000..bcb96ec08 --- /dev/null +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java @@ -0,0 +1,24 @@ +package de.julielab.jcore.ae.annotationadder; + +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import de.julielab.jcore.types.Gene; +import org.apache.uima.UIMAException; +import 
org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class AnnotationAdderHelperTest { + + @Test + void setAnnotationPayloadsToFeatures() throws UIMAException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + Gene gene = new Gene(jCas); + ExternalTextAnnotation extAnnotation = new ExternalTextAnnotation("1", 0, 1, "dummy"); + extAnnotation.addPayload("specificType", "protein"); + AnnotationAdderHelper helper = new AnnotationAdderHelper(); + helper.setAnnotationPayloadsToFeatures(gene, extAnnotation); + assertEquals("protein", gene.getSpecificType()); + } +} \ No newline at end of file diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormatTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormatTest.java new file mode 100644 index 000000000..74e086220 --- /dev/null +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormatTest.java @@ -0,0 +1,27 @@ +package de.julielab.jcore.ae.annotationadder.annotationformat; + +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +class FeatureBasedTSVFormatTest { + + @Test + void parse() { + FeatureBasedTSVFormat format = new FeatureBasedTSVFormat(); + format.hasHeader(true); + // should be ignored + assertNull(format.parse("# comment")); + // should be stored as header but not return something + assertNull(format.parse("docId\tbegin\tend\tcomponentId\tuimaType\tspecificType")); + ExternalTextAnnotation extAnnotation = 
format.parse("123\t0\t5\tGoldAnnotation\tde.julielab.jcore.types.Gene\tprotein"); + assertEquals("123", extAnnotation.getDocumentId()); + assertEquals(0, extAnnotation.getStart()); + assertEquals(0, extAnnotation.getEnd()); + assertEquals("de.julielab.jcore.types.Gene", extAnnotation.getUimaType()); + assertEquals("protein", extAnnotation.getPayload("specificType")); + assertEquals("GoldAnnotation", extAnnotation.getPayload("componentId")); + } +} \ No newline at end of file diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java new file mode 100644 index 000000000..848526c03 --- /dev/null +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java @@ -0,0 +1,27 @@ +package de.julielab.jcore.ae.annotationadder.annotationformat; + +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +class SimpleTSVEntityAnnotationFormatTest { + + @Test + void parse() { + SimpleTSVEntityAnnotationFormat format = new SimpleTSVEntityAnnotationFormat(); + format.hasHeader(true); + // should be ignored + assertNull(format.parse("# comment")); + // should be stored as header but not return something + assertNull(format.parse("docId\tbegin\tend\ttype\tspecificType\tcomponentId")); + ExternalTextAnnotation extAnnotation = format.parse("123\t0\t5\tde.julielab.jcore.types.Gene\tprotein\tGoldAnnotation"); + assertEquals("123", extAnnotation.getDocumentId()); + assertEquals(0, extAnnotation.getStart()); + assertEquals(5, extAnnotation.getEnd()); + 
assertEquals("de.julielab.jcore.types.Gene", extAnnotation.getUimaType()); + assertEquals("protein", extAnnotation.getPayload("specificType")); + assertEquals("GoldAnnotation", extAnnotation.getPayload("componentId")); + } +} \ No newline at end of file diff --git a/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv index a3b4799ab..1f1f04a44 100644 --- a/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv +++ b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv @@ -1,3 +1,3 @@ -doc1 0 4 de.julielab.jcore.types.Gene -doc1 5 10 de.julielab.jcore.types.Gene -doc3 0 6 de.julielab.jcore.types.Gene \ No newline at end of file +doc1 0 4 de.julielab.jcore.types.Gene 0.1234 additionalColumn2 +doc1 5 10 de.julielab.jcore.types.Gene 0.1234 additionalColumn2 +doc3 0 6 de.julielab.jcore.types.Gene 0.1234 additionalColumn2 \ No newline at end of file diff --git a/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv new file mode 100644 index 000000000..7606678d6 --- /dev/null +++ b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv @@ -0,0 +1,4 @@ +docId begin end uimaType specificType componentId +doc1 0 4 de.julielab.jcore.types.Gene protein GoldData +doc1 5 10 de.julielab.jcore.types.Gene dna GoldData +doc3 0 6 de.julielab.jcore.types.Gene gene GoldData \ No newline at end of file diff --git a/jcore-annotation-removal-ae/LICENSE b/jcore-annotation-removal-ae/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-annotation-removal-ae/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-annotation-removal-ae/README.md b/jcore-annotation-removal-ae/README.md new file mode 100644 index 000000000..563b7ad3e --- /dev/null +++ b/jcore-annotation-removal-ae/README.md @@ -0,0 +1,34 @@ +# JCoRe Annotation Removal AE + +**Descriptor Path**: +``` +de.julielab.jcore.ae.annotationremoval.desc.jcore-annotation-removal-ae +``` + +Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor. + + + +**1. 
Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| AnnotationTypes | String | true | true | List of qualified UIMA type names for which all annotations should be removed from each CAS. | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| AnnotationTypes | Qualified UIMA type names | `de.julielab.jcore.types.Token` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| any type listed in `AnnotationTypes` | `+` | | + + +[1] Some Literature? diff --git a/jcore-annotation-removal-ae/component.meta b/jcore-annotation-removal-ae/component.meta new file mode 100644 index 000000000..04e9d8c1e --- /dev/null +++ b/jcore-annotation-removal-ae/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "ae" + ], + "description": "Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor.", + "descriptors": [ + { + "category": "ae", + "location": "de.julielab.jcore.ae.annotationremoval.desc.jcore-annotation-removal-ae" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-annotation-removal-ae", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe Annotation Removal AE" +} diff --git a/jcore-annotation-removal-ae/pom.xml b/jcore-annotation-removal-ae/pom.xml new file mode 100644 index 000000000..e434a54b2 --- /dev/null +++ b/jcore-annotation-removal-ae/pom.xml @@ -0,0 +1,55 @@ + + + + 4.0.0 + jcore-annotation-removal-ae + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0 + + + + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-types-version} + + + de.julielab + jcore-descriptor-creator + + + org.junit.jupiter + junit-jupiter-engine + + + JCoRe 
Annotation Removal AE + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-annotation-removal-ae + Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor. + + + BSD 2-Clause + https://opensource.org/licenses/BSD-2-Clause + + + diff --git a/jcore-annotation-removal-ae/src/main/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotator.java b/jcore-annotation-removal-ae/src/main/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotator.java new file mode 100644 index 000000000..019f06e02 --- /dev/null +++ b/jcore-annotation-removal-ae/src/main/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotator.java @@ -0,0 +1,51 @@ + +package de.julielab.jcore.ae.annotationremoval; + +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.cas.Type; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +@ResourceMetaData(name="JCoRe Annotation Removal AE", description = "Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor.", vendor = "JULIE Lab Jena, Germany") +public class AnnotationRemovalAnnotator extends JCasAnnotator_ImplBase { +public static final String PARAM_ANNOTATION_TYPES = "AnnotationTypes"; + private final static Logger log = LoggerFactory.getLogger(AnnotationRemovalAnnotator.class); + + @ConfigurationParameter(name=PARAM_ANNOTATION_TYPES, description="List of qualified UIMA type names for which all annotations should be removed from each CAS.") + private 
String[] annotationTypesForRemoval; + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. + */ + @Override + public void initialize(final UimaContext aContext) throws ResourceInitializationException { + annotationTypesForRemoval = (String[]) aContext.getConfigParameterValue(PARAM_ANNOTATION_TYPES); + if (annotationTypesForRemoval.length == 0) + throw new ResourceInitializationException(new IllegalArgumentException("The list of annotations for removal, given through parameter '" + PARAM_ANNOTATION_TYPES + "' is empty.")); + } + + /** + * This method is called for each document going through the component. This + * is where the actual work happens. + */ + @Override + public void process(final JCas aJCas) { + List removalList = new ArrayList<>(); + for (String annotationTypeName : annotationTypesForRemoval) { + final Type type = aJCas.getTypeSystem().getType(annotationTypeName); + aJCas.getAnnotationIndex(type).forEach(removalList::add); + removalList.forEach(Annotation::removeFromIndexes); + } + } + +} diff --git a/jcore-annotation-removal-ae/src/main/resources/de/julielab/jcore/ae/annotationremoval/desc/jcore-annotation-removal-ae.xml b/jcore-annotation-removal-ae/src/main/resources/de/julielab/jcore/ae/annotationremoval/desc/jcore-annotation-removal-ae.xml new file mode 100644 index 000000000..3cebc1704 --- /dev/null +++ b/jcore-annotation-removal-ae/src/main/resources/de/julielab/jcore/ae/annotationremoval/desc/jcore-annotation-removal-ae.xml @@ -0,0 +1,34 @@ + + + org.apache.uima.java + true + de.julielab.jcore.ae.annotationremoval.AnnotationRemovalAnnotator + + JCoRe Annotation Removal AE + Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor. + 2.6.0 + JULIE Lab Jena, Germany + + + AnnotationTypes + List of qualified UIMA type names for which all annotations should be removed from each CAS. 
+ String + true + true + + + + + + + + + + + + true + true + false + + + \ No newline at end of file diff --git a/jcore-annotation-removal-ae/src/test/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotatorTest.java b/jcore-annotation-removal-ae/src/test/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotatorTest.java new file mode 100644 index 000000000..a401c969f --- /dev/null +++ b/jcore-annotation-removal-ae/src/test/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotatorTest.java @@ -0,0 +1,60 @@ +package de.julielab.jcore.ae.annotationremoval; + +import de.julielab.jcore.types.Gene; +import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.Token; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + + +/** + * Unit tests for jcore-annotation-removal-ae. 
+ */ +public class AnnotationRemovalAnnotatorTest { + private final static Logger log = LoggerFactory.getLogger(AnnotationRemovalAnnotatorTest.class); + + @Test + public void testAnnotator() throws Exception { + final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.annotationremoval.desc.jcore-annotation-removal-ae", + AnnotationRemovalAnnotator.PARAM_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Token", "de.julielab.jcore.types.Gene"}); + final JCas jCas = engine.newJCas(); + jCas.setDocumentText("There is a gene in this sentence."); + addTokens(jCas); + new Sentence(jCas, 0, jCas.getDocumentText().length()).addToIndexes(); + new Gene(jCas, 11, 15).addToIndexes(); + + // Check that the annotations we just created are actually there. + assertFalse(JCasUtil.select(jCas, Sentence.class).isEmpty()); + assertFalse(JCasUtil.select(jCas, Token.class).isEmpty()); + assertFalse(JCasUtil.select(jCas, Gene.class).isEmpty()); + + engine.process(jCas); + + // And now check that the annotation that should be removed are really gone. 
+ assertFalse(JCasUtil.select(jCas, Sentence.class).isEmpty()); + assertTrue(JCasUtil.select(jCas, Token.class).isEmpty()); + assertTrue(JCasUtil.select(jCas, Gene.class).isEmpty()); + } + + private void addTokens(JCas jCas) { + Matcher alphanumericalTokens = Pattern.compile("[A-Za-z0-9]+").matcher(jCas.getDocumentText()); + while (alphanumericalTokens.find()) { + new Token(jCas, alphanumericalTokens.start(), alphanumericalTokens.end()).addToIndexes(); + } + Matcher punctuation = Pattern.compile("\\p{Punct}").matcher(jCas.getDocumentText()); + while (punctuation.find()) { + new Token(jCas, punctuation.start(), punctuation.end()).addToIndexes(); + } + } +} diff --git a/jcore-banner-ae/component.meta b/jcore-banner-ae/component.meta index 8785baa0c..4ba9b7c9e 100644 --- a/jcore-banner-ae/component.meta +++ b/jcore-banner-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-banner-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Banner" } diff --git a/jcore-banner-ae/pom.xml b/jcore-banner-ae/pom.xml index 6b10e4221..a4f8e8d32 100644 --- a/jcore-banner-ae/pom.xml +++ b/jcore-banner-ae/pom.xml @@ -37,6 +37,10 @@ log4j log4j + + junit + junit + @@ -54,20 +58,24 @@ jcore-mallet-2.0.9 2.1.2 + + de.julielab + jcore-descriptor-creator + de.julielab julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab jcore-base - 2.5.1-SNAPSHOT - .. 
+ 2.6.0 + ../pom.xml @@ -75,4 +83,22 @@ https://opensource.org/licenses/BSD-2-Clause + + + + maven-dependency-plugin + + + prepare-package + + copy-dependencies + + + ${project.build.directory}/lib + + + + + + diff --git a/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java b/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java index 113f6139f..38281692f 100644 --- a/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java +++ b/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java @@ -18,8 +18,8 @@ import java.awt.*; import java.awt.event.*; import java.io.IOException; -import java.util.*; import java.util.List; +import java.util.*; public class BEAT extends JFrame implements ActionListener, CaretListener { diff --git a/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java b/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java index df6548577..009154e3c 100644 --- a/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java +++ b/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java @@ -111,9 +111,9 @@ private SerialPipes createPipe(TagFormat format, Lemmatiser lemmatiser, dragon.n //siddhartha added these; pipes.add(simFindFilename == null ? 
new Noop() : new SimFind(simFindFilename)); -// pipes.add(new ChemicalSuffix("CHEM_SUFF=")); -// pipes.add(new MentionTypeHint("MENTION_TYPE=")); -// pipes.add(new ProteinSymbols("PROT_SYM=")); + pipes.add(new ChemicalSuffix("CHEM_SUFF=")); + pipes.add(new MentionTypeHint("MENTION_TYPE=")); + pipes.add(new ProteinSymbols("PROT_SYM=")); pipes.add(new OffsetConjunctions(new int[][] { { -2 }, { -1 }, { 1 }, { 2 } })); pipes.add(new TokenSequence2FeatureVectorSequence(true, true)); diff --git a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java index 1c28c28b0..e5cb62761 100644 --- a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java +++ b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java @@ -43,31 +43,37 @@ public LemmaPOS(Lemmatiser lemmatiser, Tagger posTagger) { public void setLemmatiser(Lemmatiser lemmatiser) { initResourcesMap(); getResources().lemmatiser = lemmatiser; + System.out.println("Setting lemmatiser to " + Thread.currentThread() + " in object " + this); } public void setPosTagger(Tagger posTagger) { initResourcesMap(); getResources().posTagger = posTagger; + System.out.println("Setting PoS Tagger to " + Thread.currentThread() + " in object " + this); } - private void initResourcesMap() { + synchronized private void initResourcesMap() { if (resourcesByThread == null) resourcesByThread = new HashMap<>(); } private Resources getResources() { - return resourcesByThread.compute(Thread.currentThread(), (t, r) -> { - Resources ret = r; - if (ret == null) - ret = new Resources(); - return ret; - }); + Thread currentThread = Thread.currentThread(); + Resources resources = resourcesByThread.get(currentThread); + if (resources == null) { + resources = new Resources(); + synchronized (resourcesByThread) { +// System.out.println("Creating resources for thread " + currentThread); + resourcesByThread.put(currentThread, resources); + } + } + return resources; } 
@Override public Instance pipe(Instance carrier) { if (expectLemmatiser != (getResources().lemmatiser != null)) - throw new IllegalStateException("Model was trained with lemmatiser; not present in current config"); + throw new IllegalStateException("Model was trained with lemmatiser; not present in current config; resource map: " + resourcesByThread + ", current thread: " + Thread.currentThread()); if (expectPOSTagger != (getResources().posTagger != null)) throw new IllegalStateException("Model was trained with POS tagger; not present in current config"); // TODO Add prefix ability @@ -112,5 +118,14 @@ public Instance pipe(Instance carrier) { private class Resources { public Lemmatiser lemmatiser; public Tagger posTagger; + + @Override + public String toString() { + return "Resources{" + + "lemmatiser=" + lemmatiser + + ", posTagger=" + posTagger + + ", idHashCode= " + System.identityHashCode(this) + + '}'; + } } } diff --git a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java index 1f6077e17..43b29b9fd 100644 --- a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java +++ b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java @@ -9,8 +9,10 @@ import banner.types.Mention; import banner.types.Sentence; import de.julielab.jcore.types.EntityMention; +import de.julielab.jcore.types.pubmed.InternalReference; import de.julielab.jcore.utility.JCoReAnnotationTools; import de.julielab.jcore.utility.JCoReTools; +import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex; import dragon.nlp.tool.Tagger; import dragon.nlp.tool.lemmatiser.EngLemmatiser; import org.apache.commons.configuration.ConfigurationException; @@ -34,6 +36,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Optional; import 
java.util.stream.Collectors; @@ -47,6 +50,7 @@ public class BANNERAnnotator extends JCasAnnotator_ImplBase { public static final String PARAM_CONFIG_FILE = "ConfigFile"; public static final String PARAM_TYPE_MAPPING = "TypeMapping"; + public static final String PARAM_COMPONENT_ID = "ComponentId"; private final static Logger log = LoggerFactory.getLogger(BANNERAnnotator.class); private Tokenizer tokenizer; private DictionaryTagger dictionary; @@ -61,6 +65,8 @@ public class BANNERAnnotator extends JCasAnnotator_ImplBase { private String configFilePath; @ConfigurationParameter(name = PARAM_TYPE_MAPPING, mandatory = false, description = "A list of mappings from entity labels to UIMA types in the form - - de.julielab.jcore.types.Sentence + de.julielab.jcore.types.Sentence - de.julielab.jcore.types.Gene + de.julielab.jcore.types.Gene - + true true false - \ No newline at end of file diff --git a/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml b/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml index 28c2a1499..fb2981574 100644 --- a/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml +++ b/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml @@ -6,7 +6,7 @@ BANNERAE - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-banner-ae/src/main/resources/desc/bannerTS.xml b/jcore-banner-ae/src/main/resources/desc/bannerTS.xml index d25adc102..a78fd02a0 100644 --- a/jcore-banner-ae/src/main/resources/desc/bannerTS.xml +++ b/jcore-banner-ae/src/main/resources/desc/bannerTS.xml @@ -2,7 +2,7 @@ bannerTS basic typesystem started by sid - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java index 12e9e2776..ed1ce4cee 100644 --- a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java +++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java @@ -12,20 +12,21 @@ import 
de.julielab.jcore.types.Gene; import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.pubmed.InternalReference; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class BANNERAnnotatorTest { private final static Logger log = LoggerFactory.getLogger(BANNERAnnotatorTest.class); @@ -34,7 +35,8 @@ public void testProcess() throws Exception { // just tag a single sentence with a test model that actually used that sentence as training data. JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-document-meta-types", - "de.julielab.jcore.types.jcore-semantics-biology-types"); + "de.julielab.jcore.types.jcore-semantics-biology-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); // this is sentence P00055040A0000 from the test BC2GM train data jcas.setDocumentText( "Ten out-patients with pustulosis palmaris et plantaris were examined with direct immunofluorescence (IF) technique for deposition of fibrinogen, fibrin or its degradation products (FR-antigen) in affected and unaffected skin, together with heparin-precipitable fraction (HPF), cryoglobulin and total plasma fibrinogen in the blood."); @@ -59,6 +61,40 @@ public void testProcess() throws Exception { assertEquals("fibrinogen", geneList.get(4).getCoveredText()); } + @Test + public void testInternalReferenceExclusion() throws Exception { + // Internal references in papers, e.g. 
for bibliography, often appear as numbers. If such a number is + // directly appended to a gene name, it is mostly included into the gene name by BANNER. + // Thus, such reference spans are removed afterwards in the annotator and this test is checking that it works. + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-meta-types", + "de.julielab.jcore.types.jcore-semantics-biology-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + // this is sentence P00055040A0000 from the test BC2GM train data EXCEPT the '19' following 'fibrinogen' which + // is our internal reference for this test. + jcas.setDocumentText( + "Ten out-patients with pustulosis palmaris et plantaris were examined with direct immunofluorescence (IF) technique for deposition of fibrinogen19, fibrin or its degradation products (FR-antigen) in affected and unaffected skin, together with heparin-precipitable fraction (HPF), cryoglobulin and total plasma fibrinogen in the blood."); + new Sentence(jcas, 0, jcas.getDocumentText().length()).addToIndexes(); + new InternalReference(jcas, 143, 145).addToIndexes(); + AnalysisEngine bannerAe = AnalysisEngineFactory.createEngine(BANNERAnnotator.class, + BANNERAnnotator.PARAM_CONFIG_FILE, "src/test/resources/banner_ae_test.xml", BANNERAnnotator.PARAM_TYPE_MAPPING, new String[] {"GENE=de.julielab.jcore.types.Gene"}); + bannerAe.process(jcas); + + // expected result from the GENE.eval.small file: + // P00055040A0000|116 125|fibrinogen + // P00055040A0000|127 132|fibrin + // P00055040A0000|158 167|FR-antigen + // P00055040A0000|243 254|cryoglobulin + // P00055040A0000|269 278|fibrinogen + // However, we ignore the offsets because the eval offsets ignore white spaces + List geneList = new ArrayList(JCasUtil.select(jcas, Gene.class)); + assertEquals("fibrinogen", geneList.get(0).getCoveredText()); + assertEquals("fibrin", geneList.get(1).getCoveredText()); + 
assertEquals("FR-antigen", geneList.get(2).getCoveredText()); + assertEquals("cryoglobulin", geneList.get(3).getCoveredText()); + assertEquals("fibrinogen", geneList.get(4).getCoveredText()); + } + @Test public void testMultithreading() throws Exception { List ts = new ArrayList<>(); @@ -77,7 +113,8 @@ private void tagalot() throws UIMAException { // just tag a single sentence with a test model that actually used that sentence as training data. JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-document-meta-types", - "de.julielab.jcore.types.jcore-semantics-biology-types"); + "de.julielab.jcore.types.jcore-semantics-biology-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); // this is sentence P00055040A0000 from the test BC2GM train data jcas.setDocumentText( "Maintenance of skeletal muscle mass is regulated by the balance between anabolic and catabolic processes. Mammalian target of rapamycin (mTOR) is an evolutionarily conserved serine/threonine kinase, and is known to play vital roles in protein synthesis. Recent findings have continued to refine our understanding of the function of mTOR in maintaining skeletal muscle mass. mTOR controls the anabolic and catabolic signaling of skeletal muscle mass, resulting in the modulation of muscle hypertrophy and muscle wastage. This review will highlight the fundamental role of mTOR in skeletal muscle growth by summarizing the phenotype of skeletal-specific mTOR deficiency. In addition, the evidence that mTOR is a dual regulator of anabolism and catabolism in skeletal muscle mass will be discussed. 
A full understanding of mTOR signaling in the maintenance of skeletal muscle mass could help to develop mTOR-targeted therapeutics to prevent muscle wasting."); diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java index 7604ae62f..9d5d4958c 100644 --- a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java +++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java @@ -12,11 +12,11 @@ import banner.eval.BANNER; import org.apache.commons.configuration.XMLConfiguration; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class ModelTrainTest { @Test diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java index 35925ad84..843106130 100644 --- a/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java +++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java @@ -11,14 +11,14 @@ package de.julielab.jcore.banner.dataset; import banner.tokenization.SimpleTokenizer; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.util.Set; import java.util.stream.Collectors; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; public class JCoReEntityDatasetTest { @Test diff --git a/jcore-bc2gm-reader/component.meta b/jcore-bc2gm-reader/component.meta index 748123c36..49b7f8c1a 100644 --- a/jcore-bc2gm-reader/component.meta +++ b/jcore-bc2gm-reader/component.meta @@ -14,7 +14,7 
@@ "maven-artifact": { "artifactId": "jcore-bc2gm-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe BioCreative II Gene Mention Reader" } diff --git a/jcore-bc2gm-reader/pom.xml b/jcore-bc2gm-reader/pom.xml index 1ec0602a9..9cc023682 100644 --- a/jcore-bc2gm-reader/pom.xml +++ b/jcore-bc2gm-reader/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml b/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml index 04e62abd2..7b932fbf9 100644 --- a/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml +++ b/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml @@ -5,7 +5,7 @@ JCoRe BioCreative II Gene Mention reader This component reads gene annotated sentences in the BioCreative II Gene Mention challenge format. Each CAS will contain one annotated sentence. 
- 2.5.1-SNAPSHOT + 2.6.0 SentencesFile diff --git a/jcore-bc2gmformat-writer/component.meta b/jcore-bc2gmformat-writer/component.meta index 384a54b21..ee98994c8 100644 --- a/jcore-bc2gmformat-writer/component.meta +++ b/jcore-bc2gmformat-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-bc2gmformat-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe BioCreative II Gene Mention Format Writer" } diff --git a/jcore-bc2gmformat-writer/pom.xml b/jcore-bc2gmformat-writer/pom.xml index c68e9f170..2f531a820 100644 --- a/jcore-bc2gmformat-writer/pom.xml +++ b/jcore-bc2gmformat-writer/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -28,14 +28,18 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab jcore-utilities ${jcore-utilities-version} + + de.julielab + julielab-java-utilities + de.julielab jcore-descriptor-creator diff --git a/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml b/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml index 2e122f8b6..0504d2b1b 100644 --- a/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml +++ b/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml @@ -6,7 +6,7 @@ JCoRe BioCreative II Gene Mention Format writer This component writes gene annotations in the CAS to the format employed by the BioCreative II Gene Mention challenge. 
- 2.5.1-SNAPSHOT + 2.6.0 OutputDirectory diff --git a/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java b/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java index 41faec637..3752d67b5 100644 --- a/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java +++ b/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java @@ -2,13 +2,13 @@ package de.julielab.jcore.consumer.bc2gmformat; import org.apache.uima.fit.factory.UimaContextFactory; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.lang.reflect.Method; import java.util.TreeMap; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; /** diff --git a/jcore-biolemmatizer-ae/component.meta b/jcore-biolemmatizer-ae/component.meta index 66fd947c5..4e79fc201 100644 --- a/jcore-biolemmatizer-ae/component.meta +++ b/jcore-biolemmatizer-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-biolemmatizer-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe BioLemmatizer" } diff --git a/jcore-biolemmatizer-ae/pom.xml b/jcore-biolemmatizer-ae/pom.xml index bf56276d0..62e6a6234 100644 --- a/jcore-biolemmatizer-ae/pom.xml +++ b/jcore-biolemmatizer-ae/pom.xml @@ -8,7 +8,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -31,7 +31,11 @@ biolemmatizer-core 1.2 - junitjunit + + org.junit.jupiter + junit-jupiter-engine + + JCoRe BioLemmatizer JULIE Lab Jena, Germany @@ -43,8 +47,14 @@ BioNLP Repository - http://svn.code.sf.net/p/bionlp/code/repo + https://svn.code.sf.net/p/bionlp/code/repo + + + maven.aksw.internal + AKSW Internal Release Repository + 
https://maven.aksw.org/repository/internal + diff --git a/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml b/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml index 27b446003..9acb95f57 100644 --- a/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml +++ b/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml @@ -6,14 +6,25 @@ BioLemmatizer - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany - + + + + de.julielab.jcore.types.Token + de.julielab.jcore.types.PennBioIEPOSTag + + + de.julielab.jcore.types.Lemma + + + + true true diff --git a/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java b/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java index ada58be07..241aadaee 100644 --- a/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java +++ b/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java @@ -10,10 +10,10 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; /** * Unit tests for jcore-de.julielab.jcore.ae.biolemmatizer-ae. 
* @author diff --git a/jcore-bionlpformat-consumer/component.meta b/jcore-bionlpformat-consumer/component.meta index e4c0dedc0..e13edd578 100644 --- a/jcore-bionlpformat-consumer/component.meta +++ b/jcore-bionlpformat-consumer/component.meta @@ -22,7 +22,7 @@ "maven-artifact": { "artifactId": "jcore-bionlpformat-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe BioNLP Format Consumer" } diff --git a/jcore-bionlpformat-consumer/pom.xml b/jcore-bionlpformat-consumer/pom.xml index bf58e21a4..676993028 100644 --- a/jcore-bionlpformat-consumer/pom.xml +++ b/jcore-bionlpformat-consumer/pom.xml @@ -6,7 +6,7 @@ jcore-base de.julielab - 2.5.1-SNAPSHOT + 2.6.0 @@ -29,8 +29,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe BioNLP Format Consumer diff --git a/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java b/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java index 287a79921..f09c3a48a 100644 --- a/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java +++ b/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java @@ -22,9 +22,6 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.context.annotation.Configuration; import java.io.*; import java.util.Iterator; diff --git a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml index 45463be92..5b908ba63 
100644 --- a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml +++ b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml @@ -5,7 +5,7 @@ JCoRe BioNLP Event Consumer - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml index 5ebfec59f..0cb5ea0e1 100644 --- a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml +++ b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml @@ -5,7 +5,7 @@ JCoRe BioNLP Format Event Consumer (Medical) - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml index dc654b37b..57287e038 100644 --- a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml +++ b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml @@ -7,7 +7,7 @@ JCoRe BioNLP Format Segment Consumer - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java index 6668a969d..8a6659cfb 100644 --- 
a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java @@ -13,16 +13,16 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.*; import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EventConsumerTest { private static final String EVENT_E8 = "E8 Phosphorylation:T14 Theme:T17 Site:T13"; @@ -44,7 +44,7 @@ public class EventConsumerTest { private AnalysisEngine consumer; private FilenameFilter filter; - @Before + @BeforeEach public void setUp() throws Exception { cas = JCasFactory.createJCas("src/test/resources/types/jcore-all-types"); consumer = AnalysisEngineFactory.createEngine(BioEventConsumer.class, @@ -113,7 +113,7 @@ public boolean accept(File file, String name) { }; } - @After + @AfterEach public void tearDown() { File dataDirectory = new File(TARGET_DIRECTORY); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java index bdd89cc38..12e2baa53 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java @@ -10,8 +10,8 @@ import 
de.julielab.jcore.types.Title; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.io.Writer; @@ -28,7 +28,7 @@ public class DocumentWriterTest { private DocumentWriter documentWriter; private Writer writer; - @Before + @BeforeEach public void setUp() throws Exception{ cas = JCasFactory.createJCas("src/test/resources/types/jcore-all-types"); cas.setDocumentText(DOCUMENT_TITLE + "\n" + DOCUMENT_ABSTRACT); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java index d98cb4722..29cd9e064 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java @@ -9,15 +9,15 @@ import de.julielab.jcore.types.EntityMention; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.io.Writer; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EntityWriterTest { private static final String ENTITY_T13 = "T13 Entity 322 330 tyrosine\n"; @@ -30,7 +30,7 @@ public class EntityWriterTest { private Writer writer; private EntityMention entityT13; - @Before + @BeforeEach public void setUp() throws Exception{ cas = 
JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types"); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java index 317dd0cef..2a04a48f1 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java @@ -9,15 +9,15 @@ import de.julielab.jcore.types.EventTrigger; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.io.Writer; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EventTriggerWriterTest { private static final String TRIGGER_T1 = "T1 Negative_regulation 12 19 inhibit\n"; @@ -28,7 +28,7 @@ public class EventTriggerWriterTest { private Writer writer; private EventTrigger triggerT1; - @Before + @BeforeEach public void setUp() throws Exception{ cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types"); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java index 5d8b717cf..58052dc0b 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java +++ 
b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java @@ -10,8 +10,8 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.FileWriter; import java.io.IOException; @@ -36,7 +36,7 @@ public class EventWriterTest { private Gene proteinT17; private EntityMention entityT13; - @Before + @BeforeEach public void setUp() throws Exception{ cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types"); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java index 3871f07ff..2cdc5be50 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java @@ -9,16 +9,16 @@ import de.julielab.jcore.types.Gene; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.FileWriter; import java.io.IOException; import java.io.Writer; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class ProteinWriterTest { @@ -35,7 +35,7 @@ public class ProteinWriterTest { private static final String DOCUMENT_TEXT = "Interferons inhibit activation of STAT6 by interleukin 4 in human 
monocytes by inducing SOCS-1 gene expression.\n" + "Interferons (IFNs) inhibit induction by IL-4 of multiple genes in human monocytes. However, the mechanism by which IFNs mediate this inhibition has not been defined. IL-4 activates gene expression by inducing tyrosine phosphorylation, homodimerization, and nuclear translocation of the latent transcription factor, STAT6 (signal transducer and activator of transcription-6). STAT6-responsive elements are characteristically present in the promoters of IL-4-inducible genes. Because STAT6 activation is essential for IL-4-induced gene expression, we examined the ability of type I and type II IFNs to regulate activation of STAT6 by IL-4 in primary human monocytes. Pretreatment of monocytes with IFN-beta or IFN-gamma, but not IL-1, IL-2, macrophage colony-stimulating factor, granulocyte/macrophage colony-stimulating factor, IL-6, or transforming growth factor beta suppressed activation of STAT6 by IL-4. This inhibition was associated with decreased tyrosine phosphorylation and nuclear translocation of STAT6 and was not evident unless the cells were preincubated with IFN for at least 1 hr before IL-4 stimulation. Furthermore, inhibition by IFN could be blocked by cotreatment with actinomycin D and correlated temporally with induction of the JAK/STAT inhibitory gene, SOCS-1. Forced expression of SOCS-1 in a macrophage cell line, RAW264, markedly suppressed trans-activation of an IL-4-inducible reporter as well as IL-6- and IFN-gamma-induced reporter gene activity. 
These findings demonstrate that IFNs inhibit IL-4-induced activation of STAT6 and STAT6-dependent gene expression, at least in part, by inducing expression of SOCS-1."; - @Before + @BeforeEach public void setUp() throws Exception{ cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types"); diff --git a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml index 670239d8d..76d19c9c8 100644 --- a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml +++ b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml @@ -2,7 +2,7 @@ JCoRe All Types This is just a convenience file, assembling all JCoRe types - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml index 0f6fca3ac..a525162fe 100644 --- a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml +++ b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Biology Types The type system contains types of the biomedical domain. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-reader/component.meta b/jcore-bionlpformat-reader/component.meta index 6f10e9e95..60e877ec5 100644 --- a/jcore-bionlpformat-reader/component.meta +++ b/jcore-bionlpformat-reader/component.meta @@ -22,7 +22,7 @@ "maven-artifact": { "artifactId": "jcore-bionlpformat-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe BioNLP Format Reader" } diff --git a/jcore-bionlpformat-reader/pom.xml b/jcore-bionlpformat-reader/pom.xml index 862c09d97..1c966f9e8 100644 --- a/jcore-bionlpformat-reader/pom.xml +++ b/jcore-bionlpformat-reader/pom.xml @@ -6,7 +6,7 @@ jcore-base de.julielab - 2.5.1-SNAPSHOT + 2.6.0 @@ -41,8 +41,8 @@ ${jcore-utilities-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java b/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java index 70efe8571..5a265d736 100644 --- a/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java +++ b/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java @@ -17,8 +17,8 @@ import java.io.BufferedReader; import java.io.IOException; - import java.util.*; import java.util.List; + import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; diff --git a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml index ccd6c46f6..66a5945ca 100644 --- 
a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml +++ b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml @@ -5,7 +5,7 @@ JCoRe BioNLP Event Reader - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml index 74cdb9e62..602240c4e 100644 --- a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml +++ b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml @@ -5,7 +5,7 @@ BioNLP Format Reader Medical - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml index aea0bc469..7ed45b45a 100644 --- a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml +++ b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml @@ -5,7 +5,7 @@ BioNLP Format Reader Segment - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java index ce2926f00..1b2a68ac9 100644 --- 
a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java @@ -16,8 +16,8 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import org.xml.sax.SAXException; import java.io.FileOutputStream; @@ -25,7 +25,7 @@ import java.io.OutputStream; // Ignore because the data path does generally not exist; a fix should only contain some test data, not the whole dataset -@Ignore +@Disabled public class CoreferenceReadingTest { @Test public void testCoreferenceReading() throws UIMAException, IOException, diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java index 9c7aea226..68c64fc94 100644 --- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java @@ -17,23 +17,23 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import java.util.Set; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; // This test's EventReaderTest.xml descriptor points to local directories 
of Ekaterina Buyko and as such, the test doesn't work this way. However it might, if the data is made available as proper test data. -@Ignore +@Disabled public class EventReaderTest { private static final String DESCRIPTOR_FILE = "src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml"; private CollectionReader collectionReader; - @Before + @BeforeEach public void setUp() throws Exception { CollectionReaderDescription readerDescription = (CollectionReaderDescription) UIMAFramework .getXMLParser().parseCollectionReaderDescription( diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java index 25685ec01..c95a9d148 100644 --- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java @@ -18,15 +18,15 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.BufferedReader; import static org.easymock.EasyMock.expect; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class AbstractFileMapperTest { @@ -34,7 +34,7 @@ public class AbstractFileMapperTest { private JCas cas; private TextFileMapper abstractFileMapper; - @Before + @BeforeEach public void setUp() throws Exception { CollectionReaderDescription readerDescription = UIMAFramework.getXMLParser() 
.parseCollectionReaderDescription(new XMLInputSource(DESCRIPTOR_FILE)); diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java index 46bf09ee6..85b582ed0 100644 --- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java @@ -28,8 +28,8 @@ import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.BufferedReader; import java.util.HashMap; @@ -37,8 +37,8 @@ import static org.easymock.EasyMock.expect; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -68,7 +68,7 @@ public class AnnotationFileMapperTest { private Gene t3; private Map mappedProteins; - @Before + @BeforeEach public void setUp() throws Exception { CollectionReaderDescription readerDescription = (CollectionReaderDescription) UIMAFramework.getXMLParser().parseCollectionReaderDescription(new XMLInputSource(DESCRIPTOR_FILE)); CollectionReader collectionReader = UIMAFramework.produceCollectionReader(readerDescription); diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java index 2abfcc03d..24a3d7805 100644 --- 
a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java @@ -11,7 +11,7 @@ package de.julielab.jcore.reader.bionlp09event.utils; import de.julielab.jcore.reader.bionlpformat.utils.OntoFormatReader; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; diff --git a/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml b/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml index 38ed5aed3..33b41c2c8 100644 --- a/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml +++ b/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml @@ -5,7 +5,7 @@ EventReader - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-biosem-ae/component.meta b/jcore-biosem-ae/component.meta index dd5fcf39d..08c8a1bba 100644 --- a/jcore-biosem-ae/component.meta +++ b/jcore-biosem-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-biosem-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe BioSem Event Annotator" } diff --git a/jcore-biosem-ae/pom.xml b/jcore-biosem-ae/pom.xml index ece3b845a..7a667db9b 100644 --- a/jcore-biosem-ae/pom.xml +++ b/jcore-biosem-ae/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-biosem-ae JCoRe BioSem Event Annotator @@ -32,13 +32,19 @@ de.julielab jcore-bionlpformat-reader - 2.5.1-SNAPSHOT + 2.6.0 test de.julielab biosem-event-extractor 1.1.7 + + + commons-cli + commons-cli + + de.julielab @@ -48,12 +54,12 @@ de.julielab jcore-bionlpformat-consumer - 2.5.1-SNAPSHOT + 2.6.0 test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff 
--git a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java index 9a9f16a35..8a42dd9dc 100644 --- a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java +++ b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java @@ -1,11 +1,11 @@ -/** - * +/** + * * Copyright (c) 2017, JULIE Lab. - * All rights reserved. This program and the accompanying materials + * All rights reserved. This program and the accompanying materials * are made available under the terms of the BSD-2-Clause License * - * Author: - * + * Author: + * * Description: **/ package de.julielab.jcore.ae.biosem; @@ -17,7 +17,10 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException; import org.apache.uima.cas.FSIterator; +import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; @@ -31,22 +34,27 @@ import utils.BioSemException; import utils.DBUtils; -import java.util.*; import java.util.List; +import java.util.*; import java.util.Map.Entry; +@ResourceMetaData(name="JCoRe BioSem Event Annotator", description = "Adds annotations for event triggers and events according to the BioNLP Shared Task event definition.") +@TypeCapability(inputs = {"de.julielab.jcore.types.Gene"}, outputs = {"de.julielab.jcore.types.EventTrigger", "de.julielab.jcore.types.EventMention"}) public class BioSemEventAnnotator extends JCasAnnotator_ImplBase { private final static Logger log = LoggerFactory.getLogger(BioSemEventAnnotator.class); + public static final String 
PARAM_COMPONENT_ID = "ComponentId"; public final static String RESOURCE_TRAINED_DB = "TrainedDB"; private DataLoader loader; private DBUtils trainedDb; - @ExternalResource(key = RESOURCE_TRAINED_DB, mandatory = true) + @ExternalResource(key = RESOURCE_TRAINED_DB) private DBUtilsProvider dbUtilsProvider; + @ConfigurationParameter(name=PARAM_COMPONENT_ID, mandatory = false, defaultValue = "BioSemEventAnnotator", description = "Optional. If set, the 'componentId' feature of the created annotations will be set to the value of this parameter.") + private String componentId; private EventExtraction xtr; @@ -64,6 +72,7 @@ public class BioSemEventAnnotator extends JCasAnnotator_ImplBase { public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); try { + componentId = (String) aContext.getConfigParameterValue(PARAM_COMPONENT_ID); dbUtilsProvider = (DBUtilsProvider) aContext.getResourceObject(RESOURCE_TRAINED_DB); trainedDb = dbUtilsProvider.getTrainedDatabase(); } catch (ResourceAccessException e) { @@ -198,6 +207,7 @@ private EventMention addEventToIndexes(PData event, Map proteinMap PData eventArg1 = event.getPdata1(); PData eventArg2 = event.getPdata2(); uimaEvent = new EventMention(aJCas, begin, end); + uimaEvent.setComponentId(componentId); uimaEvent.setId(event.PID); uimaEvent.setSpecificType(uimaTrigger.getSpecificType()); uimaEvent.setTrigger(uimaTrigger); @@ -227,7 +237,7 @@ private EventMention addEventToIndexes(PData event, Map proteinMap } /** - * + * * @param uimaEvent * The UIMA event annotation to add a new argument to * @param bioSemArg @@ -279,6 +289,7 @@ private void addUimaEventArgument(EventMention uimaEvent, Object bioSemArg, int // if we don't want to use the writer). 
protein.setSpecificType("protein"); uimaArg = new ArgumentMention(aJCas, protein.getBegin(), protein.getEnd()); + uimaArg.setComponentId(componentId); uimaArg.setRef(protein); uimaArg.setRole(determineArgumentRole(uimaEvent, uimaArg, argPos)); } else if (bioSemArg instanceof PData) { @@ -293,9 +304,10 @@ private void addUimaEventArgument(EventMention uimaEvent, Object bioSemArg, int } if (null == uimaEventArg) { throw new IllegalStateException("Creating UIMA EventMention annotation for BioSem event \"" - + eventArg.toString() + "\" failed, the UIMA EventMention is null."); + + eventArg + "\" failed, the UIMA EventMention is null."); } uimaArg = new ArgumentMention(aJCas, uimaEventArg.getBegin(), uimaEventArg.getEnd()); + uimaArg.setComponentId(componentId); uimaArg.setRef(uimaEventArg); uimaArg.setRole(determineArgumentRole(uimaEvent, uimaArg, argPos)); } else { @@ -330,7 +342,7 @@ private void addUimaEventArgument(EventMention uimaEvent, Object bioSemArg, int } /** - * + * * @param uimaEvent * @param uimaArg * @param argPos @@ -359,6 +371,7 @@ private EventTrigger addTriggerToIndexes(Word trg, JCas aJCas) { int end = trg.locs[1]; String type = trg.type; EventTrigger uimaTrigger = new EventTrigger(aJCas, begin, end); + uimaTrigger.setComponentId(componentId); uimaTrigger.setId(id); uimaTrigger.setSpecificType(type); return uimaTrigger; @@ -370,7 +383,7 @@ private EventTrigger addTriggerToIndexes(Word trg, JCas aJCas) { * ID<tab>Entity-Type[Protein]<tab>start<tab>end<tab>Mention name *
* Example: T3 Protein 166 174 TGF-beta - * + * * @return */ private List getProteinLines(Map proteins, String docId) throws AnnotatorProcessException { @@ -392,7 +405,7 @@ private List getProteinLines(Map proteins, String docId) t /** * Assigns an ID of the form Ti to each gene in the CAS, i * being an enumeration number beginning at 0. - * + * * @param aJCas * @return */ @@ -408,9 +421,7 @@ private Map enumerateProteins(JCas aJCas) { Gene gene = (Gene) geneIt.next(); if (gene.getBegin() < lastEnd) continue; - String id = gene.getId(); - // if (StringUtils.isBlank(id)) - id = "T" + i++; + String id = "T" + i++; gene.setId(id); proteins.put(id, gene); lastEnd = gene.getEnd(); diff --git a/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java b/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java index ae49970cd..da7a683de 100644 --- a/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java +++ b/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java @@ -22,15 +22,15 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ExternalResourceDescription; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; import java.util.Collections; import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class BioSemEventAnnotatorTest { @Test @@ -53,7 +53,7 @@ public void testProcess() throws Exception { if (testOutputFile.exists()) testOutputFile.delete(); - assertTrue("Test document was not found by the BioNLP ST reader.", bioNlpSTReader.hasNext()); + assertTrue(bioNlpSTReader.hasNext(), "Test document was not found by the BioNLP ST reader."); 
bioNlpSTReader.getNext(jCas.getCas()); engine.process(jCas); bioNlpSTWriter.process(jCas); diff --git a/jcore-conll-consumer/component.meta b/jcore-conll-consumer/component.meta index e754ff444..2e94ca29d 100644 --- a/jcore-conll-consumer/component.meta +++ b/jcore-conll-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-conll-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe CONLL Consumer" } diff --git a/jcore-conll-consumer/pom.xml b/jcore-conll-consumer/pom.xml index fef60e5bf..cff35237d 100644 --- a/jcore-conll-consumer/pom.xml +++ b/jcore-conll-consumer/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-conll-consumer @@ -24,8 +24,8 @@ logback-classic
- junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml b/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml index 30f0366eb..288790254 100644 --- a/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml +++ b/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml @@ -6,7 +6,7 @@ JCoRe Conll Consumer - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java b/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java index cb66ca825..ad46ef663 100644 --- a/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java +++ b/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java @@ -21,7 +21,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.BufferedReader; import java.io.File; @@ -30,7 +30,7 @@ import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class ConllConsumerTest { diff --git a/jcore-coordination-baseline-ae/component.meta b/jcore-coordination-baseline-ae/component.meta index c79a816e4..622f6ef43 100644 --- a/jcore-coordination-baseline-ae/component.meta +++ b/jcore-coordination-baseline-ae/component.meta @@ -26,7 +26,7 @@ "maven-artifact": { "artifactId": "jcore-coordination-baseline-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Coordination Tagger Baseline" } diff --git 
a/jcore-coordination-baseline-ae/pom.xml b/jcore-coordination-baseline-ae/pom.xml index eaff316fa..64cc11f48 100644 --- a/jcore-coordination-baseline-ae/pom.xml +++ b/jcore-coordination-baseline-ae/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -37,8 +37,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml index 1e5a6c860..4da7a5bbe 100644 --- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml +++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml @@ -6,7 +6,7 @@ JCoRe ConjunctAnnotator -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml index b5db7b69b..706c3df7e 100644 --- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml +++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml @@ -6,7 +6,7 @@ JCoRe CoordinationAnnotator -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml 
b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml index 50c01690b..41bb97345 100644 --- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml +++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml @@ -6,7 +6,7 @@ JCoRe EEEAnnotator -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml index 8e73905d3..bb4bfb5c1 100644 --- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml +++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml @@ -6,7 +6,7 @@ JCoRe EllipsisAnnotator -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java index fdca4b78e..6eb0c2ee6 100644 --- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java +++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java @@ -7,7 +7,6 @@ package de.julielab.jcore.ae.coordbaseline.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; @@ -17,13 +16,16 @@ 
import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import static org.junit.jupiter.api.Assertions.assertTrue; -public class ConjunctAnnotatorTest extends TestCase + +public class ConjunctAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(ConjunctAnnotatorTest.class); private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; @@ -32,19 +34,8 @@ public class ConjunctAnnotatorTest extends TestCase private static final String coordinationLabels2 = "antecedent,conjunct,conjunction,conjunct,antecedent,antecedent"; private static final String TEST_DESC = "src/test/resources/desc/ConjunctAnnotatorTest.xml"; - - - - - - -/*--------------------------------------------------------------------------------*/ - protected void setUp() throws Exception - { - super.setUp(); - } // of setUp -/*--------------------------------------------------------------------------------*/ - public void initCas(JCas jcas) + + public void initCas(JCas jcas) { jcas.reset(); @@ -558,6 +549,7 @@ public void initCas(JCas jcas) } // of initCas /*--------------------------------------------------------------------------------*/ + @Test public void testProcess() { XMLInputSource descriptor = null; @@ -595,7 +587,7 @@ public void testProcess() try { ae.process(jcas, null); - assertTrue("Invalid JCas!", checkJCas(jcas)); + assertTrue(checkJCas(jcas), "Invalid JCas!"); } // of try catch (Exception e) { diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java index dd5416a7a..32662f928 100644 --- 
a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java +++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java @@ -22,7 +22,6 @@ package de.julielab.jcore.ae.coordbaseline.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; @@ -32,13 +31,16 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import static org.junit.jupiter.api.Assertions.assertTrue; -public class CoordinationAnnotatorTest extends TestCase + +public class CoordinationAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(CoordinationAnnotatorTest.class); @@ -55,11 +57,7 @@ public class CoordinationAnnotatorTest extends TestCase private static final String TEST_DESC = "src/test/resources/desc/CoordinationAnnotatorTest.xml"; - protected void setUp() throws Exception - { - super.setUp(); - } // of setUp - + public void initCas(JCas jcas) { jcas.reset(); @@ -562,7 +560,7 @@ public void initCas(JCas jcas) } // of initCas - + @Test public void testProcess() { XMLInputSource descriptor = null; @@ -598,7 +596,7 @@ public void testProcess() try { ae.process(jcas, null); - assertTrue("Invalid JCas!", checkJCas(jcas)); + assertTrue(checkJCas(jcas), "Invalid JCas!"); } // of try catch (Exception e) diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java index a010c3178..4203cdc16 100644 --- 
a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java +++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java @@ -7,7 +7,6 @@ package de.julielab.jcore.ae.coordbaseline.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; @@ -17,13 +16,16 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import static org.junit.jupiter.api.Assertions.assertTrue; -public class EEEAnnotatorTest extends TestCase + +public class EEEAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(EEEAnnotatorTest.class); private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; @@ -33,12 +35,7 @@ public class EEEAnnotatorTest extends TestCase private static final String EEE2 = "simple upstream and downstream sequence elements"; private static final String TEST_DESC = "src/test/resources/desc/EEEAnnotatorTest.xml"; -/*--------------------------------------------------------------------------------*/ - protected void setUp() throws Exception - { - super.setUp(); - } // of setUp -/*--------------------------------------------------------------------------------*/ + public void initCas(JCas jcas) { jcas.reset(); @@ -538,6 +535,8 @@ public void initCas(JCas jcas) entity3.addToIndexes(); } // of initCas /*--------------------------------------------------------------------------------*/ + + @Test public void testProcess() { XMLInputSource descriptor = null; @@ -575,7 +574,7 @@ public void testProcess() try { ae.process(jcas, null); - assertTrue("Invalid JCas!", 
checkJCas(jcas)); + assertTrue(checkJCas(jcas), "Invalid JCas!"); } // of try diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java index 749371a51..94d697619 100644 --- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java +++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java @@ -7,7 +7,6 @@ package de.julielab.jcore.ae.coordbaseline.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; @@ -17,12 +16,15 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; -public class EllipsisAnnotatorTest extends TestCase +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class EllipsisAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(EllipsisAnnotatorTest.class); private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; @@ -30,12 +32,7 @@ public class EllipsisAnnotatorTest extends TestCase private static final String ellipsis1 = "X cells, Y cells, and Z cells"; private static final String ellipsis2 = "simple upstream sequence elements and simple downstream sequence elements"; private static final String TEST_DESC = "src/test/resources/desc/EllipsisAnnotatorTest.xml"; -/*--------------------------------------------------------------------------------*/ - protected void setUp() throws Exception - { - super.setUp(); - } // of setUp 
-/*--------------------------------------------------------------------------------*/ + public void initCas(JCas jcas) { jcas.reset(); @@ -697,6 +694,8 @@ public void initCas(JCas jcas) c26.addToIndexes(); } // of initCas /*---------------------------------------------------------------------------*/ + + @Test public void testProcess() { XMLInputSource descriptor = null; @@ -734,7 +733,7 @@ public void testProcess() try { ae.process(jcas, null); - assertTrue("Invalid JCas!", checkJCas(jcas)); + assertTrue(checkJCas(jcas), "Invalid JCas!"); } // of try catch (Exception e) { diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml index 50c97ebbc..29f9e5d35 100644 --- a/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml +++ b/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml @@ -6,7 +6,7 @@ ConjunctAnnotator -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml index ca9a48170..c3245f36b 100644 --- a/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml +++ b/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml @@ -6,7 +6,7 @@ CoordinationAnnotator -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml index 3683f5210..4fa87c0a9 100644 --- a/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml +++ b/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml @@ -6,7 +6,7 @@ EEEAnnotator -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml 
b/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml index beea12e3e..85ce7558b 100644 --- a/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml +++ b/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml @@ -6,7 +6,7 @@ EllipsisAnnotator -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-cord19-reader/component.meta b/jcore-cord19-reader/component.meta index 3fd15f733..fd42cd349 100644 --- a/jcore-cord19-reader/component.meta +++ b/jcore-cord19-reader/component.meta @@ -19,7 +19,7 @@ "maven-artifact": { "artifactId": "jcore-cord19-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe CORD-19 Reader" } diff --git a/jcore-cord19-reader/pom.xml b/jcore-cord19-reader/pom.xml index a5a7d9d00..833b22db6 100644 --- a/jcore-cord19-reader/pom.xml +++ b/jcore-cord19-reader/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -48,8 +48,8 @@ ${jcore-utilities-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine org.assertj diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java index 5789d935b..3b8b9ff35 100644 --- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java +++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java @@ -73,7 +73,11 @@ public void getNext(JCas jCas) throws CollectionException { Path p = currentFileBatch.get(currentBatchIndex); if (p != Cord19FileVisitor.END) { JCoReURI uri = new JCoReURI(jCas); - uri.setUri(p.toUri().toString()); + try { + uri.setUri(p.toUri().toString()); + } catch (NullPointerException e) { + log.error("Could not retrieve URI string for path {}, resolved URI {}", p, p!= null ? 
p.toUri() : ""); + } uri.addToIndexes(); ++completed; } diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java index 54a9f1d5c..60939db2b 100644 --- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java +++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java @@ -121,31 +121,40 @@ private void addBody(JCas jCas, StringBuilder doctext, Cord19Document document) } private void addAbstract(JCas jCas, StringBuilder doctext, Cord19Document document) { - List sections = new ArrayList<>(document.getAbstr().size()); - int abstractBegin = doctext.length(); - // Stores the end of the last paragraph before the newline - int lastEnd = 0; - for (Paragraph p : document.getAbstr()) { - int paragraphBegin = doctext.length(); - AbstractSection as = new AbstractSection(jCas, paragraphBegin, doctext.length() + p.getText().length()); - doctext.append(p.getText()); - lastEnd = doctext.length(); - doctext.append(linesep); - AbstractSectionHeading asHeading = new AbstractSectionHeading(jCas); - asHeading.setTitleType("abstract"); - asHeading.setLabel(p.getSection()); - as.setAbstractSectionHeading(asHeading); - sections.add(as); - addReferences(p, Paragraph::getRefSpans, paragraphBegin, jCas); - addReferences(p, Paragraph::getEqSpans, paragraphBegin, jCas); - addReferences(p, Paragraph::getCiteSpans, paragraphBegin, jCas); - } - if (lastEnd - abstractBegin > 0) { - AbstractText abstractText = new AbstractText(jCas, abstractBegin, lastEnd); - abstractText.setAbstractType("main"); - abstractText.setStructuredAbstractParts(JCoReTools.addToFSArray(null, sections)); - abstractText.addToIndexes(); - doctext.append(linesep); + MetadataRecord metadataRecord = metadataIdMap.get(document.getPaperId()); + if (metadataRecord != null && metadataRecord.getAbstractText() != null && 
!metadataRecord.getAbstractText().isBlank()) { + String abstractText = metadataRecord.getAbstractText(); + AbstractText abstractAnnotation = new AbstractText(jCas, doctext.length(), doctext.length() + abstractText.length()); + abstractAnnotation.setAbstractType("main"); + abstractAnnotation.addToIndexes(); + doctext.append(abstractText); + } else { + List sections = new ArrayList<>(document.getAbstr().size()); + int abstractBegin = doctext.length(); + // Stores the end of the last paragraph before the newline + int lastEnd = 0; + for (Paragraph p : document.getAbstr()) { + int paragraphBegin = doctext.length(); + AbstractSection as = new AbstractSection(jCas, paragraphBegin, doctext.length() + p.getText().length()); + doctext.append(p.getText()); + lastEnd = doctext.length(); + doctext.append(linesep); + AbstractSectionHeading asHeading = new AbstractSectionHeading(jCas); + asHeading.setTitleType("abstract"); + asHeading.setLabel(p.getSection()); + as.setAbstractSectionHeading(asHeading); + sections.add(as); + addReferences(p, Paragraph::getRefSpans, paragraphBegin, jCas); + addReferences(p, Paragraph::getEqSpans, paragraphBegin, jCas); + addReferences(p, Paragraph::getCiteSpans, paragraphBegin, jCas); + } + if (lastEnd - abstractBegin > 0) { + AbstractText abstractText = new AbstractText(jCas, abstractBegin, lastEnd); + abstractText.setAbstractType("main"); + abstractText.setStructuredAbstractParts(JCoReTools.addToFSArray(null, sections)); + abstractText.addToIndexes(); + doctext.append(linesep); + } } } @@ -164,7 +173,7 @@ private void addReferences(Paragraph p, Function> private void addTitle(JCas jCas, Cord19Document document, MetadataRecord metadataRecord, StringBuilder doctext) { if (metadataRecord != null) { String title = metadataRecord.getTitle(); - if (title != null) { + if (title != null && !title.isBlank()) { addTitle(jCas, title, doctext); } } else { @@ -221,9 +230,10 @@ private void readMetaData(String metadataFile) { String cordUid = 
record.get("cord_uid"); String sha = record.get("sha"); String title = record.get("title"); + String abstractText = record.get("abstract"); String pmcid = record.get("pmcid"); String pmid = record.get("pubmed_id"); - MetadataRecord metadataRecord = new MetadataRecord(cordUid, sha, pmcid, pmid, title); + MetadataRecord metadataRecord = new MetadataRecord(cordUid, sha, pmcid, pmid, title, abstractText); for (String hash : metadataRecord.hashes) metadataIdMap.put(hash, metadataRecord); if (pmcid != null) @@ -244,13 +254,19 @@ private static class MetadataRecord { private final String pmid; private final String[] hashes; private final String title; + private String abstractText; - public MetadataRecord(String cordUid, String sha, String pmcid, String pmid, String title) { + public MetadataRecord(String cordUid, String sha, String pmcid, String pmid, String title, String abstractText) { this.cordUid = cordUid; this.pmcid = pmcid; this.pmid = pmid; this.title = title; this.hashes = Arrays.stream(sha.split(";")).map(String::trim).toArray(String[]::new); + this.abstractText = abstractText; + } + + public String getAbstractText() { + return abstractText; } public String getCordUid() { diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java index bfe873c48..d35bc534e 100644 --- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java +++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java @@ -19,6 +19,16 @@ public class TabFigRef { private String text; private String type; private String latex; + private String html; + + public String getHtml() { + + return html; + } + + public void setHtml(String html) { + this.html = html; + } public String getLatex() { return latex; diff --git 
a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml index 90f5da426..4cdd4203f 100644 --- a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml +++ b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml @@ -5,7 +5,7 @@ JCoRe CORD-19 Multiplier Reader This component reads file paths to JSON files and the CORD-19 (https://pages.semanticscholar.org/coronavirus-research) meta data file to send them to CAS multipliers. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml index b539b1511..c3da5e650 100644 --- a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml +++ b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml @@ -6,7 +6,7 @@ JCoRe CORD-19 CAS Multiplier This component reads the CORD-19 (https://pages.semanticscholar.org/coronavirus-research) JSON format into UIMA CAS instances. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java index b5922a816..0453a1cde 100644 --- a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java +++ b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java @@ -15,7 +15,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.nio.file.Path; import java.util.Collection; @@ -63,9 +63,8 @@ private void checkSecondDocument(JCas cas) { assertThat(documentTitles.get(0)).extracting(Annotation::getCoveredText).isEqualTo("Recombinant M protein-based ELISA test for detection of antibodies to canine coronavirus"); AbstractText abstractText = JCasUtil.selectSingle(cas, AbstractText.class); - assertThat(abstractText.getCoveredText()).startsWith("The membrane (M) protein of canine"); - assertThat(abstractText.getCoveredText()).endsWith("antibodies to CCoV in dog sera. 
#"); - assertThat(abstractText.getStructuredAbstractParts()).hasSize(1); + assertThat(abstractText.getCoveredText()).startsWith("Abstract The membrane (M) protein of canine"); + assertThat(abstractText.getCoveredText()).endsWith("antibodies to CCoV in dog sera."); Collection paragraphs = JCasUtil.select(cas, Paragraph.class); assertThat(paragraphs).hasSize(19); diff --git a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java index dba932cac..5e39b79d0 100644 --- a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java +++ b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java @@ -4,7 +4,7 @@ import de.julielab.jcore.reader.cord19.jsonformat.Affiliation; import de.julielab.jcore.reader.cord19.jsonformat.Author; import de.julielab.jcore.reader.cord19.jsonformat.Cord19Document; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.file.Path; diff --git a/jcore-coreference-writer/LICENSE b/jcore-coreference-writer/LICENSE new file mode 100644 index 000000000..7190118b3 --- /dev/null +++ b/jcore-coreference-writer/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2021, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-coreference-writer/README.md b/jcore-coreference-writer/README.md new file mode 100644 index 000000000..da767a4d1 --- /dev/null +++ b/jcore-coreference-writer/README.md @@ -0,0 +1,26 @@ +# JCoRe Acronym Writer + +**Descriptor Path**: +``` +de.julielab.jcore.consumer.acronyms.desc.jcore-acronym-writer +``` + +Writes acronyms annotations from the CAS to a text file format. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| OutputFile | string | true | false | Path to the ourput file. | + + +**2. 
Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.Abbreviation | `+` | | + + + diff --git a/jcore-coreference-writer/component.meta b/jcore-coreference-writer/component.meta new file mode 100644 index 000000000..bbfba5b64 --- /dev/null +++ b/jcore-coreference-writer/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "consumer" + ], + "description": "Writes coreference annotations from the CAS to a text file format.", + "descriptors": [ + { + "category": "consumer", + "location": "de.julielab.jcore.consumer.coreference.desc.jcore-coreference-writer" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-coreference-writer", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe Coreference Writer" +} diff --git a/jcore-coreference-writer/pom.xml b/jcore-coreference-writer/pom.xml new file mode 100644 index 000000000..ad4aac828 --- /dev/null +++ b/jcore-coreference-writer/pom.xml @@ -0,0 +1,61 @@ + + + + 4.0.0 + jcore-coreference-writer + jar + + + de.julielab + jcore-base + 2.6.0 + + + + + de.julielab + jcore-descriptor-creator + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + de.julielab + julielab-java-utilities + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-types-version} + + + org.junit.jupiter + junit-jupiter-engine + + + JCoRe Coreference Writer + + JULIE Lab Jena, Germany + http://www.julielab.de + + + + BSD-2-Clause + https://opensource.org/licenses/BSD-2-Clause + + + https://github.com/JULIELab/jcore-base/tree/master/jcore-coreference-writer + Writes coreference annotations from the CAS to a text file format. 
+ diff --git a/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java new file mode 100644 index 000000000..c85dcfa82 --- /dev/null +++ b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java @@ -0,0 +1,87 @@ +package de.julielab.jcore.consumer.coreference; + +import de.julielab.java.utilities.FileUtilities; +import de.julielab.jcore.types.CorefExpression; +import de.julielab.jcore.types.CorefRelation; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.commons.io.IOUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CASRuntimeException; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.Iterator; + +@ResourceMetaData(name = "JCoRe Coreference Writer", description = "Writes co-reference annotation to a text file.") +public class CoreferenceWriter extends JCasAnnotator_ImplBase { + + public static final String PARAM_OUTPUTFILE = "OutputFile"; + + @ConfigurationParameter(name = PARAM_OUTPUTFILE) + private String outputFile; + private OutputStream os; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + outputFile = (String) aContext.getConfigParameterValue(PARAM_OUTPUTFILE); + try { + os = FileUtilities.getOutputStreamToFile(new File(outputFile)); + } catch (IOException e) { + 
throw new ResourceInitializationException(e); + } + } + + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException { + try { + String pubmedId = JCoReTools.getDocId(jcas); + FSIterator it = jcas.getAnnotationIndex(CorefRelation.type).iterator(); + + int relcount = 0; + while (it.hasNext()) { + CorefRelation rel = it.next(); + de.julielab.jcore.types.Annotation anaphora = rel.getAnaphora(); + + String abbrId = "Ana" + relcount; + + IOUtils.write(String.join("\t", pubmedId, abbrId, String.valueOf(anaphora.getBegin()), + String.valueOf(anaphora.getEnd())) + "\n", os, "UTF-8"); + + Iterator antecedentsIt = rel.getAntecedents() != null ? rel.getAntecedents().iterator() : null; + while (antecedentsIt != null && antecedentsIt.hasNext()) { + CorefExpression antecedent = (CorefExpression) antecedentsIt.next(); + if (antecedent != null) { + String antecedentGroup = "Ant" + relcount; + IOUtils.write(String.join("\t", pubmedId, antecedentGroup, String.valueOf(antecedent.getBegin()), + String.valueOf(antecedent.getEnd())) + "\n", os, "UTF-8"); + } + } + + + ++relcount; + } + } catch (CASRuntimeException | IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException { + try { + os.close(); + } catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + +} diff --git a/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-coreference-writer.xml b/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-coreference-writer.xml new file mode 100644 index 000000000..b31bb30bb --- /dev/null +++ b/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-coreference-writer.xml @@ -0,0 +1,33 @@ + + + org.apache.uima.java + true + de.julielab.jcore.consumer.coreference.CoreferenceWriter + + JCoRe Coreference Writer + Writes 
coreference annotation to a text file. + 2.6.0 + + + OutputFile + + String + false + true + + + + + + + + + + + + true + true + false + + + \ No newline at end of file diff --git a/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java b/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java new file mode 100644 index 000000000..7b7bf0429 --- /dev/null +++ b/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java @@ -0,0 +1,10 @@ + +package de.julielab.jcore.consumer.coreference; + +/** + * Unit tests for jcore-coreference-writer. + * + */ +public class CoreferenceWriterTest { +// TODO +} diff --git a/jcore-cpe-db-runner/pom.xml b/jcore-cpe-db-runner/pom.xml index d84ab5a84..62e879169 100644 --- a/jcore-cpe-db-runner/pom.xml +++ b/jcore-cpe-db-runner/pom.xml @@ -71,8 +71,8 @@ ${project.parent.version} - junit - junit + org.junit.jupiter + junit-jupiter-engine https://github.com/JULIELab/jcore-base/tree/master/jcore-cpe-db-runner diff --git a/jcore-ct-reader/component.meta b/jcore-ct-reader/component.meta index a131ea835..6e0600b4f 100644 --- a/jcore-ct-reader/component.meta +++ b/jcore-ct-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-ct-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Clinical Trials Reader" } diff --git a/jcore-ct-reader/pom.xml b/jcore-ct-reader/pom.xml index bfc239518..4ea1f5969 100644 --- a/jcore-ct-reader/pom.xml +++ b/jcore-ct-reader/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -41,8 +41,8 @@ ${jcore-utilities-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Clinical Trials Reader diff --git a/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml 
b/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml index 100df0acd..b0eaa2ae4 100644 --- a/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml +++ b/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml @@ -5,7 +5,7 @@ JCoRe Clinical Trials Reader This component reads the XML format provided by ClinicalTrials.gov. To this end, the JCoRe type system contains a number of types specifically created for this kind of document. Note that the CAS text created by this reader might be confusing without checking the corresponding annotations. This is due to the fact that the CT XML contains multiple enumerations which are not very well reflected in plain text. Also, enumerations with subitems, such as the outcomes, are not displayed in the expected groups of items. Instead, each item type is displayed separately. This could be changed, if necessary. Since all items are correctly annotated by their category, this might not even be an issue, depending on the downstream tasks. 
- 2.5.1-SNAPSHOT + 2.6.0 InputDirectory diff --git a/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java b/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java index b1aa75967..140b19874 100644 --- a/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java +++ b/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java @@ -11,13 +11,13 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.Collection; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; /** * Unit tests for jcore-ct-reader. diff --git a/jcore-db-checkpoint-ae/README.md b/jcore-db-checkpoint-ae/README.md index 6a4ed4f4b..a74f91d53 100644 --- a/jcore-db-checkpoint-ae/README.md +++ b/jcore-db-checkpoint-ae/README.md @@ -2,7 +2,8 @@ **Descriptor Path**: ``` -de.julielab.desc.jcore-db-checkpoint-ae +de.julielab.jcore.ae.checkpoint.desc.jcore-db-checkpoint-ae +de.julielab.jcore.ae.checkpoint.desc.jcore-db-checkpoint-consumer ``` This is a JeDiS[1] component. It can be used to set the 'last component' column in a subset table. This help to keep track of the pipeline status. 
diff --git a/jcore-db-checkpoint-ae/component.meta b/jcore-db-checkpoint-ae/component.meta index b703ae5c4..db83ca2a7 100644 --- a/jcore-db-checkpoint-ae/component.meta +++ b/jcore-db-checkpoint-ae/component.meta @@ -19,7 +19,7 @@ "maven-artifact": { "artifactId": "jcore-db-checkpoint-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Database Checkpoint AE" } diff --git a/jcore-db-checkpoint-ae/pom.xml b/jcore-db-checkpoint-ae/pom.xml index 3cac45687..13fede4b9 100644 --- a/jcore-db-checkpoint-ae/pom.xml +++ b/jcore-db-checkpoint-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jedis-parent - 2.5.1-SNAPSHOT + 2.6.0 ../jedis-parent diff --git a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java index 1a70c23cd..cf6f77e9e 100644 --- a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java +++ b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java @@ -69,6 +69,7 @@ public class DBCheckpointAE extends JCasAnnotator_ImplBase { */ @Override public void initialize(final UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); componentDbName = (String) aContext.getConfigParameterValue(PARAM_CHECKPOINT_NAME); dbcConfigPath = (String) aContext.getConfigParameterValue(PARAM_COSTOSYS_CONFIG); indicateFinished = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_INDICATE_FINISHED)).orElse(false); @@ -108,7 +109,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { super.collectionProcessComplete(); - log.debug("BatchProcessComplete called, stashing {} documents to be ready for marked as being finished", docIds.size()); + log.debug("CollectionProcessComplete called, stashing 
{} documents to be ready for marked as being finished", docIds.size()); if (indicateFinished) docReleaseCheckpoint.release(jedisSyncKey, docIds.stream()); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { @@ -120,6 +121,7 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { } private void customBatchProcessingComplete() throws AnalysisEngineProcessException { + log.debug("CustomBatchProcessComplete called, stashing {} documents to be ready for marked as being finished", docIds.size()); if (indicateFinished) docReleaseCheckpoint.release(jedisSyncKey, docIds.stream()); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { @@ -134,6 +136,7 @@ private void customBatchProcessingComplete() throws AnalysisEngineProcessExcepti */ @Override public void process(final JCas aJCas) throws AnalysisEngineProcessException { + log.trace("Processing jCas instance " + aJCas); DocumentId documentId; try { final DBProcessingMetaData dbProcessingMetaData = JCasUtil.selectSingle(aJCas, DBProcessingMetaData.class); @@ -198,13 +201,25 @@ private void setLastComponent(CoStoSysConnection conn, String sqlMarkIsProcessed = String.format("UPDATE %s SET %s='%s', %s=TRUE, %s=FALSE WHERE %s", subsetTableName, Constants.LAST_COMPONENT, componentDbName, Constants.IS_PROCESSED, Constants.IN_PROCESS, primaryKeyPsString); if (!documentIdsToSetLastComponent.isEmpty()) { - log.debug("Setting the last component to {} for {} documents", componentDbName, documentIdsToSetLastComponent.size()); + log.debug("Setting the last component to '{}' for {} documents", componentDbName, documentIdsToSetLastComponent.size()); updateSubsetTable(conn, documentIdsToSetLastComponent, sqlSetLastComponent); } if (markIsProcessed) { - log.debug("Marking {} documents to having been processed by component \"{}\".", documentIdsToSetLastComponent.size(), componentDbName); + log.debug("Marking {} documents to having been processed by component \"{}\".", 
processedDocumentIds.size(), componentDbName); + log.debug("SQL: {}", sqlMarkIsProcessed); + log.trace("Marking the following document IDS as having been processed: {}", processedDocumentIds); updateSubsetTable(conn, processedDocumentIds, sqlMarkIsProcessed); } + try { + log.debug("Connection is auto commit: {}", conn.getAutoCommit()); + if (!conn.getAutoCommit()) { + log.debug("Committing changes"); + conn.commit(); + } + } catch (SQLException e) { + log.error("Could not commit the document processing status changes.", e); + throw new AnalysisEngineProcessException(e); + } } private void updateSubsetTable(CoStoSysConnection conn, Collection documentIdsToMark, String sql) throws AnalysisEngineProcessException { @@ -221,6 +236,7 @@ private void updateSubsetTable(CoStoSysConnection conn, Collection d ps.addBatch(); } try { + log.debug("Executing SQL command batch for being processed."); ps.executeBatch(); } catch (BatchUpdateException e) { if (e.getMessage().contains("deadlock detected")) { diff --git a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java index e67750ed5..994063406 100644 --- a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java +++ b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java @@ -1,11 +1,11 @@ package de.julielab.jcore.ae.checkpoint; -import com.google.common.collect.HashMultiset; -import com.google.common.collect.Multiset; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.HashMap; import java.util.HashSet; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -14,7 +14,7 @@ *

This is class is a synchronization point for JeDIS components to report documents as being completely finished * with processing.

*

Problem explanation: This synchronization is necessary because most database operating components work in batch mode for - * performance reasons. However, if multiple components use batching with might be out of sync due to different + * performance reasons. However, if multiple components use batching wich might be out of sync due to different * batch sizes and possibly other factors, one component may have sent a batch of document data to the database * while other components have not at a particular point in time. If at such a time point the pipeline crashes * or is manually interrupted, the actually written data is incoherent in the sense that some components have sent @@ -33,17 +33,18 @@ public class DocumentReleaseCheckpoint { "This is useful when document data is sent batchwise to the database by multiple components: In the case of a crash or manual cancellation of a pipeline run without synchronization is might happen " + "that some components have sent their data and others haven't at the time of termination. To avoid an inconsistent database state," + "a document will only be marked as finished " + - "processed in the JeDIS subset table if all synchronied components in the pipeline have released the document. " + + "processed in the JeDIS subset table if all synchronized components in the pipeline have released the document. " + "This is done by the DBCheckpointAE which must be at the end of the pipeline and have the 'IndicateFinished' parameter set to 'true'. 
" + "Synchronized components are those that disclose this parameter and have a value set to it."; public static final String PARAM_JEDIS_SYNCHRONIZATION_KEY = "JedisSynchronizationKey"; private final static Logger log = LoggerFactory.getLogger(DocumentReleaseCheckpoint.class); private static DocumentReleaseCheckpoint checkpoint; - private Multiset releasedDocuments; + private Map> releasedDocuments; private Set registeredComponents; + private long lastwarning = 1000; private DocumentReleaseCheckpoint() { - releasedDocuments = HashMultiset.create(); + releasedDocuments = new HashMap<>(); registeredComponents = new HashSet<>(); } @@ -82,7 +83,15 @@ public void release(String componentKey, Stream releasedDocumentIds) if (!registeredComponents.contains(componentKey)) throw new IllegalArgumentException("No component is registered for key " + componentKey); synchronized (releasedDocuments) { - releasedDocumentIds.forEach(d -> releasedDocuments.add(d)); + releasedDocumentIds.forEach(d -> releasedDocuments.compute(d, (k, v) -> { + if (v == null) { + Set ret = new HashSet<>(); + ret.add(componentKey); + return ret; + } + v.add(componentKey); + return v; + })); } } @@ -99,13 +108,20 @@ public Set getReleasedDocumentIds() { // Get all documents released by all components Set returnedIds; synchronized (releasedDocuments) { - returnedIds = this.releasedDocuments.entrySet().stream().filter(e -> e.getCount() == getNumberOfRegisteredComponents()).map(Multiset.Entry::getElement).collect(Collectors.toSet()); + log.trace("The following {} components are registered for document release: {}", getNumberOfRegisteredComponents(), registeredComponents); + log.trace("Released document counts: {}", this.releasedDocuments); + returnedIds = this.releasedDocuments.keySet().stream().filter(k -> this.releasedDocuments.get(k).containsAll(this.registeredComponents)).collect(Collectors.toSet()); + log.trace("Final Document IDs to release: {}", returnedIds); // Remove the completely released 
documents from the pool of potentially not yet completely released documents. - returnedIds.forEach(id -> this.releasedDocuments.remove(id, Integer.MAX_VALUE)); + returnedIds.forEach(id -> this.releasedDocuments.remove(id)); } log.debug("Returning {} documents released by all registered components. {} document IDs remain that have not yet been released by all registered components.", returnedIds.size(), this.releasedDocuments.size()); - if (this.releasedDocuments.size() > 1000) - log.warn("The number of document IDs that have not been released by all registered components has grown to {}. If it does not increase again, there is likely an errorneous component which does not release its documents.", releasedDocuments.size()); + if (this.releasedDocuments.size() > lastwarning) { + log.warn("The number of document IDs that have not been released by all registered components has grown to {}. If it does not decrease again, there is likely an errorneous component which does not release its documents. Currently registered components: {}", releasedDocuments.size(), registeredComponents); + lastwarning *= 2; + } else if (this.releasedDocuments.size() < 50) { + lastwarning = 1000; + } return returnedIds; } diff --git a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml index 31e3605e8..6340c7355 100644 --- a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml +++ b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml @@ -6,7 +6,7 @@ JCoRe Database Checkpoint AE This component can be used when using a JCoRe database reader that reads from a CoStoSys/JeDIS subset. Enters the configured component name in the 'last component' column. Can also mark documents as being completely processed. 
- 2.5.1-SNAPSHOT + 2.6.0 CheckpointName diff --git a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml index 5ac25514c..be7df82ea 100644 --- a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml +++ b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml @@ -6,7 +6,7 @@ JCoRe Database Checkpoint Writer This component can be used when using a JCoRe database reader that reads from a CoStoSys/JeDIS subset. Enters the configured component name in the 'last component' column. Can also mark documents as being completely processed. - 2.5.1-SNAPSHOT + 2.6.0 CheckpointName diff --git a/jcore-db-reader/component.meta b/jcore-db-reader/component.meta index a6793b944..1272e620f 100644 --- a/jcore-db-reader/component.meta +++ b/jcore-db-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-db-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Database Reader" } diff --git a/jcore-db-reader/pom.xml b/jcore-db-reader/pom.xml index 2129cc7e0..577dca679 100644 --- a/jcore-db-reader/pom.xml +++ b/jcore-db-reader/pom.xml @@ -1,16 +1,17 @@ - + jedis-parent de.julielab - 2.5.1-SNAPSHOT + 2.6.0 ../jedis-parent 4.0.0 jcore-db-reader JCoRe Database Reader Abstract database reader for database driven processing - + de.julielab @@ -44,7 +45,7 @@ de.julielab jcore-xml-mapper - 2.5.1-SNAPSHOT + 2.6.0 test @@ -57,38 +58,114 @@ jcore-db-test-utilities test + + org.junit.jupiter + junit-jupiter-engine + org.apache.uima uima-ducc-user ${uima-ducc-version} - org.apache.uimauimaj-as-activemq - org.apache.activemqactivemq-camel - org.apache.camelcamel-core - org.apache.camelcamel-xstream - org.apache.commonscommons-pool2 - 
org.eclipse.jettyjetty-server - xpp3xpp3 - org.apache.httpcomponentshttpclient - xmlpullxmlpull - org.apache.httpcomponentshttpclient-cache - org.apache.httpcomponentshttpcore - org.slf4jjcl-over-slf4j - org.apache.camelcamel-context - org.apache.camelcamel-http4 - org.apache.camelcamel-http - org.apache.camelcamel-http-common - org.apache.camelcamel-jetty-common - org.apache.camelcamel-mina - org.apache.camelcamel-xmlbeans - org.apache.minamina-core - org.apache.camelcamel-servlet - org.apache.camelcamel-test-spring - org.apache.camelcamel-test - org.apache.camelcamel-stream + + org.apache.uima + uimaj-as-activemq + + + org.apache.activemq + activemq-camel + + + org.apache.camel + camel-core + + + org.apache.camel + camel-xstream + + + org.apache.commons + commons-pool2 + + + org.eclipse.jetty + jetty-server + + + xpp3 + xpp3 + + + org.apache.httpcomponents + httpclient + + + xmlpull + xmlpull + + + org.apache.httpcomponents + httpclient-cache + + + org.apache.httpcomponents + httpcore + + + org.slf4j + jcl-over-slf4j + + + org.apache.camel + camel-context + + + org.apache.camel + camel-http4 + + + org.apache.camel + camel-http + + + org.apache.camel + camel-http-common + + + org.apache.camel + camel-jetty-common + + + org.apache.camel + camel-mina + + + org.apache.camel + camel-xmlbeans + + + org.apache.mina + mina-core + + + org.apache.camel + camel-servlet + + + org.apache.camel + camel-test-spring + + + org.apache.camel + camel-test + + + org.apache.camel + camel-stream + - + BSD-2-Clause diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java index 195e30de7..2dcc1e0d9 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java @@ -9,15 +9,20 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import 
org.apache.uima.fit.descriptor.OperationalProperties; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileNotFoundException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; /** * A multiplier retrieving feature structures of type of {@link RowBatch} in its {@link #process(JCas)} method. @@ -34,8 +39,9 @@ "populate CASes with them. This component is a part of the Jena Document Information System, JeDIS.", vendor = "JULIE Lab Jena, Germany", copyright = "JULIE Lab Jena, Germany") @OperationalProperties(outputsNewCases = true) +@TypeCapability(inputs = {"de.julielab.jcore.types.casmultiplier.RowBatch"}) public abstract class DBMultiplier extends JCasMultiplier_ImplBase { - +private final static Logger log = LoggerFactory.getLogger(DBMultiplier.class); protected DataBaseConnector dbc; protected DBCIterator documentDataIterator; protected String[] tables; @@ -56,7 +62,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept initialized = false; } - private DataBaseConnector getDataBaseConnector(String costosysConfig) throws AnalysisEngineProcessException { + protected DataBaseConnector getDataBaseConnector(String costosysConfig) throws AnalysisEngineProcessException { DataBaseConnector dbc; try { dbc = new DataBaseConnector(costosysConfig); @@ -70,6 +76,10 @@ private DataBaseConnector getDataBaseConnector(String costosysConfig) throws Ana @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { RowBatch rowbatch = JCasUtil.selectSingle(aJCas, RowBatch.class); + if (rowbatch.getIdentifiers() == 
null) + throw new AnalysisEngineProcessException(new IllegalArgumentException("The identifiers of the passed row batch are null.")); + if (rowbatch.getIdentifiers().size() == 0) + throw new AnalysisEngineProcessException(new IllegalArgumentException("The identifiers of the passed row batch are empty.")); tables = rowbatch.getTables().toStringArray(); schemaNames = rowbatch.getTableSchemas().toStringArray(); tableName = rowbatch.getTableName(); @@ -98,6 +108,9 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { StringArray primaryKey = (StringArray) identifiers.get(i); documentIdsForQuery.add(primaryKey.toArray()); } + if (log.isTraceEnabled()) { + log.trace("Received document IDs: {}", documentIdsForQuery.stream().map(o -> Arrays.stream(o).map(Object::toString).collect(Collectors.joining(","))).collect(Collectors.joining(" ; "))); + } documentDataIterator = dbc.retrieveColumnsByTableSchema(documentIdsForQuery, tables, schemaNames); diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java index 83370feae..37922d46d 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java @@ -19,6 +19,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -49,6 +50,9 @@ public class DBMultiplierReader extends DBSubsetReader { @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); + // reset the state in case of reconfigure() + retriever = null; + dataTableDocumentIds = null; // Check whether a subset table name or a data table name was given. 
if (readDataTable) { @@ -62,9 +66,12 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti } @Override - public void getNext(JCas jCas) throws CollectionException { + public void getNext(JCas jCas) throws CollectionException, IOException { + log.trace("jCas instance: " + jCas); log.trace("Requesting next batch of document IDs from the database."); List idList = getNextDocumentIdBatch(); + if (idList.isEmpty()) + throw new CollectionException(new IllegalStateException("There are no documents to read in the database. Please call hasNext() to check if there is more data to read. Retriever: " + retriever)); log.trace("Received a list of {} ID from the database.", idList.size()); RowBatch rowbatch = new RowBatch(jCas); FSArray ids = new FSArray(jCas, idList.size()); @@ -114,12 +121,16 @@ public void getNext(JCas jCas) throws CollectionException { * * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#hasNext() */ - public boolean hasNext() { + public boolean hasNext() throws IOException, CollectionException { boolean hasNext = this.hasNext; - if (retriever != null) + if (retriever != null) { + if (retriever.isConsumed()) + retriever.run(); hasNext = !retriever.getDocumentIds().isEmpty(); + } if (!hasNext) close(); + log.trace("hasNext returns {}", hasNext); return hasNext; } @@ -169,6 +180,7 @@ private List getNextFromSubset() { retriever = new DBMultiplierReader.RetrievingThread(); } idList = retriever.getDocumentIds(); + retriever.setConsumed(true); // While returning the current set of IDs, already fetch the next batch if (fetchIdsProactively) retriever = new DBMultiplierReader.RetrievingThread(); @@ -182,7 +194,7 @@ public Progress[] getProgress() { } @Override - public void close() { + public void close() throws IOException { if (dbc != null) dbc.close(); dbc = null; @@ -207,18 +219,32 @@ public void close() { */ protected class RetrievingThread extends Thread { private List ids; + private long timestamp = 
System.currentTimeMillis(); + private boolean consumed; public RetrievingThread() { // Only fetch ID batches in advance when the parameter is set to // true. if (fetchIdsProactively) { - log.debug("Fetching ID batches in a background thread."); + log.debug("[{}] Fetching ID batches in a background thread.", timestamp); setName(DBMultiplierReader.class.getSimpleName() + " RetrievingThread (" + getName() + ")"); start(); + } else { + log.debug("[{}] Fetching ID batches without a background thread.", timestamp); + run(); } } + public boolean isConsumed() { + return consumed; + } + + public void setConsumed(boolean consumed) { + this.consumed = consumed; + } + public void run() { + consumed = false; // Remember: If the Limit parameter is set, totalDocumentCount is // that limit (or the remaining number of documents, if that's // lower). @@ -228,14 +254,14 @@ public void run() { int limit = Math.min(batchSize, totalDocumentCount - numberFetchedDocIDs); try { try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { - log.trace("Using connection {} to retrieveAndMark", ignored.getConnection()); + log.trace("[{}] Using connection {} to retrieveAndMark", timestamp, ignored.getConnection()); ids = dbc.retrieveAndMark(tableName, getClass().getSimpleName(), hostName, pid, limit, selectionOrder); if (log.isTraceEnabled()) { - log.trace("Retrieved the following IDs from the database: {}", ids.stream().map(Arrays::toString).collect(Collectors.joining(", "))); + log.trace("[{}] Retrieved the following IDs from the database: {}", timestamp, ids.stream().map(Arrays::toString).collect(Collectors.joining(", "))); } } numberFetchedDocIDs += ids.size(); - log.debug("Retrieved {} document IDs to fetch from the database.", ids.size()); + log.debug("[{}] Retrieved {} document IDs to fetch from the database.", timestamp, ids.size()); } catch (TableSchemaMismatchException e) { log.error("Table schema mismatch: The active table schema {} specified in the CoStoSys configuration" + 
" file {} does not match the columns in the subset table {}: {}", dbc.getActiveTableSchema(), @@ -249,22 +275,36 @@ public void run() { public List getDocumentIds() { // If we don't use this as a background thread, we have to get the - // IDs now in a sequential manner. + // IDs now in a classic sequential manner. if (!fetchIdsProactively) { // Use run as we don't have a use for real threads anyway. log.debug("Fetching new documents (without employing a background thread)."); - run(); } try { // If this is a background thread started with start(): Wait for // the IDs to be retrieved, i.e. that run() ends. log.debug("Waiting for the background thread to finish fetching documents to return them."); join(); + log.debug("[{}] Delivering {} document IDs", timestamp, ids.size()); return ids; } catch (InterruptedException e) { - log.error("Background ID fetching thread was interrupted", e); + e.printStackTrace(); } return null; +// try { +// if (fetchIdsProactively) {// If this is a background thread started with start(): Wait for +// // the IDs to be retrieved, i.e. that run() ends. 
+// log.debug("[{}] Waiting for the background thread to finish fetching documents to return them.", timestamp); +// join(); +// } +// log.debug("[{}] Delivering {} document IDs", timestamp, ids.size()); +// List ret = ids; +// ids = Collections.emptyList(); +// return ret; +// } catch (InterruptedException e) { +// log.error("Background ID fetching thread was interrupted", e); +// } +// return null; } } diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReader.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReader.java index 5a21db4be..798d24782 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReader.java +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReader.java @@ -118,17 +118,20 @@ public abstract class DBReader extends DBSubsetReader { private DBCIterator xmlBytes; public static String setDBProcessingMetaData(DataBaseConnector dbc, boolean readDataTable, String tableName, byte[][] data, JCas cas) { - String pkString = null; // remove previously added dbMetaData JCasUtil.select(cas, DBProcessingMetaData.class).forEach(x -> x.removeFromIndexes()); DBProcessingMetaData dbMetaData = new DBProcessingMetaData(cas); List pkIndices = dbc.getPrimaryKeyIndices(); StringArray pkArray = new StringArray(cas, pkIndices.size()); + StringBuilder pkBuilder = new StringBuilder(); for (int i = 0; i < pkIndices.size(); ++i) { Integer index = pkIndices.get(i); String pkElementValue = new String(data[index], Charset.forName("UTF-8")); pkArray.set(i, pkElementValue); + pkBuilder.append(pkElementValue); + if (i < pkIndices.size() - 1) + pkBuilder.append(","); } if (log.isDebugEnabled()) log.trace("Setting primary key for DBProcessingMetaData to {}", Arrays.toString(pkArray.toArray())); @@ -142,10 +145,9 @@ public static String setDBProcessingMetaData(DataBaseConnector dbc, boolean read } else { log.trace("Not setting the subset to DBProcessingMetaData because reading the data table is set to {}", 
readDataTable); } - - dbMetaData.addToIndexes(); - return pkString; + + return pkBuilder.toString(); } @Override @@ -177,7 +179,7 @@ public boolean hasNext() throws IOException, CollectionException { public byte[][] getNextArtifactData() throws CollectionException { log.trace("Fetching next document from the current database batch"); - byte[][] next = null; + byte[][] next; if (readDataTable) next = getNextFromDataTable(); else @@ -257,6 +259,7 @@ public void close() { * pipeline status field */ protected abstract String getReaderComponentName(); + /** *

* This class is charged with retrieving batches of document IDs and documents while previously fetched documents diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReaderBase.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReaderBase.java index 082909cb5..c46d6a105 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReaderBase.java +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReaderBase.java @@ -95,6 +95,8 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti } catch (FileNotFoundException e) { throw new ResourceInitializationException(e); } + + numberFetchedDocIDs = 0; } private void checkTableExists() throws ResourceInitializationException { diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/jmx/DBReaderInfo.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/jmx/DBReaderInfo.java new file mode 100644 index 000000000..0c341c642 --- /dev/null +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/jmx/DBReaderInfo.java @@ -0,0 +1,24 @@ +package de.julielab.jcore.reader.db.jmx; + +public class DBReaderInfo implements DBReaderInfoMBean{ + private String currentDocumentId; + private String componentId; + + public void setComponentId(String componentId) { + this.componentId = componentId; + } + + public void setCurrentDocumentId(String currentDocumentId) { + this.currentDocumentId = currentDocumentId; + } + + @Override + public String getCurrentDocumentId() { + return currentDocumentId; + } + + @Override + public String getComponentId() { + return componentId; + } +} diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/jmx/DBReaderInfoMBean.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/jmx/DBReaderInfoMBean.java new file mode 100644 index 000000000..d82b04c1a --- /dev/null +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/jmx/DBReaderInfoMBean.java @@ -0,0 +1,6 @@ 
+package de.julielab.jcore.reader.db.jmx; + +public interface DBReaderInfoMBean { + String getCurrentDocumentId(); + String getComponentId(); +} diff --git a/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml b/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml index 489b2b92a..81fd1c7a5 100644 --- a/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml +++ b/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml @@ -10,7 +10,7 @@ sent by this reader. The component leverages the corpus storage system (CoStoSys) for this purpose and is part of the Jena Document Information System, JeDIS. - 2.5.1-SNAPSHOT + 2.6.0 ResetTable @@ -162,7 +162,7 @@ BatchSize - 50 + 100 diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java index 579613897..e602844a9 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java @@ -1,7 +1,6 @@ package de.julielab.jcore.reader.db; import de.julielab.costosys.Constants; -import de.julielab.costosys.dbconnection.CoStoSysConnection; import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; import de.julielab.jcore.types.casmultiplier.RowBatch; @@ -12,34 +11,34 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import 
org.testcontainers.junit.jupiter.Testcontainers; import java.io.IOException; import java.sql.SQLException; import static de.julielab.jcore.reader.db.TableReaderConstants.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; +@Testcontainers public class DBMultiplierReaderTest { - @ClassRule - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:" + DataBaseConnector.POSTGRES_VERSION); - @BeforeClass + @BeforeAll public static void setup() throws SQLException { DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); - try (final CoStoSysConnection ignore = dbc.obtainOrReserveConnection()) { - DBTestUtils.setupDatabase(dbc, "src/test/resources/pubmedsample18n0001.xml.gz", "medline_2017", 20, postgres); - } - dbc.close(); + dbc.obtainOrReserveConnection(); + DBTestUtils.setupDatabase(dbc, "src/test/resources/pubmedsample18n0001.xml.gz", "medline_2017", 20, postgres); } @Test public void testDBMultiplierReader() throws UIMAException, IOException, ConfigurationException { - String costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); + String costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 2, postgres); CollectionReader reader = CollectionReaderFactory.createReader(DBMultiplierReader.class, PARAM_BATCH_SIZE, 5, PARAM_TABLE, "testsubset", diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java index 2816f9535..350f610fb 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java @@ -19,12 +19,13 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import 
org.apache.uima.jcas.JCas; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.File; import java.io.FileInputStream; @@ -32,14 +33,15 @@ import java.sql.SQLException; import static de.julielab.jcore.reader.db.TableReaderConstants.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; +@Testcontainers public class DBMultiplierTest { private final static Logger log = LoggerFactory.getLogger(DBMultiplierTest.class); - @ClassRule - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); - @BeforeClass + @BeforeAll public static void setup() throws SQLException, IOException { DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.reserveConnection(); diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java index 8ed7c86bf..c681a369f 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java @@ -12,37 +12,36 @@ import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import 
org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; -import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.sql.SQLException; import static de.julielab.jcore.reader.db.TableReaderConstants.*; -import static org.junit.Assert.*; - +import static org.junit.jupiter.api.Assertions.*; +@Testcontainers public class DBReaderTest { - @ClassRule - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); - @BeforeClass + @BeforeAll public static void setup() throws SQLException { DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.reserveConnection(); DBTestUtils.setupDatabase("src/test/resources/pubmedsample18n0001.xml.gz", "medline_2017", 20, postgres); - dbc.close(); } @Test public void testDBReader() throws UIMAException, IOException, ConfigurationException { - String costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); + String costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 2, postgres); CollectionReader reader = CollectionReaderFactory.createReader(DBReaderTestImpl.class, PARAM_BATCH_SIZE, 5, PARAM_TABLE, "testsubset", @@ -72,7 +71,9 @@ public void testReadDataTable() throws ConfigurationException, UIMAException, IO int docCount = 0; JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-types"); + int i = 0; while (reader.hasNext()) { + System.out.println(++i); reader.getNext(jCas.getCas()); assertNotNull(JCoReTools.getDocId(jCas)); ++docCount; @@ -94,7 +95,7 @@ public void getNext(JCas jCas) throws IOException, CollectionException { byte[][] artifactData = getNextArtifactData(); log.trace("Getting 
next document from database"); - XMLMapper xmlMapper = new XMLMapper(new FileInputStream(new File("src/test/resources/medline2016MappingFile.xml"))); + XMLMapper xmlMapper = new XMLMapper(new FileInputStream("src/test/resources/medline2016MappingFile.xml")); xmlMapper.parse(artifactData[1], artifactData[0], jCas); } } diff --git a/jcore-descriptor-creator/component.meta b/jcore-descriptor-creator/component.meta new file mode 100644 index 000000000..ac2d6ce7d --- /dev/null +++ b/jcore-descriptor-creator/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "reader" + ], + "description": "A simple project for the automatic creation of descriptors for UIMAfit-enabled components.", + "descriptors": [ + { + "category": "reader", + "location": "de.julielab.jcore.reader.testreader.desc.de.julielab.jcore.reader.testreader.TestReader" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-descriptor-creator", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe Descriptor Creator" +} diff --git a/jcore-descriptor-creator/pom.xml b/jcore-descriptor-creator/pom.xml index 0336524bf..5c82749e4 100644 --- a/jcore-descriptor-creator/pom.xml +++ b/jcore-descriptor-creator/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-descriptor-creator @@ -46,8 +46,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java b/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java index 3f5ca368a..69253935b 100644 --- a/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java +++ b/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java @@ -1,17 +1,6 @@ package de.julielab.jcore.misc; -import static java.util.stream.Collectors.joining; -import static java.util.stream.Collectors.toList; - -import 
java.io.*; -import java.lang.reflect.Modifier; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.function.Predicate; -import java.util.stream.Stream; - +import de.julielab.java.utilities.FileUtilities; import io.github.classgraph.ClassGraph; import io.github.classgraph.ScanResult; import org.apache.commons.lang.StringUtils; @@ -21,14 +10,26 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.FlowControllerFactory; import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.flow.FlowController; +import org.apache.uima.flow.FlowControllerDescription; import org.apache.uima.resource.ResourceCreationSpecifier; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; -import de.julielab.java.utilities.FileUtilities; +import java.io.*; +import java.lang.reflect.Modifier; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Stream; + +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toList; public class DescriptorCreator { @@ -38,35 +39,45 @@ public class DescriptorCreator { private static final String DESC = "desc"; public static void main(String[] args) throws Exception { + String basePackage = "de.julielab.jcore"; + if (args.length > 0) + basePackage = args[0]; DescriptorCreator creator = new DescriptorCreator(); - creator.run(); + creator.run(basePackage); } public static String getComponentName() { return new File(".").getAbsoluteFile().getParentFile().getName(); } - public void run() throws Exception { - run(DEFAULT_OUTPUT_ROOT); + public void 
run(String basePackage) throws Exception { + run(basePackage, DEFAULT_OUTPUT_ROOT); } - public void run(String outputRoot) throws Exception { - List> readers; - List> aes; - readers = findSubclasses(CollectionReader.class.getCanonicalName()); - aes = findSubclasses(AnalysisComponent.class.getCanonicalName()); + public void run(String basePackage, String outputRoot) throws Exception { + List> readers = findSubclasses(CollectionReader.class.getCanonicalName()); + List> aes = findSubclasses(AnalysisComponent.class.getCanonicalName()); + List> flowControllers = findSubclasses(FlowController.class.getCanonicalName()); - readers = readers.stream().filter(c -> c.getPackage().getName().contains("de.julielab.jcore.reader")) + // Now filter all found classes for being in the target package and adhering to the naming conventions. + readers = readers.stream().filter(c -> c.getPackage().getName().startsWith(basePackage) && (c.getPackage().getName().contains("reader") || c.getName().toLowerCase().contains("reader"))) .collect(toList()); - // Since consumers and also multipliers can be or are AnalysisComponents, were may list all component categories here. + // Since consumers and also multipliers can be or are AnalysisComponents, we may list all component categories here. // Also, remove abstract classes aes = aes.stream().filter(c -> !Modifier.isAbstract(c.getModifiers())). 
- filter(c -> c.getPackage().getName().contains("de.julielab.jcore.ae") - || c.getPackage().getName().contains("de.julielab.jcore.consumer") - || c.getPackage().getName().contains("de.julielab.jcore.multiplier") - || c.getPackage().getName().contains("de.julielab.jcore.reader")).collect(toList()); + filter(c -> c.getPackage().getName().startsWith(basePackage) && + (c.getPackage().getName().contains("ae") || c.getName().toLowerCase().contains("ae") || c.getName().toLowerCase().contains("annotator") + || c.getPackage().getName().contains("consumer") || c.getName().toLowerCase().contains("consumer") || c.getName().toLowerCase().contains("writer") + || c.getPackage().getName().contains("multiplier") || c.getName().toLowerCase().contains("multiplier")) + ).collect(toList()); + + flowControllers = flowControllers.stream().filter(c -> !Modifier.isAbstract((c.getModifiers()))). + filter(c -> c.getPackage().getName().startsWith(basePackage) && + (c.getPackage().getName().contains("flow") || c.getPackage().getName().toLowerCase().contains("flow"))) + .collect(toList()); + - if (readers.isEmpty() && aes.isEmpty()) { + if (readers.isEmpty() && aes.isEmpty() && flowControllers.isEmpty()) { log.warn("No JCoRe UIMA component classes were found."); } else { Stream typeDescNamesStream = Stream.of(TypeSystemDescriptionFactory.scanTypeDescriptors()). 
@@ -86,6 +97,10 @@ public void run(String outputRoot) throws Exception { AnalysisEngineDescription d = AnalysisEngineFactory.createEngineDescription(cls, tsd); writeComponentDescriptor(outputRoot, cls, d, "analysis engine / consumer"); } + for (Class cls : flowControllers) { + FlowControllerDescription d = FlowControllerFactory.createFlowControllerDescription(cls); + writeComponentDescriptor(outputRoot, cls, d, "flow controller"); + } } } diff --git a/jcore-descriptor-creator/src/test/java/de/julielab/jcore/misc/DescriptorCreatorTest.java b/jcore-descriptor-creator/src/test/java/de/julielab/jcore/misc/DescriptorCreatorTest.java index 534fadda9..31961e62d 100644 --- a/jcore-descriptor-creator/src/test/java/de/julielab/jcore/misc/DescriptorCreatorTest.java +++ b/jcore-descriptor-creator/src/test/java/de/julielab/jcore/misc/DescriptorCreatorTest.java @@ -1,35 +1,34 @@ package de.julielab.jcore.misc; -import static java.util.stream.Collectors.joining; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import de.julielab.java.utilities.IOStreamUtilities; +import org.apache.commons.io.FileUtils; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Path; import java.util.Arrays; -import java.util.Optional; import java.util.stream.Stream; -import de.julielab.java.utilities.IOStreamUtilities; -import org.apache.commons.io.FileUtils; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; +import static java.util.stream.Collectors.joining; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class DescriptorCreatorTest { - @BeforeClass - @AfterClass + @BeforeAll + @AfterAll public static void shutdown() throws IOException { - //FileUtils.deleteDirectory(new 
File(Arrays.asList("src", "test", "resources", "de").stream().collect(joining(File.separator)))); + FileUtils.deleteDirectory(new File(Arrays.asList("src", "test", "resources", "de").stream().collect(joining(File.separator)))); } @Test public void testRun() throws Exception { DescriptorCreator creator = new DescriptorCreator(); String outputRoot = "src" + File.separator + "test" + File.separator + "resources" + File.separator; - creator.run(outputRoot); + creator.run("de.julielab.jcore", outputRoot); File crDir = new File(outputRoot + Stream.of("de", "julielab", "jcore", "reader", "testreader", "desc").collect(joining(File.separator))); File aeDir = new File(outputRoot + Stream.of("de", "julielab", "jcore", "ae", "testae", "desc").collect(joining(File.separator))); File consumerDir = new File(outputRoot + Stream.of("de", "julielab", "jcore", "consumer", "testconsumer", "desc").collect(joining(File.separator))); diff --git a/jcore-descriptor-creator/src/test/java/de/julielab/jcore/reader/testreader/TestReader.java b/jcore-descriptor-creator/src/test/java/de/julielab/jcore/reader/testreader/TestReader.java index 937c00e4d..36e70a5cc 100644 --- a/jcore-descriptor-creator/src/test/java/de/julielab/jcore/reader/testreader/TestReader.java +++ b/jcore-descriptor-creator/src/test/java/de/julielab/jcore/reader/testreader/TestReader.java @@ -1,12 +1,12 @@ package de.julielab.jcore.reader.testreader; -import java.io.IOException; - import org.apache.uima.cas.CAS; import org.apache.uima.collection.CollectionException; import org.apache.uima.collection.CollectionReader_ImplBase; import org.apache.uima.util.Progress; +import java.io.IOException; + public class TestReader extends CollectionReader_ImplBase { @Override diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml deleted file mode 
100644 index 34208ad32..000000000 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml +++ /dev/null @@ -1,50 +0,0 @@ - - - org.apache.uima.java - true - de.julielab.jcore.ae.testae.TestAE - - de.julielab.jcore.ae.testae.TestAE - Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT - de.julielab.jcore.ae.testae - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - true - true - false - - - \ No newline at end of file diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml deleted file mode 100644 index 7d1d5a224..000000000 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml +++ /dev/null @@ -1,50 +0,0 @@ - - - org.apache.uima.java - true - de.julielab.jcore.consumer.testconsumer.Testconsumer - - de.julielab.jcore.consumer.testconsumer.Testconsumer - Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT - de.julielab.jcore.consumer.testconsumer - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - true - true - false - - - \ No newline at end of file diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml deleted file mode 100644 index 8167fbb68..000000000 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml +++ /dev/null @@ -1,50 
+0,0 @@ - - - org.apache.uima.java - true - de.julielab.jcore.multiplier.testmultiplier.TestMultiplier - - de.julielab.jcore.multiplier.testmultiplier.TestMultiplier - Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT - de.julielab.jcore.multiplier.testmultiplier - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - true - true - false - - - \ No newline at end of file diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml deleted file mode 100644 index 016fc36bf..000000000 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml +++ /dev/null @@ -1,49 +0,0 @@ - - - org.apache.uima.java - de.julielab.jcore.reader.testreader.TestReader - - de.julielab.jcore.reader.testreader.TestReader - Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT - de.julielab.jcore.reader.testreader - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - true - false - true - - - \ No newline at end of file diff --git a/jcore-dta-reader/component.meta b/jcore-dta-reader/component.meta index 44239af00..c097a90ef 100644 --- a/jcore-dta-reader/component.meta +++ b/jcore-dta-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-dta-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe DTA Reader" } diff --git a/jcore-dta-reader/pom.xml b/jcore-dta-reader/pom.xml index f05d13a93..706a0f2c5 100644 --- a/jcore-dta-reader/pom.xml +++ b/jcore-dta-reader/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -70,7 +70,11 @@ org.slf4j slf4j-api - junitjunit + + org.junit.jupiter + junit-jupiter-engine + + Reader for DTA files (German digital 
humanities corpus) http://www.julielab.de diff --git a/jcore-dta-reader/src/main/resources/de/julielab/jcore/reader/dta/desc/jcore-dta-reader.xml b/jcore-dta-reader/src/main/resources/de/julielab/jcore/reader/dta/desc/jcore-dta-reader.xml index 1e17bdb36..c6827e0d4 100644 --- a/jcore-dta-reader/src/main/resources/de/julielab/jcore/reader/dta/desc/jcore-dta-reader.xml +++ b/jcore-dta-reader/src/main/resources/de/julielab/jcore/reader/dta/desc/jcore-dta-reader.xml @@ -5,7 +5,7 @@ JCoRe DTA Reader - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/DTAFileReaderTest.java b/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/DTAFileReaderTest.java index eadb4101b..0e2b0f995 100644 --- a/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/DTAFileReaderTest.java +++ b/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/DTAFileReaderTest.java @@ -24,13 +24,13 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; import java.util.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class DTAFileReaderTest { diff --git a/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/util/DTAUtilsTest.java b/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/util/DTAUtilsTest.java index 3f9780106..b971e6cfc 100644 --- a/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/util/DTAUtilsTest.java +++ b/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/util/DTAUtilsTest.java @@ -14,14 +14,14 @@ import de.julielab.jcore.reader.dta.DTAFileReaderTest.Version; import de.julielab.jcore.types.extensions.dta.DTABelletristik; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; 
import java.util.Arrays; import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class DTAUtilsTest { diff --git a/jcore-ec-code-ae/component.meta b/jcore-ec-code-ae/component.meta index 995049c32..4bfd58959 100644 --- a/jcore-ec-code-ae/component.meta +++ b/jcore-ec-code-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-ecn-code-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Enzyme Commission Number AE" } diff --git a/jcore-ec-code-ae/pom.xml b/jcore-ec-code-ae/pom.xml index 14428b6cf..d234bff87 100644 --- a/jcore-ec-code-ae/pom.xml +++ b/jcore-ec-code-ae/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -28,8 +28,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-ec-code-ae/src/test/java/de/julielab/jcore/ae/ec/ECNumberAnnotatorTest.java b/jcore-ec-code-ae/src/test/java/de/julielab/jcore/ae/ec/ECNumberAnnotatorTest.java index 8408f9d56..3960c59a9 100644 --- a/jcore-ec-code-ae/src/test/java/de/julielab/jcore/ae/ec/ECNumberAnnotatorTest.java +++ b/jcore-ec-code-ae/src/test/java/de/julielab/jcore/ae/ec/ECNumberAnnotatorTest.java @@ -18,9 +18,9 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class ECNumberAnnotatorTest { @Test diff --git a/jcore-elasticsearch-consumer/README.md b/jcore-elasticsearch-consumer/README.md index a034187c7..c8e69c1da 100644 --- a/jcore-elasticsearch-consumer/README.md +++ b/jcore-elasticsearch-consumer/README.md @@ -1,9 +1,9 @@ # JCoRe ElasticSearchConsumer 
-**Descriptor Path**: +**Descriptor Paths**: ``` -.jcore-elasticsearch-consumer.src.main.resources.de.julielab.jcore.consumer.es.desc.jcore-elasticsearch-consumer -.jcore-elasticsearch-consumer.src.main.resources.de.julielab.jcore.consumer.es.desc.jcore-json-consumer +de.julielab.jcore.consumer.es.desc.jcore-elasticsearch-consumer +de.julielab.jcore.consumer.es.desc.jcore-json-consumer ``` ### Objective diff --git a/jcore-elasticsearch-consumer/component.meta b/jcore-elasticsearch-consumer/component.meta index 584bbdc82..366bc2715 100644 --- a/jcore-elasticsearch-consumer/component.meta +++ b/jcore-elasticsearch-consumer/component.meta @@ -18,7 +18,7 @@ "maven-artifact": { "artifactId": "jcore-elasticsearch-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe ElasticSearch Consumer" } diff --git a/jcore-elasticsearch-consumer/pom.xml b/jcore-elasticsearch-consumer/pom.xml index d7ad38562..c54f0c3a5 100644 --- a/jcore-elasticsearch-consumer/pom.xml +++ b/jcore-elasticsearch-consumer/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-elasticsearch-consumer JCoRe ElasticSearch Consumer @@ -32,7 +32,7 @@ org.apache.lucene lucene-analyzers-common - 6.4.2 + 8.9.0 org.assertj @@ -68,25 +68,38 @@ icu4j 55.1 + + org.apache.commons + commons-lang3 + org.mapdb mapdb 3.0.7 - provided org.testng testng + + ch.qos.logback + logback-classic + test + + + org.junit.jupiter + junit-jupiter + org.testcontainers testcontainers - 1.12.0 + 1.15.3 test - ch.qos.logback - logback-classic + org.testcontainers + junit-jupiter + 1.15.3 test diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractCasToJsonConsumer.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractCasToJsonConsumer.java index 7cd73ef1b..ecbde6219 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractCasToJsonConsumer.java +++ 
b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractCasToJsonConsumer.java @@ -124,25 +124,27 @@ protected Document convertCasToDocument(JCas aJCas) throws AnalysisEngineProcess if (doc.isEmpty()) log.debug("Document for document with ID {} does not contain any non-empty fields.", JCoReTools.getDocId(aJCas)); - String docId = JCoReTools.getDocId(aJCas); - if (null != idField) { - IFieldValue idFieldValue = doc.get(idField); - if (idFieldValue instanceof RawToken) { - docId = String.valueOf(((RawToken) idFieldValue).token); - } else if (idFieldValue instanceof PreanalyzedFieldValue) { - PreanalyzedFieldValue preAnalyzedIdValue = (PreanalyzedFieldValue) idFieldValue; - docId = preAnalyzedIdValue.fieldString; - } else - throw new IllegalArgumentException("Class " + idFieldValue.getClass() + " for value of field " - + idField + " is not supported as ID field value"); + if (doc.getId() == null || doc.getId().isBlank()) { + String docId = JCoReTools.getDocId(aJCas); + if (null != idField) { + IFieldValue idFieldValue = doc.get(idField); + if (idFieldValue instanceof RawToken) { + docId = String.valueOf(((RawToken) idFieldValue).token); + } else if (idFieldValue instanceof PreanalyzedFieldValue) { + PreanalyzedFieldValue preAnalyzedIdValue = (PreanalyzedFieldValue) idFieldValue; + docId = preAnalyzedIdValue.fieldString; + } else + throw new IllegalArgumentException("Class " + idFieldValue.getClass() + " for value of field " + + idField + " is not supported as ID field value"); + } + if (null != idPrefix) + docId = idPrefix + docId; + if (docId == null) + throw new AnalysisEngineProcessException(new IllegalStateException( + "Could neither get a document ID from the generated document nor from the CAS directly. 
The generated document is: " + + gson.toJson(doc))); + doc.setId(docId); } - if (null != idPrefix) - docId = idPrefix + docId; - if (docId == null) - throw new AnalysisEngineProcessException(new IllegalStateException( - "Could neither get a document ID from the generated document nor from the CAS directly. The generated document is: " - + gson.toJson(doc))); - doc.setId(docId); return doc; } catch (Exception e) { log.error("Error with document ID {}.", JCoReTools.getDocId(aJCas)); diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java index 54e1c91d7..44dd6d012 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java @@ -499,8 +499,8 @@ else if (fieldValues.size() == 1) * featurePaths and reset once per featurePath. * * @param a - * @param featurePath - * @param f + * @param featurePaths + * @param filters * @return * @throws CASException */ @@ -730,10 +730,12 @@ public ArrayFieldValue createRawFieldValueForAnnotations(FeatureStructure[] a, S /** * Applies the - * {@link #createRawFieldValueForAnnotation(FeatureStructure, String[], Filter[]) + * {@link #createRawFieldValueForAnnotation(FeatureStructure, String[], Filter[])} * method to all feature structures in fss. Thus, the feature paths and * filters are expected to be parallel: Each feature path has its own - * filter. If the filters array is shorter than the feature paths array, the + * filter. But: The feature paths and filters are applied to all feature structures. + * See {@link #createRawFieldValueForParallelAnnotations(FeatureStructure[], String[], Filter[], Filter)} to apply the ith feature path to the ith feature structure. 
+ * If the filters array is shorter than the feature paths array, the * missing filters will be treated as if they were null. Finally, after all * values have been created in this way, if the overallFilter is not * null, it will be applied to all resulting values. It will be reset once @@ -772,6 +774,46 @@ public ArrayFieldValue createRawFieldValueForAnnotations(FeatureStructure[] fss, return arrayFieldValue; } + /** + * Calls {@link #createRawFieldValueForAnnotation(FeatureStructure, String, Filter)} for all tuples + *

+	 * (fss[i], featurePaths[i], filters[i]), i in {0,..,fss.length-1}
+	 * 
, thus handling feature structures, feature paths and filters separately for each index. fss and + * featurePaths must be non-null and of equal length. filters may be null or shorter. The + * overallFilter will be applied to all values resulting from the previous process. + * @param fss + * @param featurePaths + * @param filters + * @param overallFilter + * @return + * @throws CASException + */ + public ArrayFieldValue createRawFieldValueForParallelAnnotations(FeatureStructure[] fss, String[] featurePaths, + Filter[] filters, Filter overallFilter) throws CASException { + ArrayFieldValue arrayFieldValue = new ArrayFieldValue(); + for (int i = 0; i < fss.length; i++) { + FeatureStructure annotation = fss[i]; + IFieldValue fieldValueForAnnotation = createRawFieldValueForAnnotation(annotation, featurePaths[i], + filters != null && i < filters.length ? filters[i] : null); + arrayFieldValue.addFlattened(fieldValueForAnnotation); + } + if (null != overallFilter) { + overallFilter.reset(); + ArrayFieldValue filteredArrayFieldValue = new ArrayFieldValue(); + for (IFieldValue fieldValue : arrayFieldValue) { + RawToken token = (RawToken) fieldValue; + String tokenString = String.valueOf(token.token); + List filteredTokens = overallFilter.filter(tokenString); + if (!filteredTokens.isEmpty()) { + for (String filteredToken : filteredTokens) + filteredArrayFieldValue.add(new RawToken(filteredToken)); + } + } + arrayFieldValue = filteredArrayFieldValue; + } + return arrayFieldValue; + } + /** * Creates a single array of all field values derived by the given feature paths * and filters. 
The filters array is taken to be parallel to diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/ElasticSearchConsumer.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/ElasticSearchConsumer.java index e92be2e6a..94d8ba622 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/ElasticSearchConsumer.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/ElasticSearchConsumer.java @@ -35,18 +35,26 @@ public class ElasticSearchConsumer extends AbstractCasToJsonConsumer { */ public static final String PARAM_TYPE = "type"; public static final String PARAM_BATCH_SIZE = "batchSize"; + public static final String PARAM_DELETE_DOCS_BEFORE_INDEXING = "deleteDocumentsBeforeIndexing"; + public static final String PARAM_DOC_ID_FIELD = "documentIdField"; final Logger log = LoggerFactory.getLogger(ElasticSearchConsumer.class); @ConfigurationParameter(name = PARAM_URLS, description = "A list of URLs pointing to different nodes of the ElasticSearch cluster, e.g. http://localhost:9300/. Documents will be sent bulk-wise to the nodes in a round-robin fashion.") private String[] urls; @ConfigurationParameter(name = PARAM_INDEX_NAME, description = "The ElasticSearch index name to send the created documents to.") private String indexName; - @ConfigurationParameter(name = PARAM_TYPE, mandatory = false, description = "The index type the generated documents should have. The types are removed from ElasticSearch with version 7 and should omitted for ES >= 7.") + @ConfigurationParameter(name = PARAM_TYPE, mandatory = false, description = "The index type the generated documents should have. The types are removed from ElasticSearch with version 7 and should be omitted for ES >= 7.") private String type; @ConfigurationParameter(name = PARAM_BATCH_SIZE, mandatory = false, description = "The number of documents to be sent to ElasticSearch in a single batch. 
Defaults to 50.") private int batchSize; + @ConfigurationParameter(name = PARAM_DELETE_DOCS_BEFORE_INDEXING, mandatory = false, description = "Whether or not to delete documents with the docId of the UIMA CASes in ElasticSearch prior to indexing. This is useful when parts of the document are indexed whose IDs are not stable or that might change after document updates and would not just be overwritten when indexing anew. Defaults to false.") + private boolean deleteDocsBeforeIndexing; + @ConfigurationParameter(name = PARAM_DOC_ID_FIELD, mandatory = false, description = "Required when " + PARAM_DELETE_DOCS_BEFORE_INDEXING + " is set to true. This should be an existing index field that contains the document ID of each CAS. It is used to remove existing index documents related to the CAS document ID prior to indexing.") + private String docIdField; private List bulkCommand; + private List docIdsToDelete; private HttpPost[] indexPosts; + private HttpPost[] indexDeletes; private int urlIndex = 0; @@ -62,6 +70,11 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept type = (String) getContext().getConfigParameterValue(PARAM_TYPE); batchSize = Optional.ofNullable((Integer) getContext().getConfigParameterValue(PARAM_BATCH_SIZE)).orElse(50); bulkCommand = new ArrayList<>(4000); + deleteDocsBeforeIndexing = (boolean) Optional.ofNullable(getContext().getConfigParameterValue(PARAM_DELETE_DOCS_BEFORE_INDEXING)).orElse(false); + docIdField = (String) getContext().getConfigParameterValue(PARAM_DOC_ID_FIELD); + + if (deleteDocsBeforeIndexing && docIdField == null) + throw new ResourceInitializationException(new IllegalArgumentException(PARAM_DELETE_DOCS_BEFORE_INDEXING + " is true but no " + PARAM_DOC_ID_FIELD + " was specified.")); httpclient = HttpClientBuilder.create().build(); if (urls != null) { @@ -75,10 +88,26 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept } } + if (deleteDocsBeforeIndexing) { + indexDeletes 
= new HttpPost[urls.length]; + for (int i = 0; i < urls.length; i++) { + String url = urls[i]; + if (null != url && url.endsWith("/_bulk")) + url = url.replace("/_bulk/?", ""); + url += "/" + indexName + "/" + "_delete_by_query"; + indexDeletes[i] = new HttpPost(url); + indexDeletes[i].addHeader("Content-Type", "application/x-ndjson"); + + } + docIdsToDelete = new ArrayList<>(); + } + if (log.isInfoEnabled()) { log.info("{}: {}", PARAM_URLS, Arrays.toString(urls)); log.info("{}: {}", PARAM_INDEX_NAME, indexName); log.info("{}: {}", PARAM_TYPE, type); + log.info("{}: {}", PARAM_DELETE_DOCS_BEFORE_INDEXING, deleteDocsBeforeIndexing); + log.info("{}: {}", PARAM_DOC_ID_FIELD, docIdField); } } @@ -89,6 +118,10 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { w.start(); Gson gson = new Gson(); + if (deleteDocsBeforeIndexing) { + docIdsToDelete.add(JCoReTools.getDocId(aJCas)); + } + // This is the default case: For each CAS, create one document. This // document is populated with fields by field generators. The field // generator classes are delivered by the user. @@ -157,6 +190,7 @@ public void customBatchProcessComplete() throws AnalysisEngineProcessException { super.batchProcessComplete(); log.debug("Batch of {} documents is sent to ElasticSearch.", docNum); docNum = 0; + deleteDocuments(); postBulkIndexAction(); } @@ -164,9 +198,59 @@ public void customBatchProcessComplete() throws AnalysisEngineProcessException { public void collectionProcessComplete() throws AnalysisEngineProcessException { super.collectionProcessComplete(); log.info("Collection complete."); + deleteDocuments(); postBulkIndexAction(); } + private void deleteDocuments() throws AnalysisEngineProcessException { + if (deleteDocsBeforeIndexing) { + // Post to all the ElasticSearch nodes in a round-robin fashion. 
+ HttpPost indexDelete = indexDeletes[urlIndex]; + urlIndex = (urlIndex + 1) % indexDeletes.length; + try { + int lastIndex = 0; + List subList; + do { + subList = docIdsToDelete.subList(lastIndex, Math.min(docIdsToDelete.size(), lastIndex + 1000)); + if (subList.isEmpty()) + continue; + lastIndex += subList.size(); + log.debug("Delete {} documents in index {}.", subList.size(), indexName); + long time = System.currentTimeMillis(); + StringBuilder deleteQuery = new StringBuilder(); + deleteQuery.append("{\"query\":{\"terms\":{\"").append(docIdField).append("\":["); + for (int i = 0; i < subList.size(); i++) { + String docId = subList.get(i); + deleteQuery.append("\"").append(docId).append("\""); + if (i < subList.size() - 1) + deleteQuery.append(","); + } + deleteQuery.append("]}}}"); + StringEntity deleteByQueryEntity = new StringEntity(deleteQuery.toString(), "UTF-8"); + indexDelete.setEntity(deleteByQueryEntity); + HttpResponse response = httpclient.execute(indexDelete); + int statusCode = response.getStatusLine().getStatusCode(); + HttpEntity responseEntity = response.getEntity(); + if (statusCode > 200) { + log.error("The server responded with a non-OK status code: {}", statusCode); + log.error("Response status line: {}", response.getStatusLine()); + log.error("Response body: {}", EntityUtils.toString(responseEntity)); + log.error("Delete-by-query command was: {}", deleteQuery); + } + EntityUtils.consume(responseEntity); + time = System.currentTimeMillis() - time; + log.debug("Sending took {}ms ({}s) and returned status code {}", time, time / 1000, statusCode); + } while (null != subList && !subList.isEmpty()); + } catch (IOException e) { + log.error("Error when sending data to ElasticSearch:", e); + throw new AnalysisEngineProcessException(e); + } finally { + indexDelete.reset(); + docIdsToDelete.clear(); + } + } + } + private void postBulkIndexAction() throws AnalysisEngineProcessException { if (bulkCommand.isEmpty()) return; @@ -175,13 +259,13 @@ private 
void postBulkIndexAction() throws AnalysisEngineProcessException { urlIndex = (urlIndex + 1) % indexPosts.length; try { int lastIndex = 0; - List subList = null; + List subList; do { subList = bulkCommand.subList(lastIndex, Math.min(bulkCommand.size(), lastIndex + 1000)); if (subList.isEmpty()) continue; lastIndex += subList.size(); - log.debug("Sending {} documents to index {}.", subList.size() / 2, indexName); + log.debug("Sending {} documents to index {}.", subList.size(), indexName); long time = System.currentTimeMillis(); // The bulk format requires us to have a newline also after the // last diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/AddonTermsFilter.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/AddonTermsFilter.java index b37e52348..9114109be 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/AddonTermsFilter.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/AddonTermsFilter.java @@ -6,33 +6,36 @@ public class AddonTermsFilter extends AbstractFilter { - private Map addonTerms; + private Map addonTerms; - public AddonTermsFilter(Map addonTerms) { - this.addonTerms = addonTerms; - } + public AddonTermsFilter(Map addonTerms) { + this.addonTerms = addonTerms; + } - @Override - public List filter(String input) { - newOutput(); - if (null != input) { - output.add(input); - String[] hypernymArray = addonTerms.get(input); - if (null != hypernymArray) { - output = new ArrayList<>(hypernymArray.length + 1); - output.add(input); - for (int i = 0; i < hypernymArray.length; i++) { - String hypernym = hypernymArray[i]; - output.add(hypernym); - } - } - } - return output; - } + @Override + public List filter(String input) { + newOutput(); + if (null != input) { + output.add(input); + String[] addonArray = addonTerms.get(input); + if (null != addonArray) { + // Only create a new output array when the 
default ArrayList size can't hold all the elements + if (addonArray.length >= 10) { + output = new ArrayList<>(addonArray.length + 1); + output.add(input); + } + for (int i = 0; i < addonArray.length; i++) { + String addonTerm = addonArray[i]; + output.add(addonTerm); + } + } + } + return output; + } - @Override - public Filter copy() { - return new AddonTermsFilter(addonTerms); - } + @Override + public Filter copy() { + return new AddonTermsFilter(addonTerms); + } } diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/RegExReplaceFilter.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/RegExReplaceFilter.java new file mode 100644 index 000000000..5eb554b76 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/RegExReplaceFilter.java @@ -0,0 +1,36 @@ +package de.julielab.jcore.consumer.es.filter; + +import java.util.List; + +/** + * Replaces portions of terms according to the given regular expression and replacement string. + * @author faessler + * + */ +public class RegExReplaceFilter extends AbstractFilter { + + private String regex; + private String replacement; + private boolean replaceAll; + + public RegExReplaceFilter(String regex, String replacement, boolean replaceAll) { + this.regex = regex; + this.replacement = replacement; + this.replaceAll = replaceAll; + } + + @Override + public List filter(String input) { + newOutput(); + if (input != null) { + output.add(replaceAll ? 
input.replaceAll(regex, replacement) : input.replaceFirst(regex, replacement)); + } + return output; + } + + @Override + public Filter copy() { + return new RegExReplaceFilter(regex, replacement, replaceAll); + } + +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/SingleAddonTermsFilter.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/SingleAddonTermsFilter.java new file mode 100644 index 000000000..1e83f2b9f --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/SingleAddonTermsFilter.java @@ -0,0 +1,35 @@ +package de.julielab.jcore.consumer.es.filter; + +import java.util.List; +import java.util.Map; + +/** + *

Like {@link AddonTermsFilter} but accepts single string values instead of string arrays.

+ */ +public class SingleAddonTermsFilter extends AbstractFilter { + + private Map addonTerms; + + public SingleAddonTermsFilter(Map addonTerms) { + this.addonTerms = addonTerms; + } + + @Override + public List filter(String input) { + newOutput(); + if (null != input) { + output.add(input); + String addonTerm = addonTerms.get(input); + if (null != addonTerm) { + output.add(addonTerm); + } + } + return output; + } + + @Override + public Filter copy() { + return new SingleAddonTermsFilter(addonTerms); + } + +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/preanalyzed/RawToken.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/preanalyzed/RawToken.java index 4c73f5b9a..4c4b08e58 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/preanalyzed/RawToken.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/preanalyzed/RawToken.java @@ -64,6 +64,8 @@ public void write(JsonWriter out, RawToken token) throws IOException { out.value((String) tokenValue); else if (tokenValue instanceof Number) out.value((Number) tokenValue); + else if (tokenValue instanceof Boolean) + out.value((boolean) tokenValue); else if (tokenValue == null) out.nullValue(); else diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java index 6491627cf..a02b81797 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java @@ -4,7 +4,6 @@ import org.apache.uima.resource.DataResource; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; -import org.slf4j.LoggerFactory; 
import java.io.BufferedReader; import java.io.IOException; @@ -12,11 +11,25 @@ import java.util.HashMap; import java.util.Map; - +/** + *

Base class for resources that map one term to another. Uses a HashMap. The trivial instantiable subclass is {@link MapProvider}.

+ *

This class is abstract because it is generic. To work with other data types than strings, the {@link #getKey(String)} and {@link #getValue(String)} + * methods are overridden by subclasses to deliver the correct data types from the string input.

+ *

Subclasses deal with maps where the keys and/or values are not strings but numbers. Other subclasses deal with + * String but use a persistent data structure to deal with very large maps.

+ * + * @param + * @param + */ public abstract class AbstractMapProvider implements IMapProvider { - private final static Logger log = LoggerFactory.getLogger(AbstractMapProvider.class); + protected final Logger log; protected boolean reverse = false; - private HashMap map; + protected Map map; + + public AbstractMapProvider(Logger log) { + this.log = log; + map = new HashMap<>(); + } @Override public void load(DataResource aData) throws ResourceInitializationException { @@ -29,12 +42,14 @@ public void load(DataResource aData) throws ResourceInitializationException { throw new IOException("Resource " + aData.getUri() + " not found"); } br = new BufferedReader(is); - map = new HashMap<>(); +// map = new HashMap<>(); String line; String splitExpression = "\t"; + int numEntries = 0; while ((line = br.readLine()) != null) { if (line.trim().length() == 0 || line.startsWith("#")) continue; + ++numEntries; String[] split = line.split(splitExpression); if (split.length != 2) { splitExpression = "\\s+"; @@ -44,16 +59,11 @@ public void load(DataResource aData) throws ResourceInitializationException { throw new IllegalArgumentException("Format error in map file: Expected format is 'originalValuemappedValue' but the input line '" + line + "' has " + split.length + " columns."); if (reverse) - map.put(getKey(split[1]), getValue(split[0])); + put(getKey(split[1]), getValue(split[0])); else - map.put(getKey(split[0]), getValue(split[1])); + put(getKey(split[0]), getValue(split[1])); } - log.info("Finished reading resource {}", aData.getUri()); - log.info("Copying {} values into a fresh HashMap of the exactly correct size", map.size()); - HashMap tmp = new HashMap<>(map.size(), 1f); - tmp.putAll(map); - map = tmp; - log.info("Done."); + log.info("Finished reading resource {} and got {} entries.", aData.getUri(), numEntries); } catch (IOException e) { throw new ResourceInitializationException(e); } finally { @@ -66,6 +76,8 @@ public void load(DataResource aData) throws 
ResourceInitializationException { } } + protected abstract void put(K key, V value); + protected abstract V getValue(String valueString); protected abstract K getKey(String keyString); diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AddonTermsProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AddonTermsProvider.java index 7b4adb2d0..5118d8be4 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AddonTermsProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AddonTermsProvider.java @@ -6,24 +6,37 @@ import org.apache.uima.resource.DataResource; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; +import java.net.URI; import java.util.HashMap; import java.util.List; import java.util.Map; +/** + *

Base class for addon terms (i.e. terms to be added to some key term, like synonyms or hypernyms) that uses a HashMap.

+ *

Subclasses of this class use other data structures to store and retrieve the addon terms. Useful for large numbers of such terms.

+ */ public class AddonTermsProvider implements IAddonTermsProvider { - Logger log = LoggerFactory.getLogger(AddonTermsProvider.class); + protected final Logger log; - private Map addonTerms; + protected Map addonTerms; + + public AddonTermsProvider(Logger log) { + this.log = log; + addonTerms = new HashMap<>(); + } + + protected void put(String term, String[] addonArray) { + addonTerms.put(term, addonArray); + } @Override public void load(DataResource aData) throws ResourceInitializationException { try { - addonTerms = new HashMap<>(); - log.info("Loading addon terms from " + aData.getUri()); + URI uri = aData.getUri(); + log.info("Loading addon terms from " + uri); int addons = 0; InputStream inputStream; try { @@ -56,7 +69,7 @@ public void load(DataResource aData) throws ResourceInitializationException { addonArray[i] = trimmedAddon.intern(); addons++; } - addonTerms.put(term, addonArray); + put(term, addonArray); } log.info("Loaded {} addons for {} terms.", addons, addonTerms.size()); } catch (IOException e) { diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java new file mode 100644 index 000000000..0a889a6b9 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java @@ -0,0 +1,166 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.NIOFSDirectory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; 
+import java.util.Arrays; + +public class LuceneIndex implements StringIndex { + private final static Logger log = LoggerFactory.getLogger(LuceneIndex.class); + private IndexWriter iw; + private final FSDirectory directory; + private IndexSearcher searcher; + + public LuceneIndex(String indexDirectory) { + try { + Path lucene = Path.of(indexDirectory); + File directoryFile = lucene.toFile(); + boolean indexExists = directoryFile.exists() && directoryFile.isDirectory() && directoryFile.list().length != 0; + directory = NIOFSDirectory.open(lucene); + // Do not open a writer to an existing index. This causes locking issues when starting multiple + // pipelines in parallel. + // Of course, the first pipeline still needs to create the index, so this must be a one-time effort + // that has to be completed before the other pipelines are started. + if (!indexExists) { + log.debug("Creating index writer for index directory {}.", indexDirectory); + IndexWriterConfig iwc = new IndexWriterConfig(); + iw = new IndexWriter(directory, iwc); + } else { + log.debug("Index directory {} already exists.", indexDirectory); + } + } catch (IOException e) { + log.error("could not initialize Lucene index", e); + throw new IllegalStateException(e); + } + } + + @Override + public String get(String key) { + TermQuery tq = new TermQuery(new Term("key", key)); + BooleanQuery.Builder b = new BooleanQuery.Builder(); + b.add(tq, BooleanClause.Occur.FILTER); + BooleanQuery q = b.build(); + try { + TopDocs topDocs = searcher.search(q, 1); + if (topDocs.scoreDocs.length > 0) { + Document doc = searcher.getIndexReader().document(topDocs.scoreDocs[0].doc); + return doc.getField("value").stringValue(); + } + } catch (IOException e) { + log.error("Could not retrieve results for '{}' in Lucene index.", key, e); + throw new IllegalStateException(e); + } + return null; + } + + @Override + public String[] getArray(String key) { + TermQuery tq = new TermQuery(new Term("key", key)); + BooleanQuery.Builder b = 
new BooleanQuery.Builder(); + b.add(tq, BooleanClause.Occur.FILTER); + BooleanQuery q = b.build(); + try { + TopDocs topDocs = searcher.search(q, 1); + if (topDocs.scoreDocs.length > 0) { + Document doc = searcher.getIndexReader().document(topDocs.scoreDocs[0].doc); + return Arrays.stream(doc.getFields("value")).map(IndexableField::stringValue).toArray(String[]::new); + } + } catch (IOException e) { + log.error("Could not retrieve results for '{}' in Lucene index.", key, e); + throw new IllegalStateException(e); + } + return null; + } + + @Override + public void put(String key, String value) { + Field keyField = new StringField("key", key, Field.Store.NO); + Field valueField = new StoredField("value", value); + Document doc = new Document(); + doc.add(keyField); + doc.add(valueField); + try { + iw.addDocument(doc); + } catch (IOException e) { + log.error("Could not index key-value pair {}:{} with Lucene", key, value, e); + throw new IllegalStateException(e); + } + } + + @Override + public void put(String key, String[] value) { + Field keyField = new StringField("key", key, Field.Store.NO); + Document doc = new Document(); + doc.add(keyField); + for (var v : value) + doc.add(new StoredField("value", v)); + try { + iw.addDocument(doc); + } catch (IOException e) { + log.error("Could not index key-value pair {}:{} with Lucene", key, value, e); + throw new IllegalStateException(e); + } + } + + @Override + public void commit() { + try { + iw.commit(); + } catch (IOException e) { + log.error("Could not commit Lucene index", e); + throw new IllegalStateException(e); + } + } + + @Override + public boolean requiresExplicitCommit() { + return true; + } + + @Override + public void close() { + try { + if (searcher != null) { + searcher.getIndexReader().close(); + searcher = null; + } + if (iw != null) { + iw.close(); + iw = null; + } + } catch (IOException e) { + log.error("Could not close Lucene index reader.", e); + throw new IllegalStateException(e); + } + } + + @Override + 
public void open() { + try { + searcher = new IndexSearcher(DirectoryReader.open(directory)); + } catch (IOException e) { + log.error("Could not open Lucene index searcher.", e); + throw new IllegalStateException(e); + } + } + + @Override + public int size() { + if (iw != null && iw.isOpen()) + return iw.getDocStats().numDocs; + else if (searcher != null) + return searcher.getIndexReader().numDocs(); + return 0; + } +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapDBReversedDoubleMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapDBReversedDoubleMapProvider.java deleted file mode 100644 index a12a082a5..000000000 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapDBReversedDoubleMapProvider.java +++ /dev/null @@ -1,70 +0,0 @@ -package de.julielab.jcore.consumer.es.sharedresources; - -import de.julielab.jcore.utility.JCoReTools; -import org.apache.uima.resource.DataResource; -import org.apache.uima.resource.ResourceInitializationException; -import org.mapdb.DB; -import org.mapdb.DBMaker; -import org.mapdb.Serializer; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.Map; - -public class MapDBReversedDoubleMapProvider implements IMapProvider { - - private Map map; - - @Override - public void load(DataResource aData) throws ResourceInitializationException { - BufferedReader br = null; - try { - final DB filedb = DBMaker.tempFileDB().fileMmapEnableIfSupported().cleanerHackEnable().closeOnJvmShutdownWeakReference().make(); - map = filedb.hashMap("JCoReElasticSearchReverseMapProvider"). - keySerializer(Serializer.STRING).valueSerializer(Serializer.DOUBLE). 
- create(); - InputStreamReader is; - try { - is = new InputStreamReader(JCoReTools.resolveExternalResourceGzipInputStream(aData)); - } catch (Exception e) { - throw new IOException("Resource " + aData.getUri() + " not found"); - } - br = new BufferedReader(is); - String line; - String splitExpression = "\t"; - while ((line = br.readLine()) != null) { - if (line.trim().length() == 0 || line.startsWith("#")) - continue; - String[] split = line.split(splitExpression); - if (split.length != 2) { - splitExpression = "\\s+"; - split = line.split(splitExpression); - } - if (split.length != 2) - throw new IllegalArgumentException("Format error in map file: Expected format is 'originalValuemappedValue' but the input line '" + line - + "' has " + split.length + " columns."); - map.put(split[1].trim(), Double.parseDouble(split[0].trim())); - } - } catch (IOException e) { - throw new ResourceInitializationException(e); - } finally { - try { - if (null != br) - br.close(); - } catch (IOException e) { - throw new ResourceInitializationException(e); - } - } - - } - - /** - * Returns the loaded map. All strings - keys and values - are internalized. 
- */ - @Override - public Map getMap() { - return map; - } - -} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapProvider.java index ebd90f8ed..0b8393ed7 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapProvider.java @@ -1,6 +1,20 @@ package de.julielab.jcore.consumer.es.sharedresources; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public class MapProvider extends AbstractMapProvider { + private final static Logger log = LoggerFactory.getLogger(MapProvider.class); + + public MapProvider() { + super(log); + } + + @Override + protected void put(String key, String value) { + map.put(key, value); + } + @Override protected String getValue(String valueString) { return valueString.trim().intern(); diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java new file mode 100644 index 000000000..18d45b5b0 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java @@ -0,0 +1,173 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.lang3.NotImplementedException; +import org.apache.uima.resource.DataResource; +import org.apache.uima.resource.ResourceInitializationException; +import 
org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URI; +import java.time.Duration; +import java.util.Collection; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ExecutionException; + +/** + * Reads the original input file and converts it into a persistent index. This index is re-used in subsequent pipeline runs. + */ +abstract public class PersistentIndexAddonTermsProvider extends AddonTermsProvider { + public static final int MAXIMUM_MEMCACHE_SIZE = 10000; + private final LoadingCache> cache; + private StringIndex index; + + public PersistentIndexAddonTermsProvider(Logger log) { + super(log); + addonTerms = new Map<>() { + @Override + public int size() { + return index.size(); + } + + @Override + public boolean isEmpty() { + throw new NotImplementedException(); + } + + @Override + public boolean containsKey(Object key) { + throw new NotImplementedException(); + } + + @Override + public boolean containsValue(Object value) { + throw new NotImplementedException(); + } + + @Override + public String[] get(Object key) { + try { + return cache.get((String) key).orElse(null); + } catch (ExecutionException e) { + log.error("Could not retrieve value from the cache for key '{}'.", key); + throw new IllegalStateException(); + } + } + + @Nullable + @Override + public String[] put(String key, String[] value) { + throw new NotImplementedException(); + } + + @Override + public String[] remove(Object key) { + throw new NotImplementedException(); + } + + @Override + public void putAll(@NotNull Map m) { + throw new NotImplementedException(); + } + + @Override + public void clear() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Set keySet() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Collection 
values() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Set> entrySet() { + throw new NotImplementedException(); + } + }; + cache = CacheBuilder.newBuilder().maximumSize(MAXIMUM_MEMCACHE_SIZE).expireAfterAccess(Duration.ofHours(1)).build(new CacheLoader<>() { + @Override + public Optional load(String s) { + return Optional.ofNullable(index.getArray(s)); + } + }); + } + + protected abstract StringIndex initializeIndex(String cachePath); + + @Override + public void load(DataResource aData) throws ResourceInitializationException { + // prepare the persistent index + URI uri = aData.getUri(); + File indexFile = null; + boolean loadData = true; + try { + File resourceFile; + try { + resourceFile = new File(uri); + } catch (IllegalArgumentException e) { + // to support relative file paths like file:resources/somefile.txt + resourceFile = new File(uri.getSchemeSpecificPart()); + } + String resourceFileName = FilenameUtils.getName(uri.toURL().getPath()); + indexFile = new File("es-consumer-cache", resourceFileName); + if (resourceFile.exists() && indexFile.exists() && resourceFile.lastModified() > indexFile.lastModified()) { + log.info("Resource file {} is newer than the existing cached index at {}. 
Creating new index.", resourceFile, indexFile); + if (indexFile.isDirectory()) { + log.info("Deleting index directory {}", indexFile); + FileUtils.deleteDirectory(indexFile); + } else { + log.info("Deleting index file {}", indexFile); + indexFile.delete(); + } + } else { + boolean indexFileExisted = indexFile.exists(); + if (!indexFileExisted) { + log.info("Creating persistent cache for resource {} at {}.", uri, indexFile); + } + else { + log.info("Using existing persistent cache {} for resource {}.", indexFile, uri); + loadData = false; + } + } + index = initializeIndex(indexFile.getAbsolutePath()); + } catch (MalformedURLException e) { + log.error("Could obtain file name from resource URI '{}'", uri, e); + throw new IllegalStateException(e); + } catch (IOException e) { + log.error("Could not delete index file {}", indexFile, e); + throw new ResourceInitializationException(e); + } + if (loadData) { + super.load(aData); + if (index.requiresExplicitCommit()) + index.commit(); + } + index.close(); + index.open(); + log.info("There are {} entries in the cache at {}.", index.size(), indexFile); + } + + @Override + protected void put(String term, String[] addonArray) { + index.put(term, addonArray); + } +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneIndexAddonTermsProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneIndexAddonTermsProvider.java new file mode 100644 index 000000000..40ac75e83 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneIndexAddonTermsProvider.java @@ -0,0 +1,17 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class PersistentLuceneIndexAddonTermsProvider extends PersistentIndexAddonTermsProvider{ + private final static Logger log = 
LoggerFactory.getLogger(PersistentLuceneIndexAddonTermsProvider.class); + + public PersistentLuceneIndexAddonTermsProvider() { + super(log); + } + + @Override + protected StringIndex initializeIndex(String cachePath) { + return new LuceneIndex(cachePath); + } +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneStringMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneStringMapProvider.java new file mode 100644 index 000000000..c49ed7350 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneStringMapProvider.java @@ -0,0 +1,17 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class PersistentLuceneStringMapProvider extends PersistentStringIndexMapProvider { + private final static Logger log = LoggerFactory.getLogger(PersistentLuceneStringMapProvider.class); + + public PersistentLuceneStringMapProvider() { + super(log); + } + + @Override + protected StringIndex initializeIndex(String cachePath) { + return new LuceneIndex(cachePath); + } +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java new file mode 100644 index 000000000..39994dc9c --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java @@ -0,0 +1,179 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import org.apache.commons.io.FileUtils; +import 
org.apache.commons.io.FilenameUtils; +import org.apache.commons.lang3.NotImplementedException; +import org.apache.uima.resource.DataResource; +import org.apache.uima.resource.ResourceInitializationException; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URI; +import java.time.Duration; +import java.util.Collection; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ExecutionException; + +abstract public class PersistentStringIndexMapProvider extends AbstractMapProvider { + public static final int MAXIMUM_MEMCACHE_SIZE = 10000; + private final LoadingCache> cache; + private StringIndex index; + + public PersistentStringIndexMapProvider(Logger log) { + super(log); + map = new Map<>() { + @Override + public int size() { + return index.size(); + } + + @Override + public boolean isEmpty() { + throw new NotImplementedException(); + } + + @Override + public boolean containsKey(Object key) { + throw new NotImplementedException(); + } + + @Override + public boolean containsValue(Object value) { + throw new NotImplementedException(); + } + + @Override + public String get(Object key) { + try { + return cache.get((String) key).orElse(null); + } catch (ExecutionException e) { + log.error("Could not retrieve value from the cache for key '{}'.", key); + throw new IllegalStateException(); + } + } + + @Nullable + @Override + public String put(String key, String value) { + throw new NotImplementedException(); + } + + @Override + public String remove(Object key) { + throw new NotImplementedException(); + } + + @Override + public void putAll(@NotNull Map m) { + throw new NotImplementedException(); + } + + @Override + public void clear() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Set keySet() { + throw new 
NotImplementedException(); + } + + @NotNull + @Override + public Collection values() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Set> entrySet() { + throw new NotImplementedException(); + } + }; + cache = CacheBuilder.newBuilder().maximumSize(MAXIMUM_MEMCACHE_SIZE).expireAfterAccess(Duration.ofHours(1)).build(new CacheLoader<>() { + @Override + public Optional load(String s) { + return Optional.ofNullable(index.get(s)); + } + }); + } + + @Override + protected void put(String key, String value) { + index.put(key, value); + } + + protected abstract StringIndex initializeIndex(String cachePath); + + @Override + public void load(DataResource aData) throws ResourceInitializationException { + // prepare the persistent index + URI uri = aData.getUri(); + File indexFile = null; + boolean loadData = true; + try { + File resourceFile; + try { + resourceFile = new File(uri); + } catch (IllegalArgumentException e) { + // to support relative file paths like file:resources/somefile.txt + resourceFile = new File(uri.getSchemeSpecificPart()); + } + String resourceFileName = FilenameUtils.getName(uri.toURL().getPath()); + indexFile = new File("es-consumer-cache", resourceFileName); + if (resourceFile.exists() && indexFile.exists() && resourceFile.lastModified() > indexFile.lastModified()) { + log.info("Resource file {} is newer than the existing cached index at {}. 
Creating new index.", resourceFile, indexFile); + if (indexFile.isDirectory()) { + log.info("Deleting index directory {}", indexFile); + FileUtils.deleteDirectory(indexFile); + } else { + log.info("Deleting index file {}", indexFile); + indexFile.delete(); + } + } else { + boolean indexFileExisted = indexFile.exists(); + if (!indexFileExisted) { + log.info("Creating persistent cache for resource {} at {}.", uri, indexFile); + } else { + log.info("Using existing persistent cache {} for resource {}.", indexFile, uri); + loadData = false; + } + } + index = initializeIndex(indexFile.getAbsolutePath()); + } catch (MalformedURLException e) { + log.error("Could obtain file name from resource URI '{}'", uri, e); + throw new IllegalStateException(e); + } catch (IOException e) { + log.error("Could not delete index file {}", indexFile, e); + throw new ResourceInitializationException(e); + } + if (loadData) { + super.load(aData); + if (index.requiresExplicitCommit()) + index.commit(); + } + index.close(); + index.open(); + log.info("There are {} entries in the cache at {}.", index.size(), indexFile); + } + + @Override + protected String getValue(String valueString) { + return valueString; + } + + @Override + protected String getKey(String keyString) { + return keyString; + } +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/ReversedDoubleMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/ReversedDoubleMapProvider.java index d9caa600a..fc1184319 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/ReversedDoubleMapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/ReversedDoubleMapProvider.java @@ -1,11 +1,21 @@ package de.julielab.jcore.consumer.es.sharedresources; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public class 
ReversedDoubleMapProvider extends AbstractMapProvider { + private final static Logger log = LoggerFactory.getLogger(ReversedDoubleMapProvider.class); public ReversedDoubleMapProvider() { + super(log); this.reverse = true; } + @Override + protected void put(String key, Double value) { + map.put(key, value); + } + @Override protected Double getValue(String valueString) { return Double.parseDouble(valueString.trim()); diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/StringIndex.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/StringIndex.java new file mode 100644 index 000000000..733dcc213 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/StringIndex.java @@ -0,0 +1,25 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +public interface StringIndex { + String get(String key); + + String[] getArray(String key); + + void put(String key, String value); + + void put(String key, String[] value); + + void commit(); + + boolean requiresExplicitCommit(); + + void close(); + + void open(); + + int size(); + + default String getName() { + return getClass().getSimpleName(); + } +} diff --git a/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-elasticsearch-consumer.xml b/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-elasticsearch-consumer.xml index cafc85e71..3b120a9d7 100644 --- a/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-elasticsearch-consumer.xml +++ b/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-elasticsearch-consumer.xml @@ -1,11 +1,10 @@ - + org.apache.uima.java true de.julielab.jcore.consumer.es.ElasticSearchConsumer JCore ElasticSearch Consumer - 2.5.1-SNAPSHOT urls @@ -23,7 +22,7 @@ type - The index type the generated documents should 
have. The types are removed from ElasticSearch with version 7 so this parameter is set to have the same value for all documents. + The index type the generated documents should have. The types are removed from ElasticSearch with version 7 and should be omitted for ES >= 7. String false false @@ -35,6 +34,20 @@ false false + + deleteDocumentsBeforeIndexing + Whether or not to delete documents with the docId of the UIMA CASes in ElasticSearch prior to indexing. This is useful when parts of the document are indexed whose IDs are not stable or that might change after document updates and would not just be overwritten when indexing anew. Defaults to false. + Boolean + false + false + + + documentIdField + Required when deleteDocumentsBeforeIndexing is set to true. This should be an existing index field that contains the document ID of each CAS. It is used to remove existing index documents related to the CAS document ID prior to indexing. + String + false + false + FieldGenerators An array of qualified Java class names. Each enumerated class must implement the FieldGenerator interface and is delivered by the user. These classes will be applied to the consumed CAS and populate Document instances with fields and thus determine the structure and content of the output documents. The field values are derived from CAS data. FieldGenerators always populate a single Document instance with fields. If multiple documents must be created for each CAS, refer to the DocumentGenerators parameter. 
@@ -71,10 +84,10 @@ false - - - - + + + + true true diff --git a/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-json-writer.xml b/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-json-writer.xml index efd472393..47d8daa69 100644 --- a/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-json-writer.xml +++ b/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-json-writer.xml @@ -5,7 +5,7 @@ de.julielab.jcore.consumer.es.JsonWriter JCoRe JSON Writer - 2.5.1-SNAPSHOT + 2.6.0 OutputDestination diff --git a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/AbstractFieldGeneratorTest.java b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/AbstractFieldGeneratorTest.java index 68292673e..588489b8c 100644 --- a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/AbstractFieldGeneratorTest.java +++ b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/AbstractFieldGeneratorTest.java @@ -367,6 +367,6 @@ public void testAddArray() { builder.registerTypeAdapter(PreanalyzedFieldValue.class, new PreanalyzedFieldValue.PreanalyzedFieldValueGsonAdapter()); Gson gson = builder.create(); - assertEquals("{\"field\":[\"eins\",\"zwei\"]}", gson.toJson(d)); + assertEquals("{\"field\":[\"eins\",\"zwei\"]}", gson.toJson(d)); } } diff --git a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java index 849c005df..b07d0ca86 100644 --- a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java +++ b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java @@ -4,42 +4,51 @@ import 
de.julielab.jcore.consumer.es.preanalyzed.Document; import de.julielab.jcore.consumer.es.preanalyzed.RawToken; import de.julielab.jcore.types.Header; +import de.julielab.jcore.utility.JCoReTools; import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.cas.CASException; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.GenericContainer; import org.testcontainers.containers.output.OutputFrame; import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import org.testcontainers.shaded.com.fasterxml.jackson.databind.ObjectMapper; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; import java.net.HttpURLConnection; import java.net.URL; import java.time.Duration; import java.util.Map; +import java.util.function.Supplier; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +@Testcontainers public class ElasticSearchConsumerIT { public static final String TEST_INDEX = "testindex"; public static final String TEST_CLUSTER = "testcluster"; private final static Logger log = LoggerFactory.getLogger(ElasticSearchConsumerIT.class); // in case we need to disable X-shield: https://stackoverflow.com/a/51172136/1314955 - @ClassRule - public static GenericContainer es = new GenericContainer("docker.elastic.co/elasticsearch/elasticsearch:7.0.1") + @Container + public static 
GenericContainer es = new GenericContainer("docker.elastic.co/elasticsearch/elasticsearch:7.17.0") .withEnv("xpack.security.enabled", "false") .withEnv("discovery.type", "single-node") .withExposedPorts(9200) .withStartupTimeout(Duration.ofMinutes(2)) .withEnv("cluster.name", TEST_CLUSTER); - @BeforeClass + @BeforeAll public static void setup() { Slf4jLogConsumer toStringConsumer = new Slf4jLogConsumer(log); es.followOutput(toStringConsumer, OutputFrame.OutputType.STDOUT); @@ -55,19 +64,82 @@ public void testMinimal() throws Exception { final AnalysisEngine consumer = AnalysisEngineFactory.createEngine(ElasticSearchConsumer.class, ElasticSearchConsumer.PARAM_INDEX_NAME, TEST_INDEX, ElasticSearchConsumer.PARAM_URLS, "http://localhost:" + es.getMappedPort(9200), - ElasticSearchConsumer.PARAM_FIELD_GENERATORS, new String[]{"de.julielab.jcore.consumer.es.ElasticSearchConsumerIT$TestFieldGenerator"}); + ElasticSearchConsumer.PARAM_FIELD_GENERATORS, new String[]{"de.julielab.jcore.consumer.es.ElasticSearchConsumerIT$MinimalTestFieldGenerator"}); consumer.process(jCas); consumer.collectionProcessComplete(); + Thread.sleep(4000); final URL url = new URL("http://localhost:" + es.getMappedPort(9200) + "/" + TEST_INDEX + "/_doc/987"); final ObjectMapper om = new ObjectMapper(); - final Map map = om.readValue(url.openStream(), Map.class); - assertEquals(jCas.getDocumentText(), ((Map)map.get("_source")).get("text")); + final Map map = om.readValue(url.openStream(), Map.class); + assertEquals(jCas.getDocumentText(), ((Map) map.get("_source")).get("text")); + } + + @Test + public void testDeleteDocumentsBeforeIndexing() throws Exception { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-types"); + final AnalysisEngine consumer = AnalysisEngineFactory.createEngine(ElasticSearchConsumer.class, + ElasticSearchConsumer.PARAM_INDEX_NAME, TEST_INDEX, + ElasticSearchConsumer.PARAM_URLS, "http://localhost:" + es.getMappedPort(9200), + 
ElasticSearchConsumer.PARAM_FIELD_GENERATORS, new String[]{"de.julielab.jcore.consumer.es.ElasticSearchConsumerIT$TestFieldGenerator"}); + // The indexing code is put into a lambda so we don't have to repeat ourselves + Runnable doIndex = () -> { + try { +// for (int j = 0; j < 2; ++j) { + for (int i = 0; i < 10; i++) { + jCas.setDocumentText("Some text."); + final Header header = new Header(jCas); + header.setDocId(String.valueOf(i)); + header.addToIndexes(); + consumer.process(jCas); + jCas.reset(); + } +// } + consumer.collectionProcessComplete(); + } catch (AnalysisEngineProcessException e) { + throw new RuntimeException(e); + } + }; + Supplier getNumDocuments = () -> { + try { + Thread.sleep(3000); + final URL countUrl = new URL("http://localhost:" + es.getMappedPort(9200) + "/" + TEST_INDEX + "/_count"); + final HttpURLConnection urlConnection = (HttpURLConnection) countUrl.openConnection(); + urlConnection.setRequestMethod("POST"); + urlConnection.setDoOutput(true); + urlConnection.setRequestProperty("Content-Type", "application/json"); + try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(urlConnection.getOutputStream()))) { + bw.write("{\"query\":{\"match_all\":{}}}"); + } + final String response = IOStreamUtilities.getStringFromInputStream(urlConnection.getInputStream()); + final Matcher matcher = Pattern.compile("count\":([0-9]+)").matcher(response); + matcher.find(); + return Integer.parseInt(matcher.group(1)); + } catch (InterruptedException| IOException e) { + throw new RuntimeException(e); + } + }; + + doIndex.run(); + doIndex.run(); + // we expect 20 document although we have indexed the same documents twice; the reason is that the index + // document ID is set randomly to simulate the situation where we index individual entities or relations + // that have a document ID different from the main docId + assertEquals(20, getNumDocuments.get()); + + // now activate delete-before-index. 
After indexing anew, there should be only 10 documents in the index + consumer.setConfigParameterValue(ElasticSearchConsumer.PARAM_DELETE_DOCS_BEFORE_INDEXING, true); + consumer.setConfigParameterValue(ElasticSearchConsumer.PARAM_DOC_ID_FIELD, "docId"); + consumer.reconfigure(); + doIndex.run(); + assertEquals(10, getNumDocuments.get()); } /** * This class is passed by name as parameter to the test consumer AE. */ public static class TestFieldGenerator extends FieldGenerator { + private int internalTestIdCounter = 0; + public TestFieldGenerator(FilterRegistry filterRegistry) { super(filterRegistry); } @@ -75,6 +147,27 @@ public TestFieldGenerator(FilterRegistry filterRegistry) { @Override public Document addFields(JCas aJCas, Document doc) { doc.addField("text", new RawToken(aJCas.getDocumentText())); + doc.addField("docId", new RawToken(JCoReTools.getDocId(aJCas))); + // some diverging index document ID; we use this to test if the delete-before-index function works + doc.setId("divergingid" + internalTestIdCounter++); + return doc; + } + } + + /** + * This class is passed by name as parameter to the test consumer AE. 
+ */ + public static class MinimalTestFieldGenerator extends FieldGenerator { + public MinimalTestFieldGenerator(FilterRegistry filterRegistry) { + super(filterRegistry); + } + + @Override + public Document addFields(JCas aJCas, Document doc) { + final String docId = JCoReTools.getDocId(aJCas); + doc.setId(docId); + // we need any field or the document won't be indexed + doc.addField("text", "Some text."); return doc; } } diff --git a/jcore-elasticsearch-consumer/src/test/resources/de/julielab/jcore/consumer/es/testTypes.xml b/jcore-elasticsearch-consumer/src/test/resources/de/julielab/jcore/consumer/es/testTypes.xml index 0b1bd8c30..7ff4dc049 100644 --- a/jcore-elasticsearch-consumer/src/test/resources/de/julielab/jcore/consumer/es/testTypes.xml +++ b/jcore-elasticsearch-consumer/src/test/resources/de/julielab/jcore/consumer/es/testTypes.xml @@ -2,7 +2,7 @@ testTypes Some types suited for unit tests. - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-elasticsearch-consumer/src/test/resources/onefile-output/thefile-Eriks-MacBook-Air-2.local-1-0.json-Eriks-MacBook-Air-2.local-2-2.json b/jcore-elasticsearch-consumer/src/test/resources/onefile-output/thefile-Eriks-MacBook-Air-2.local-1-0.json-Eriks-MacBook-Air-2.local-2-2.json deleted file mode 100644 index 5a085e8d3..000000000 --- a/jcore-elasticsearch-consumer/src/test/resources/onefile-output/thefile-Eriks-MacBook-Air-2.local-1-0.json-Eriks-MacBook-Air-2.local-2-2.json +++ /dev/null @@ -1,3 +0,0 @@ -{"documentText":"This is one line that should not be interrupted."} -{"documentText":"This is one line that should not be interrupted."} -{"documentText":"This is one line that should not be interrupted."} diff --git a/jcore-embedding-writer/component.meta b/jcore-embedding-writer/component.meta index c95336587..bfa25267d 100644 --- a/jcore-embedding-writer/component.meta +++ b/jcore-embedding-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-embedding-writer", "groupId": "de.julielab", - 
"version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Embedding Writer" } diff --git a/jcore-embedding-writer/pom.xml b/jcore-embedding-writer/pom.xml index 820510aa5..b5896bbf2 100644 --- a/jcore-embedding-writer/pom.xml +++ b/jcore-embedding-writer/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -28,14 +28,18 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab jcore-utilities ${jcore-utilities-version} + + org.apache.commons + commons-lang3 + org.assertj assertj-core diff --git a/jcore-embedding-writer/src/main/resources/de/julielab/jcore/consumer/ew/desc/jcore-embedding-writer.xml b/jcore-embedding-writer/src/main/resources/de/julielab/jcore/consumer/ew/desc/jcore-embedding-writer.xml index 14b684f02..491922b81 100644 --- a/jcore-embedding-writer/src/main/resources/de/julielab/jcore/consumer/ew/desc/jcore-embedding-writer.xml +++ b/jcore-embedding-writer/src/main/resources/de/julielab/jcore/consumer/ew/desc/jcore-embedding-writer.xml @@ -6,7 +6,7 @@ JCoRe Flair Embedding Writer Given a Flair compatible embedding and a UIMA annotation type, this component prints the embeddings of tokens annotated with the annotation to a file. 
- 2.5.1-SNAPSHOT + 2.6.0 UseGzip diff --git a/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/DecoderTest.java b/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/DecoderTest.java index 85ed94597..d34cdd780 100644 --- a/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/DecoderTest.java +++ b/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/DecoderTest.java @@ -2,7 +2,7 @@ import org.apache.commons.lang3.tuple.Pair; import org.assertj.core.data.Offset; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.*; import java.nio.ByteBuffer; diff --git a/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/EmbeddingWriterTest.java b/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/EmbeddingWriterTest.java index 6a80fbcbe..8d19cf1ce 100644 --- a/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/EmbeddingWriterTest.java +++ b/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/EmbeddingWriterTest.java @@ -10,7 +10,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.DoubleArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; diff --git a/jcore-event-flattener-ae/component.meta b/jcore-event-flattener-ae/component.meta index 94b772718..2fc02a11f 100644 --- a/jcore-event-flattener-ae/component.meta +++ b/jcore-event-flattener-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-event-flattener-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Event Flattener AE" } diff --git a/jcore-event-flattener-ae/pom.xml b/jcore-event-flattener-ae/pom.xml index 83ff43f48..cb83be2ac 100644 --- a/jcore-event-flattener-ae/pom.xml +++ b/jcore-event-flattener-ae/pom.xml @@ -1,48 +1,56 @@ - - 4.0.0 - - de.julielab - jcore-base - 
2.5.1-SNAPSHOT - - jcore-event-flattener-ae - JCoRe Event Flattener AE - This component reads de.julielab.jcore.types.EventMention annotations and converts event structures into de.julielab.jcore.types.ext.FlattenedRelation annotation. The purpose of FlattenedRelations is to represent complex event structures in a more simple manner. This can be helpful for visualization or further processing. - - - org.slf4j - slf4j-api - - - com.google.guava - guava - 18.0 - test - - - de.julielab - jcore-types - ${jcore-types-version} - - - ch.qos.logback - logback-classic - test - - - de.julielab - jcore-descriptor-creator - - junitjunit - - JULIE Lab Jena, Germany - http://www.julielab.de - - https://github.com/JULIELab/jcore-base/tree/master/jcore-event-flattener-ae - - - BSD-2-Clause - https://opensource.org/licenses/BSD-2-Clause - - + + 4.0.0 + + de.julielab + jcore-base + 2.6.0 + + jcore-event-flattener-ae + JCoRe Event Flattener AE + This component reads de.julielab.jcore.types.EventMention annotations and converts event structures + into de.julielab.jcore.types.ext.FlattenedRelation annotation. The purpose of FlattenedRelations is to represent + complex event structures in a more simple manner. This can be helpful for visualization or further processing. 
+ + + + org.slf4j + slf4j-api + + + com.google.guava + guava + 18.0 + test + + + de.julielab + jcore-types + ${jcore-types-version} + + + ch.qos.logback + logback-classic + test + + + de.julielab + jcore-descriptor-creator + + + org.junit.jupiter + junit-jupiter-engine + + + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-event-flattener-ae + + + BSD-2-Clause + https://opensource.org/licenses/BSD-2-Clause + + diff --git a/jcore-event-flattener-ae/src/main/java/de/julielab/jcore/ae/eventflattener/EventFlattener.java b/jcore-event-flattener-ae/src/main/java/de/julielab/jcore/ae/eventflattener/EventFlattener.java index 5a7e09497..10cca7a88 100644 --- a/jcore-event-flattener-ae/src/main/java/de/julielab/jcore/ae/eventflattener/EventFlattener.java +++ b/jcore-event-flattener-ae/src/main/java/de/julielab/jcore/ae/eventflattener/EventFlattener.java @@ -40,7 +40,7 @@ * roles for arguments of those event types. For more information, please refer * to http://www.nactem.ac.uk/tsujii/GENIA/SharedTask/detail.shtml#event. * - * @see http://www.nactem.ac.uk/tsujii/GENIA/SharedTask/detail.shtml#event + * @see http://www.nactem.ac.uk/tsujii/GENIA/SharedTask/detail.shtml#event *

* * @author faessler @@ -59,9 +59,9 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { List topEvents = determineTopEvents(aJCas); for (EventMention topEvent : topEvents) { List events = collectEventsInTree(topEvent, - new ArrayList()); + new ArrayList<>()); List arguments = collectPrimitiveArguments( - topEvent, new ArrayList()); + topEvent, new ArrayList<>()); List agentArguments = null; List patientArguments = null; switch (topEvent.getSpecificType()) { @@ -69,9 +69,9 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { case "Positive_regulation": case "Negative_regulation": agentArguments = collectAgentArguments(topEvent, - new ArrayList()); + new ArrayList<>()); patientArguments = collectPatientArguments(topEvent, - new ArrayList()); + new ArrayList<>()); break; default: break; @@ -98,7 +98,7 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { } catch (Exception e) { Header header = (Header) aJCas.getAnnotationIndex(Header.type) .iterator().next(); - log.error("Exception occurred in document {}: {}", + log.error("Exception occurred in document {}:", header.getDocId(), e); throw new AnalysisEngineProcessException(e); } @@ -178,7 +178,7 @@ private List collectPrimitiveArguments( * Returns the EventMentions in the CAS that are not the argument * of another event. 
* - * @param events + * @param aJCas * @return */ private List determineTopEvents(JCas aJCas) { diff --git a/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml b/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml index bbd7bde4f..a7af948d2 100644 --- a/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml +++ b/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml @@ -6,13 +6,14 @@ de.julielab.jcore.ae.eventflattener.EventFlattener Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0 de.julielab.jcore.ae.eventflattener - + + diff --git a/jcore-event-flattener-ae/src/test/java/de/julielab/jules/ae/EventFlattenerTest.java b/jcore-event-flattener-ae/src/test/java/de/julielab/jules/ae/EventFlattenerTest.java index ea1c0d4c3..8af8ce297 100644 --- a/jcore-event-flattener-ae/src/test/java/de/julielab/jules/ae/EventFlattenerTest.java +++ b/jcore-event-flattener-ae/src/test/java/de/julielab/jules/ae/EventFlattenerTest.java @@ -13,108 +13,108 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.FileInputStream; import java.util.Set; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EventFlattenerTest { - @SuppressWarnings("unused") - private final static Logger log = LoggerFactory - .getLogger(EventFlattenerTest.class); + @SuppressWarnings("unused") + private final static Logger log = LoggerFactory + .getLogger(EventFlattenerTest.class); - 
@Test - public void testProcess() throws Exception, SecurityException { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-all-types"); - XmiCasDeserializer.deserialize(new FileInputStream( - "src/test/resources/21499307.xmi"), jCas - .getCas()); + @Test + public void testProcess() throws Exception, SecurityException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-all-types"); + XmiCasDeserializer.deserialize(new FileInputStream( + "src/test/resources/21499307.xmi"), jCas + .getCas()); - AnalysisEngine flattener = AnalysisEngineFactory - .createEngine(EventFlattener.class); - flattener.process(jCas); + AnalysisEngine flattener = AnalysisEngineFactory + .createEngine(EventFlattener.class); + flattener.process(jCas); - FSIterator sentit = jCas.getAnnotationIndex(Sentence.type) - .iterator(); - int sentenceCounter = 1; - // we are interested in the 8th sentence because there is the only complex event structure there - Sentence interestingSent = null; - while (sentit.hasNext()) { - Sentence s = (Sentence) sentit.next(); - switch (sentenceCounter) { - case 3: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 2, countEventsInSentence(s)); - break; - case 5: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 1, countEventsInSentence(s)); - break; - case 6: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 2, countEventsInSentence(s)); - break; - case 7: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 4, countEventsInSentence(s)); - break; - case 8: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 6, countEventsInSentence(s)); - interestingSent = s; - break; - case 9: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 1, countEventsInSentence(s)); - break; - default: - assertEquals("Wrong number of flattened events 
in sentence " - + s.getCoveredText(), 0, countEventsInSentence(s)); - } - sentenceCounter++; - } - FSIterator flateventit = jCas - .getAnnotationIndex(FlattenedRelation.type).subiterator(interestingSent); - while (flateventit.hasNext()) { - FlattenedRelation fr = (FlattenedRelation) flateventit.next(); - if (fr.getId().equals("FE" + 13)) { - // All arguments there? - Set expectedArguments = Sets.newHashSet("anti-apoptotic Bcl-2", "CSN5"); - for (int i = 0; i < fr.getArguments().size(); ++i) - assertTrue("Unexpected argument: " + fr.getArguments(i).getCoveredText(), expectedArguments.remove(fr.getArguments(i).getCoveredText())); - assertTrue("Expected arguments not found in relation: " + expectedArguments, expectedArguments.isEmpty()); - // Arguments correctly divided into agents and patients? - assertEquals(1, fr.getAgents().size()); - assertEquals(1, fr.getPatients().size()); - assertEquals("CSN5", fr.getAgents(0).getCoveredText()); - assertEquals("anti-apoptotic Bcl-2", fr.getPatients(0).getCoveredText()); - // All participating (sub-)events there? 
- assertEquals(3, fr.getRelations().size()); - Set expectedRelations = Sets.newHashSet("depletion", "caused", "expression"); - for (int i = 0; i < fr.getRelations().size(); ++i) - assertTrue("Unexpected relation: " + fr.getRelations(i).getCoveredText(), expectedRelations.remove(fr.getRelations(i).getCoveredText())); - assertTrue(expectedRelations.isEmpty()); - } - } - - } + FSIterator sentit = jCas.getAnnotationIndex(Sentence.type) + .iterator(); + int sentenceCounter = 1; + // we are interested in the 8th sentence because there is the only complex event structure there + Sentence interestingSent = null; + while (sentit.hasNext()) { + Sentence s = (Sentence) sentit.next(); + switch (sentenceCounter) { + case 3: + assertEquals(2, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + break; + case 5: + assertEquals(1, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + break; + case 6: + assertEquals(2, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + break; + case 7: + assertEquals(4, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + break; + case 8: + assertEquals(6, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + interestingSent = s; + break; + case 9: + assertEquals(1, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + break; + default: + assertEquals(0, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + } + sentenceCounter++; + } + FSIterator flateventit = jCas + .getAnnotationIndex(FlattenedRelation.type).subiterator(interestingSent); + while (flateventit.hasNext()) { + FlattenedRelation fr = (FlattenedRelation) flateventit.next(); + if (fr.getId().equals("FE" + 13)) { + // All arguments there? 
+ Set expectedArguments = Sets.newHashSet("anti-apoptotic Bcl-2", "CSN5"); + for (int i = 0; i < fr.getArguments().size(); ++i) + assertTrue(expectedArguments.remove(fr.getArguments(i).getCoveredText()), "Unexpected argument: " + fr.getArguments(i).getCoveredText()); + assertTrue(expectedArguments.isEmpty(), "Expected arguments not found in relation: " + expectedArguments); + // Arguments correctly divided into agents and patients? + assertEquals(1, fr.getAgents().size()); + assertEquals(1, fr.getPatients().size()); + assertEquals("CSN5", fr.getAgents(0).getCoveredText()); + assertEquals("anti-apoptotic Bcl-2", fr.getPatients(0).getCoveredText()); + // All participating (sub-)events there? + assertEquals(3, fr.getRelations().size()); + Set expectedRelations = Sets.newHashSet("depletion", "caused", "expression"); + for (int i = 0; i < fr.getRelations().size(); ++i) + assertTrue(expectedRelations.remove(fr.getRelations(i).getCoveredText()), "Unexpected relation: " + fr.getRelations(i).getCoveredText()); + assertTrue(expectedRelations.isEmpty()); + } + } - private int countEventsInSentence(Sentence s) throws CASRuntimeException, - CASException { - FSIterator flateventit = s.getCAS().getJCas() - .getAnnotationIndex(FlattenedRelation.type).subiterator(s); - int count = 0; - while (flateventit.hasNext()) { - @SuppressWarnings("unused") - Annotation annotation = (Annotation) flateventit.next(); - count++; - } - return count; - } + } + + private int countEventsInSentence(Sentence s) throws CASRuntimeException, + CASException { + FSIterator flateventit = s.getCAS().getJCas() + .getAnnotationIndex(FlattenedRelation.type).subiterator(s); + int count = 0; + while (flateventit.hasNext()) { + @SuppressWarnings("unused") + Annotation annotation = (Annotation) flateventit.next(); + count++; + } + return count; + } } diff --git a/jcore-feature-value-replacement-ae/component.meta b/jcore-feature-value-replacement-ae/component.meta index d81fdcdaa..2451095b0 100644 --- 
a/jcore-feature-value-replacement-ae/component.meta +++ b/jcore-feature-value-replacement-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-feature-value-replacement-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Feature Value Replacement AE" } diff --git a/jcore-feature-value-replacement-ae/pom.xml b/jcore-feature-value-replacement-ae/pom.xml index 721035710..e68ec49e8 100644 --- a/jcore-feature-value-replacement-ae/pom.xml +++ b/jcore-feature-value-replacement-ae/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-feature-value-replacement-ae JCoRe Feature Value Replacement AE @@ -34,8 +34,8 @@ jcore-descriptor-creator
- junit - junit + org.junit.jupiter + junit-jupiter-engine
diff --git a/jcore-feature-value-replacement-ae/src/main/resources/de/julielab/jcore/ae/fvr/desc/jcore-feature-value-replacement-ae.xml b/jcore-feature-value-replacement-ae/src/main/resources/de/julielab/jcore/ae/fvr/desc/jcore-feature-value-replacement-ae.xml index 9be834fd5..8676f0848 100644 --- a/jcore-feature-value-replacement-ae/src/main/resources/de/julielab/jcore/ae/fvr/desc/jcore-feature-value-replacement-ae.xml +++ b/jcore-feature-value-replacement-ae/src/main/resources/de/julielab/jcore/ae/fvr/desc/jcore-feature-value-replacement-ae.xml @@ -6,7 +6,7 @@ de.julielab.jcore.ae.fvr.FeatureValueReplacementAnnotator Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0 de.julielab.jcore.ae.fvr diff --git a/jcore-feature-value-replacement-ae/src/test/java/de/julielab/jcore/ae/fvr/FeatureValueReplacementAnnotatorTest.java b/jcore-feature-value-replacement-ae/src/test/java/de/julielab/jcore/ae/fvr/FeatureValueReplacementAnnotatorTest.java index efb4df831..81958daf1 100644 --- a/jcore-feature-value-replacement-ae/src/test/java/de/julielab/jcore/ae/fvr/FeatureValueReplacementAnnotatorTest.java +++ b/jcore-feature-value-replacement-ae/src/test/java/de/julielab/jcore/ae/fvr/FeatureValueReplacementAnnotatorTest.java @@ -10,11 +10,11 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.resource.ExternalResourceDescription; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class FeatureValueReplacementAnnotatorTest { @Test diff --git a/jcore-file-reader/component.meta b/jcore-file-reader/component.meta index 2e3e09849..a166fe8b5 100644 --- a/jcore-file-reader/component.meta +++ b/jcore-file-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-file-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, 
"name": "JCoRe File Reader" } diff --git a/jcore-file-reader/pom.xml b/jcore-file-reader/pom.xml index 74d1574a6..38940e7ef 100644 --- a/jcore-file-reader/pom.xml +++ b/jcore-file-reader/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-file-reader JCoRe File Reader @@ -25,9 +25,10 @@ de.julielab julielab-java-utilities + - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java b/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java index 925c91e5e..f72f7fac1 100644 --- a/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java +++ b/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java @@ -20,12 +20,10 @@ import de.julielab.java.utilities.FileUtilities; import de.julielab.java.utilities.IOStreamUtilities; import de.julielab.jcore.types.Date; -import de.julielab.jcore.types.Sentence; -import de.julielab.jcore.types.Token; +import de.julielab.jcore.types.*; import de.julielab.jcore.types.pubmed.Header; import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException; import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.collection.CollectionReader_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; @@ -33,15 +31,20 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.*; +import java.nio.file.FileVisitOption; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; public class FileReader extends CollectionReader_ImplBase { - 
/** * */ @@ -82,7 +85,9 @@ public class FileReader extends CollectionReader_ImplBase { * */ public static final String ORIG_FILES_EXT = "OriginalFileExt"; - + public static final String TITLE_ABSTRACT_SPLIT = "MakeTitleAbstractSplit"; + public static final String REMOVE_FILE_NAME_EXTENSION_FOR_DOC_ID = "RemoveFileNameExtensionForDocId"; + private final static Logger log = LoggerFactory.getLogger(FileReader.class); private ArrayList files; private int fileIndex; @@ -107,6 +112,10 @@ public class FileReader extends CollectionReader_ImplBase { private File origFolder; @ConfigurationParameter(name = ORIG_FILES_EXT, mandatory = false) private String origFileExt; + @ConfigurationParameter(name = REMOVE_FILE_NAME_EXTENSION_FOR_DOC_ID, mandatory = false, defaultValue = "true") + private boolean removeFileNameExtensionForDocId; + @ConfigurationParameter(name = TITLE_ABSTRACT_SPLIT, mandatory = false, defaultValue = "false", description = "Use the first input line as the title with a Title annotation and mark the rest with the AbstractText annotation. 
Defaults to false.") + private boolean titleAbstractSplit; /** * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize() @@ -146,6 +155,7 @@ public void initialize() throws ResourceInitializationException { } else { useFilenameAsDocId = filenameAsDocId; } + removeFileNameExtensionForDocId = Optional.ofNullable((Boolean) getConfigParameterValue(REMOVE_FILE_NAME_EXTENSION_FOR_DOC_ID)).orElse(true); allowedExtensionsArray = (String[]) getConfigParameterValue(ALLOWED_FILE_EXTENSIONS); final Set allowedExtensions = new HashSet<>(); @@ -183,6 +193,8 @@ public void initialize() throws ResourceInitializationException { if (!inputDirectory.exists()) throw new ResourceInitializationException(AnnotatorConfigurationException.RESOURCE_NOT_FOUND, new Object[]{inputDirectory.getAbsolutePath()}); + titleAbstractSplit = (boolean) Optional.ofNullable(getConfigParameterValue(TITLE_ABSTRACT_SPLIT)).orElse(false); + fileIndex = 0; files = new ArrayList(); @@ -205,138 +217,159 @@ public boolean hasNext() { * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */ @Override - public void getNext(CAS aCAS) throws IOException, CollectionException { - JCas jcas; + public void getNext(CAS aCAS) throws CollectionException { + log.trace("Reading next file, if present"); + File file = null; try { - jcas = aCAS.getJCas(); - } catch (CASException e) { - throw new CollectionException(e); - } + JCas jcas = aCAS.getJCas(); - // open input stream to file - File file = files.get(fileIndex++); + // open input stream to file + file = files.get(fileIndex++); + log.trace("Got next file: {}", file); - String text = IOStreamUtilities.getStringFromInputStream(FileUtilities.getInputStreamFromFile(file)); + String text = IOStreamUtilities.getStringFromInputStream(FileUtilities.getInputStreamFromFile(file)); - Pattern nws = Pattern.compile("[^\\s]+", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); + Pattern nws = Pattern.compile("[^\\s]+", 
Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); - String origText = null; - if (origFolder != null) { - File origFile = new File(origFolder, getFileName(file) + "." + origFileExt); - origText = IOStreamUtilities.getStringFromInputStream(FileUtilities.getInputStreamFromFile(origFile)); - } + String origText = null; + if (origFolder != null) { + File origFile = new File(origFolder, getFileName(file, true) + "." + origFileExt); + origText = IOStreamUtilities.getStringFromInputStream(FileUtilities.getInputStreamFromFile(origFile)); + } - // sentence per line mode - if (sentencePerLine) { - BufferedReader rdr = new BufferedReader(new StringReader(text)); - List lines = new ArrayList(); - List start = new ArrayList(); - List end = new ArrayList(); - Integer tmp = 0; - String line; - while ((line = rdr.readLine()) != null) { - if (!Pattern.matches("\\s*", line)) { - lines.add(line); - start.add(tmp); - end.add(tmp + line.length()); + // sentence per line mode + if (sentencePerLine) { + log.trace("Reading input file as one sentence per line."); + BufferedReader rdr = new BufferedReader(new StringReader(text)); + List lines = new ArrayList(); + List start = new ArrayList(); + List end = new ArrayList(); + Integer tmp = 0; + String line; + while ((line = rdr.readLine()) != null) { + if (!Pattern.matches("\\s*", line)) { + lines.add(line); + start.add(tmp); + end.add(tmp + line.length()); + } + tmp += (line.length() + 1); } - tmp += (line.length() + 1); - } - rdr.close(); - - int index_tmp = 0; - Optional newLine; - for (Integer i = 0; i < lines.size(); i++) { - boolean addSent2index = true; - Sentence sent = new Sentence(jcas); - if (origText != null) { - newLine = Stream - .of(lines.get(i).split("\\s+")) - .map(x -> Pattern.quote(x)) - .reduce((x, y) -> x + "\\s*" + y); - Pattern p = Pattern.compile(newLine.get(), Pattern.UNICODE_CHARACTER_CLASS); - Matcher m = p.matcher(origText); - if (m.find(index_tmp)) { - int newStart = m.start(); - int newEnd = m.end(); - 
index_tmp = m.end() + 1; - sent.setBegin(newStart); - sent.setEnd(newEnd); + rdr.close(); + + int index_tmp = 0; + Optional newLine; + for (Integer i = 0; i < lines.size(); i++) { + boolean addSent2index = true; + Sentence sent = new Sentence(jcas); + if (origText != null) { + newLine = Stream + .of(lines.get(i).split("\\s+")) + .map(x -> Pattern.quote(x)) + .reduce((x, y) -> x + "\\s*" + y); + Pattern p = Pattern.compile(newLine.get(), Pattern.UNICODE_CHARACTER_CLASS); + Matcher m = p.matcher(origText); + if (m.find(index_tmp)) { + int newStart = m.start(); + int newEnd = m.end(); + index_tmp = m.end() + 1; + sent.setBegin(newStart); + sent.setEnd(newEnd); + } else { + addSent2index = false; + } } else { - addSent2index = false; + sent.setBegin(start.get(i)); + sent.setEnd(end.get(i)); + } + sent.setComponentId(this.getClass().getName() + " : Sentence per Line Mode"); + if (addSent2index) { + sent.addToIndexes(); } - } else { - sent.setBegin(start.get(i)); - sent.setEnd(end.get(i)); - } - sent.setComponentId(this.getClass().getName() + " : Sentence per Line Mode"); - if (addSent2index) { - sent.addToIndexes(); } } - } - //token by token mode - if (tokenByToken) { - List tokensList = new ArrayList<>(); - List tokStart = new ArrayList<>(); - List tokEnd = new ArrayList<>(); - - - Integer numberOfTokens = 0; - Matcher m = nws.matcher(text); - while (m.find()) { - String token = m.group(); - int start = m.start(); - int end = m.end(); - tokensList.add(token); - tokStart.add(start); - tokEnd.add(end); - numberOfTokens++; - } + //token by token mode + if (tokenByToken) { + log.trace("Reading input file as tokenized text with whitespace as token separator."); + List tokensList = new ArrayList<>(); + List tokStart = new ArrayList<>(); + List tokEnd = new ArrayList<>(); + + + Integer numberOfTokens = 0; + Matcher m = nws.matcher(text); + while (m.find()) { + String token = m.group(); + int start = m.start(); + int end = m.end(); + tokensList.add(token); + 
tokStart.add(start); + tokEnd.add(end); + numberOfTokens++; + } - int index_tmp = 0; - for (Integer j = 0; j < tokensList.size(); j++) { - boolean addToken2index = true; - Token token = new Token(jcas); - if (origText != null) { - String tok = tokensList.get(j); - int newStart = origText.indexOf(tok, index_tmp); - int newEnd = newStart + tok.length(); - index_tmp = newEnd; - token.setBegin(newStart); - token.setEnd(newEnd); - } else { - token.setBegin(tokStart.get(j)); - token.setEnd(tokEnd.get(j)); - } - token.setComponentId(this.getClass().getName() + " : Tokenized Mode"); - if (addToken2index) { - token.addToIndexes(); + int index_tmp = 0; + for (Integer j = 0; j < tokensList.size(); j++) { + boolean addToken2index = true; + Token token = new Token(jcas); + if (origText != null) { + String tok = tokensList.get(j); + int newStart = origText.indexOf(tok, index_tmp); + int newEnd = newStart + tok.length(); + index_tmp = newEnd; + token.setBegin(newStart); + token.setEnd(newEnd); + } else { + token.setBegin(tokStart.get(j)); + token.setEnd(tokEnd.get(j)); + } + token.setComponentId(this.getClass().getName() + " : Tokenized Mode"); + if (addToken2index) { + token.addToIndexes(); + } } } - } - // put document in CAS - if (origText != null) { - jcas.setDocumentText(origText); - } else { - jcas.setDocumentText(text); - } + // put document in CAS + if (origText != null) { + jcas.setDocumentText(origText); + } else { + jcas.setDocumentText(text); + } - if (useFilenameAsDocId) { + if (titleAbstractSplit) { + String docText = jcas.getDocumentText(); + final int firstNewlineIndex = docText.indexOf("\n"); + if (firstNewlineIndex > 0) { + final Title title = new Title(jcas, 0, firstNewlineIndex); + title.setTitleType("document"); + title.setComponentId(getClass().getCanonicalName()); + title.addToIndexes(); + } + if (firstNewlineIndex + 1 < docText.length()) { + final AbstractText abstractText = new AbstractText(jcas, firstNewlineIndex + 1, docText.length()); + 
abstractText.setComponentId(getClass().getCanonicalName()); + abstractText.addToIndexes(); + } + } - String filename = getFileName(file); + if (useFilenameAsDocId) { + String filename = getFileName(file, removeFileNameExtensionForDocId); + log.trace("Setting the file name {} as docId to a new Header annotation.", filename); - Header header = new Header(jcas); + Header header = new Header(jcas); - // set ID - header.setDocId(filename); + // set ID + header.setDocId(filename); - // set publication date - addDateForID(header, jcas, filename); + // set publication date + addDateForID(header, jcas, filename); - header.addToIndexes(); + header.addToIndexes(); + } + } catch (Throwable t) { + log.error("Could not read file {}", file, t); + throw new CollectionException(t); } } @@ -404,33 +437,25 @@ public Progress[] getProgress() { return new Progress[]{new ProgressImpl(fileIndex, files.size(), Progress.ENTITIES)}; } - private String[] createFileListByType(File inputDirectory, final Set allowedExtensions) throws IOException { - String[] path = new File(inputDirectory.getPath()).list(); - - for (int i = 0; i < path.length; i++) { - File file = new File(inputDirectory.getAbsolutePath() + "/" + path[i]); - - if (!useSubDirs && file.isDirectory()) - continue; - - String CurrentExtension = path[i].substring(path[i].lastIndexOf('.') + 1); - if (allowedExtensions.isEmpty() || allowedExtensions.contains(CurrentExtension)) { - files.add(file); - } - - if (useSubDirs && file.isDirectory()) { - createFileListByType(file, allowedExtensions); - } - } - - return path; + private void createFileListByType(File inputDirectory, final Set allowedExtensions) throws IOException { + Files.walk(inputDirectory.toPath(), useSubDirs ? 
Integer.MAX_VALUE : 1, FileVisitOption.FOLLOW_LINKS) + .filter(p -> { + if (allowedExtensions.isEmpty()) return true; + for (String ext : allowedExtensions) if (p.toString().endsWith(ext)) return true; + return false; + }) + .map(Path::toFile) + .filter(File::isFile) + .forEach(files::add); } - private String getFileName(File fi) { + private String getFileName(File fi, boolean removeExtension) { String filename = fi.getName(); - int extDotIndex = filename.lastIndexOf('.'); - if (extDotIndex > 0) { - filename = filename.substring(0, extDotIndex); + if (removeExtension) { + int extDotIndex = filename.lastIndexOf('.'); + if (extDotIndex > 0) { + filename = filename.substring(0, extDotIndex); + } } if (fileNameSplitUnderscore) { int extUnderScoreIndex = filename.lastIndexOf('_'); diff --git a/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml b/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml index 39a2be27e..a5ed06515 100644 --- a/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml +++ b/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml @@ -1,147 +1,171 @@ - org.apache.uima.java - de.julielab.jcore.reader.file.main.FileReader - - JCoRe File Reader - - 2.5.1-SNAPSHOT + org.apache.uima.java + de.julielab.jcore.reader.file.main.FileReader + + JCoRe File Reader + + 2.6.0 JULIELab Jena, Germany - - - InputDirectory - The directory where the text files reside. - String - false - true - - - UseFilenameAsDocId - If set to true, the filename is used as document ID (without extension). - Boolean - false - false - - - PublicatonDatesFile - The file that maps each article id (e.g. 
pubmed id) of the files in the input directory to the corresponding publication date (can be created using julielab/jules/ae/genemapper/utils/PubMedID2PublicationDate) - String - false - false - - - AllowedFileExtensions - A list of file name extensions to restrict the read files in the InputDirectory. All files will be read if this parameter is left blank. - String - true - false - - - SentencePerLine - Whether the files are preprocessed and have only one sentence per line. - Boolean - false - false - - - FileNameSplitUnderscore - Whether the filenames are splitted on underscore as well as. - Boolean - false - false - - - ReadSubDirs - Boolean - false - false - - - TokenByToken - Boolean - false - false - - - OriginalFolder - String - false - false - - - OriginalFileExt - String - false - false - - - - - UseFilenameAsDocId - - false - - - - InputDirectory - - data/files - - - - SentencePerLine - - false - - - - FileNameSplitUnderscore - - false - - - - ReadSubDirs - - false - - - - TokenByToken - - false - - - - OriginalFileExt - - txt - - - - - - - - - - - - - - - - de.julielab.jcore.types.pubmed.Header - de.julielab.jcore.types.Date - - - - - - true - false - true - - - + + + InputDirectory + The directory where the text files reside. + String + false + true + + + UseFilenameAsDocId + If set to true, the filename is used as document ID (without extension). + Boolean + false + false + + + PublicatonDatesFile + The file that maps each article id (e.g. pubmed id) of the files in the input directory to + the corresponding publication date (can be created using + julielab/jules/ae/genemapper/utils/PubMedID2PublicationDate) + + String + false + false + + + AllowedFileExtensions + A list of file name extensions to restrict the read files in the InputDirectory. All files + will be read if this parameter is left blank. + + String + true + false + + + SentencePerLine + Whether the files are preprocessed and have only one sentence per line. 
+ Boolean + false + false + + + FileNameSplitUnderscore + Whether the filenames are splitted on underscore as well as. + Boolean + false + false + + + ReadSubDirs + Boolean + false + false + + + TokenByToken + Boolean + false + false + + + OriginalFolder + String + false + false + + + OriginalFileExt + String + false + false + + + RemoveFileNameExtensionForDocId + Boolean + false + false + + + MakeTitleAbstractSplit + Use the first input line as the title with a Title annotation and mark the rest with the AbstractText annotation. Defaults to false. + Boolean + false + false + + + + + UseFilenameAsDocId + + false + + + + InputDirectory + + data/files + + + + SentencePerLine + + false + + + + FileNameSplitUnderscore + + false + + + + ReadSubDirs + + false + + + + TokenByToken + + false + + + + OriginalFileExt + + txt + + + + RemoveFileNameExtensionForDocId + + true + + + + + + + + + + + + + + + + de.julielab.jcore.types.pubmed.Header + de.julielab.jcore.types.Date + + + + + + true + false + true + + + \ No newline at end of file diff --git a/jcore-file-reader/src/test/java/de/julielab/jcore/reader/file/main/FileReaderTest.java b/jcore-file-reader/src/test/java/de/julielab/jcore/reader/file/main/FileReaderTest.java index f5f2f9cd7..f1e440d04 100644 --- a/jcore-file-reader/src/test/java/de/julielab/jcore/reader/file/main/FileReaderTest.java +++ b/jcore-file-reader/src/test/java/de/julielab/jcore/reader/file/main/FileReaderTest.java @@ -29,16 +29,16 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static 
org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class FileReaderTest { @@ -107,7 +107,7 @@ public class FileReaderTest { private static final String FILE_ARTIFACT_4 = "data/onlyToken/8563171.txt"; - @BeforeClass + @BeforeAll public static void setUp() throws Exception { writeArtifact(ARTIFACT_1, FILE_ARTIFACT_1); diff --git a/jcore-flair-ner-ae/README.md b/jcore-flair-ner-ae/README.md index a06e8a4d7..69d4b0ee0 100644 --- a/jcore-flair-ner-ae/README.md +++ b/jcore-flair-ner-ae/README.md @@ -12,6 +12,8 @@ The python executable lookup works as follows: 2. Otherwise, if the environment variable `PYTHON` is set, this value is used. 3. Otherwise, the `python` command is used. +Tested with flair 0.6.1 and PyTorch 1.7.1. + **1. Parameters** | Parameter Name | Parameter Type | Mandatory | Multivalued | Description | diff --git a/jcore-flair-ner-ae/component.meta b/jcore-flair-ner-ae/component.meta index 5340cb3ce..ee7f7bc6c 100644 --- a/jcore-flair-ner-ae/component.meta +++ b/jcore-flair-ner-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-flair-ner-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Flair NER AE" } diff --git a/jcore-flair-ner-ae/pom.xml b/jcore-flair-ner-ae/pom.xml index 5e9b35b49..7c2dd185c 100644 --- a/jcore-flair-ner-ae/pom.xml +++ b/jcore-flair-ner-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -21,7 +21,7 @@ de.julielab java-stdio-ipc - 1.0.1 + 1.0.3 de.julielab @@ -43,7 +43,7 @@ de.julielab jcore-annotation-adder-ae - 2.5.1-SNAPSHOT + 2.6.0 ch.qos.logback diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index 4aea01797..cf36e6c22 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ 
b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -1,5 +1,6 @@ package de.julielab.jcore.ae.flairner; +import de.julielab.java.utilities.IOStreamUtilities; import de.julielab.jcore.ae.annotationadder.AnnotationAdderAnnotator; import de.julielab.jcore.ae.annotationadder.AnnotationAdderConfiguration; import de.julielab.jcore.ae.annotationadder.AnnotationAdderHelper; @@ -8,9 +9,11 @@ import de.julielab.jcore.types.EntityMention; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; +import de.julielab.jcore.types.pubmed.InternalReference; import de.julielab.jcore.utility.JCoReAnnotationTools; import de.julielab.jcore.utility.JCoReTools; import de.julielab.jcore.utility.index.Comparators; +import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex; import de.julielab.jcore.utility.index.JCoReTreeMapAnnotationIndex; import de.julielab.jcore.utility.index.TermGenerators; import org.apache.uima.UimaContext; @@ -21,8 +24,10 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.DoubleArray; +import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,6 +35,8 @@ import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -57,9 +64,9 @@ public class FlairNerAnnotator extends JCasAnnotator_ImplBase { private String pythonExecutable; @ConfigurationParameter(name = PARAM_STORE_EMBEDDINGS, mandatory = false, description = "Optional. Possible values: ALL, ENTITIES, NONE. 
The FLAIR SequenceTagger first computes the embeddings for each sentence and uses those as input for the actual NER algorithm. By default, the embeddings are not stored. By setting this parameter to ALL, the embeddings of all tokens of the sentence are retrieved from flair and stored in the embeddingVectors feature of each token. Setting the parameter to ENTITIES will restrict the embedding storage to those tokens which overlap with an entity recognized by FLAIR.") private StoreEmbeddings storeEmbeddings; - @ConfigurationParameter(name = PARAM_GPU_NUM, mandatory = false, defaultValue="0", description = "Specifies the GPU device number to be used for FLAIR. This setting can be overwritten by the Java system property 'flairner.device'.") + @ConfigurationParameter(name = PARAM_GPU_NUM, mandatory = false, defaultValue = "0", description = "Specifies the GPU device number to be used for FLAIR. This setting can be overwritten by the Java system property 'flairner.device'.") private int gpuNum; - @ConfigurationParameter(name=PARAM_COMPONENT_ID, mandatory = false, description = "Specifies the componentId feature value given to the created annotations. Defaults to 'FlairNerAnnotator'.") + @ConfigurationParameter(name = PARAM_COMPONENT_ID, mandatory = false, description = "Specifies the componentId feature value given to the created annotations. 
Defaults to 'FlairNerAnnotator'.") private String componentId; private AnnotationAdderConfiguration adderConfig; @@ -72,7 +79,7 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization entityClass = (String) aContext.getConfigParameterValue(PARAM_ANNOTATION_TYPE); flairModel = (String) aContext.getConfigParameterValue(PARAM_FLAIR_MODEL); storeEmbeddings = StoreEmbeddings.valueOf(Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_STORE_EMBEDDINGS)).orElse(StoreEmbeddings.NONE.name())); - gpuNum = Optional.ofNullable((Integer)aContext.getConfigParameterValue(PARAM_GPU_NUM)).orElse(0); + gpuNum = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_GPU_NUM)).orElse(0); componentId = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_COMPONENT_ID)).orElse(getClass().getSimpleName()); if (System.getProperty(GPU_NUM_SYS_PROP) != null) { try { @@ -95,9 +102,35 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization pythonExecutable = pythonExecutableOpt.get(); log.info("Python executable: {} (from descriptor)", pythonExecutable); } + List pythonCommands = List.of("python3", "python3.6", "python36", "python3.7", "python37", "python"); + for (int i = 0; i < pythonCommands.size() && pythonExecutable == null; i++) { + String currentPythonExecutable = pythonCommands.get(i); + log.debug("Trying Python executable: {}", currentPythonExecutable); + try { + try { + Process exec = new ProcessBuilder(List.of(currentPythonExecutable, "--version")).redirectErrorStream(true).start(); + List pythonOutput = IOStreamUtilities.getLinesFromInputStream(exec.getInputStream()); + int exitCode = exec.waitFor(); + if (exitCode == 0 && !pythonOutput.isEmpty()) { + String versionLine = pythonOutput.get(0); + Matcher m = Pattern.compile("3\\..*$").matcher(versionLine); + if (m.find()) { + pythonExecutable = currentPythonExecutable; + log.info("Found Python {} with command {}.", m.group(), 
pythonExecutable); + } + } + } catch (IOException e) { + log.trace("Python command {} does not exist. Trying the next.", currentPythonExecutable); + } + } catch (InterruptedException e) { + log.error("Error why trying to call python.", e); + throw new ResourceInitializationException(e); + } + } if (pythonExecutable == null) { - pythonExecutable = "python"; - log.info("Python executable: {} (default)", pythonExecutable); + String msg = String.format("Could not find Python 3.x installation. The following commands were tried: %s. Please make Python 3.x available under one of those commands or specify the Python executable explicitly in the component descriptor.", String.join(", ", pythonCommands)); + log.error(msg); + throw new ResourceInitializationException(new IllegalArgumentException(msg)); } try { connector = new StdioPythonConnector(flairModel, pythonExecutable, storeEmbeddings, gpuNum); @@ -125,22 +158,37 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization */ @Override public void process(final JCas aJCas) throws AnalysisEngineProcessException { - int i = 0; - final AnnotationIndex sentIndex = aJCas.getAnnotationIndex(Sentence.class); - Map sentenceMap = new HashMap<>(); - for (Sentence sentence : sentIndex) { - if (sentence.getId() == null) - sentence.setId("s" + i++); - sentenceMap.put(sentence.getId(), sentence); - } try { + int i = 0; + final AnnotationIndex sentIndex = aJCas.getAnnotationIndex(Sentence.class); + Map sentenceMap = new HashMap<>(); + for (Sentence sentence : sentIndex) { + if (sentence.getId() == null) + sentence.setId("s" + i++); + sentenceMap.put(sentence.getId(), sentence); + } + if (log.isDebugEnabled()) { + if (sentenceMap.isEmpty()) + log.debug("Document {} does not have any sentences.", JCoReTools.getDocId(aJCas)); + if (!aJCas.getAnnotationIndex(Token.class).iterator().hasNext()) + log.debug("Document {} does not have any tokens", JCoReTools.getDocId(aJCas)); + } + JCoReOverlapAnnotationIndex 
intRefIndex = new JCoReOverlapAnnotationIndex<>(aJCas, InternalReference.type); final AnnotationAdderHelper helper = new AnnotationAdderHelper(); + if (log.isTraceEnabled()) + log.trace("Sending document sentences to flair for entity tagging: {}", JCasUtil.select(aJCas, Sentence.class).stream().map(Sentence::getCoveredText).collect(Collectors.toList())); final NerTaggingResponse taggingResponse = connector.tagSentences(StreamSupport.stream(sentIndex.spliterator(), false)); final List taggedEntities = taggingResponse.getTaggedEntities(); for (TaggedEntity entity : taggedEntities) { + log.trace("Adding flair-tagged entity to the CAS: {}", entity); final Sentence sentence = sentenceMap.get(entity.getDocumentId()); EntityMention em = (EntityMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, entityClass); helper.setAnnotationOffsetsRelativeToSentence(sentence, em, entity, adderConfig); + excludeReferenceAnnotationSpans(em, intRefIndex); + if (em.getEnd() <= em.getBegin() || em.getCoveredText().isBlank()) { + // It seems there was nothing left of a gene mention outside the internal reference; skip + continue; + } em.setSpecificType(entity.getTag()); em.setConfidence(String.valueOf(entity.getLabelConfidence())); em.setComponentId(componentId); @@ -160,6 +208,9 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { final String docId = JCoReTools.getDocId(aJCas); log.error("Could not set the offsets of an annotation in document {}", docId); throw new AnalysisEngineProcessException(e); + } catch (Throwable t) { + log.error("Error in {}", this.getClass().getSimpleName(), t); + throw new AnalysisEngineProcessException(t); } } @@ -167,7 +218,7 @@ private void addTokenEmbeddings(JCas aJCas, Map sentenceMap, A final List tokenEmbeddings = taggingResponse.getTokenEmbeddings(); JCoReTreeMapAnnotationIndex tokenIndex = null; if (!tokenEmbeddings.isEmpty()) - tokenIndex = new 
JCoReTreeMapAnnotationIndex<>(Comparators.longOverlapComparator(),TermGenerators.longOffsetTermGenerator(), TermGenerators.longOffsetTermGenerator(), aJCas, Token.type); + tokenIndex = new JCoReTreeMapAnnotationIndex<>(Comparators.longOverlapComparator(), TermGenerators.longOffsetTermGenerator(), TermGenerators.longOffsetTermGenerator(), aJCas, Token.type); Map> originalTokenEmbeddings = new HashMap<>(); for (TokenEmbedding tokenEmbedding : tokenEmbeddings) { final Sentence sentence = sentenceMap.get(tokenEmbedding.getSentenceId()); @@ -213,6 +264,32 @@ private void addTokenEmbeddings(JCas aJCas, Map sentenceMap, A } } + /** + * Internal references can actually look like a part of a gene, e.g. "filament19" where "19" is a reference. + * Exclude those spans from the gene mentions. + * + * @param a The gene annotation. + * @param intRefIndex The reference index. + */ + private void excludeReferenceAnnotationSpans(Annotation a, JCoReOverlapAnnotationIndex intRefIndex) { + List annotationsInGene = intRefIndex.search(a); + for (Annotation overlappingAnnotation : annotationsInGene) { + if (overlappingAnnotation.getBegin() == a.getBegin()) { + a.setBegin(overlappingAnnotation.getEnd()); + } + if (overlappingAnnotation.getEnd() == a.getEnd()) { + a.setEnd(overlappingAnnotation.getBegin()); + } + // Set zero-character spans on genes that are completely enclosed by a reference. Those are cases + // like, for instance, "Supplementary Figs. S12 and S13, Tables S2 and S3" where S12, S13 and even + // Tables S2 are annotated as genes. 
+ if (overlappingAnnotation.getBegin() <= a.getBegin() && overlappingAnnotation.getEnd() >= a.getEnd()) { + a.setBegin(0); + a.setEnd(0); + } + } + } + @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { try { diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/NerTaggingResponse.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/NerTaggingResponse.java index f28e7bd22..b876a0731 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/NerTaggingResponse.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/NerTaggingResponse.java @@ -1,7 +1,6 @@ package de.julielab.jcore.ae.flairner; import java.util.List; -import java.util.stream.Stream; /** *

A class to assemble the response from FLAIR for a tagging request. The found entities are returned as diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/PythonConnector.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/PythonConnector.java index 2ba03c82c..f7a09ba7b 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/PythonConnector.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/PythonConnector.java @@ -1,7 +1,6 @@ package de.julielab.jcore.ae.flairner; import de.julielab.jcore.types.Sentence; -import org.apache.commons.lang3.tuple.Pair; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import java.io.IOException; diff --git a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/desc/jcore-flair-ner-ae.xml b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/desc/jcore-flair-ner-ae.xml index 3d158471f..f4ca36655 100644 --- a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/desc/jcore-flair-ner-ae.xml +++ b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/desc/jcore-flair-ner-ae.xml @@ -6,7 +6,7 @@ JCoRe Flair Named Entity Recognizer This component starts a child process to a python interpreter and loads a Flair sequence tagging model. Sentences are taken from the CAS, sent to Flair for tagging and the results are written into the CAS. The annotation type to use can be configured. It must be a subtype of de.julielab.jcore.types.EntityMention. The tag of each entity is written to the specificType feature. 
- 2.5.1-SNAPSHOT + 2.6.0 AnnotationType diff --git a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py index d55859594..f37fdab4a 100644 --- a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py +++ b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py @@ -43,9 +43,16 @@ def decodeString(buffer): ba = bytearray() for sentenceToTag in sentenceTaggingRequests: sid = sentenceToTag['sid'] - sentence = Sentence(sentenceToTag['text']) + # In newer flair versions we need to specify the tokenizer in order to use + # the exact input tokenization and avoid token offset mismatches + if "0.4" in flair.__version__: + sentence = Sentence(sentenceToTag['text']) + else: + from flair.tokenization import SpaceTokenizer + # Use the SpaceTokenizer to just use the tokenization given from UIMA + sentence = Sentence(sentenceToTag['text'], use_tokenizer=SpaceTokenizer()) # NER tagging - embeddingStorageMode = "none" if sendEmbeddings == "NONE" else "cpu"; + embeddingStorageMode = "none" if sendEmbeddings == "NONE" else "cpu" tagger.predict(sentence, embedding_storage_mode = embeddingStorageMode) for e in sentence.get_spans("ner"): diff --git a/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java b/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java index 2317e08e9..56fc4d046 100644 --- a/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java +++ b/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java @@ -4,6 +4,7 @@ import de.julielab.jcore.types.Gene; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; +import de.julielab.jcore.types.pubmed.InternalReference; import de.julielab.jcore.utility.index.Comparators; import 
de.julielab.jcore.utility.index.JCoReTreeMapAnnotationIndex; import de.julielab.jcore.utility.index.TermGenerators; @@ -43,7 +44,7 @@ public class FlairNerAnnotatorTest { @Test public void testAnnotatorWithoutWordEmbeddings() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); String text = "Knockdown of SUB1 homolog by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; jCas.setDocumentText(text); @@ -69,10 +70,39 @@ public void testAnnotatorWithoutWordEmbeddings() throws Exception { engine.collectionProcessComplete(); } + @Test + public void testAnnotatorWithoutWordEmbeddings2() throws Exception { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); + String text = "Knockdown of SUB1 homolog2 by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; + jCas.setDocumentText(text); + Sentence s = new Sentence(jCas, 0, text.length()); + addTokens(jCas); + s.addToIndexes(); + new InternalReference(jCas, 25, 26).addToIndexes(); + engine.process(jCas); + List foundGenes = new ArrayList<>(); + JCoReTreeMapAnnotationIndex tokenIndex = new 
JCoReTreeMapAnnotationIndex<>(TermGenerators.longOffsetTermGenerator(), TermGenerators.longOffsetTermGenerator(), jCas, Token.type); + for (Annotation a : jCas.getAnnotationIndex(Gene.type)) { + Gene g = (Gene) a; + foundGenes.add(g.getCoveredText()); + assertThat(g.getSpecificType().equals("Gene")); + final Iterator tokenIt = tokenIndex.searchFuzzy(g).iterator(); + while (tokenIt.hasNext()) { + Token token = tokenIt.next(); + assertThat(token.getEmbeddingVectors()).isNull(); + } + assertThat(Double.parseDouble(g.getConfidence())).isGreaterThan(0.64); + assertThat(g.getComponentId().equals(FlairNerAnnotator.class.getSimpleName())); + } + assertThat(foundGenes).containsExactly("SUB1 homolog", "HIV-1", "VSV-G", "HIV-1"); + engine.collectionProcessComplete(); + } + @Test public void testAnnotatorWithEntityWordEmbeddings() throws Exception { embeddingsCache.clear(); - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_STORE_EMBEDDINGS, ENTITIES, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt", FlairNerAnnotator.PARAM_COMPONENT_ID, "ATotallyDifferentComponentId"); String text = "Knockdown of SUB1 homolog by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; jCas.setDocumentText(text); @@ -111,7 +141,7 @@ public void testAnnotatorWithEntityWordEmbeddings() throws Exception { @Test(dependsOnMethods = "testAnnotatorWithEntityWordEmbeddings") public void testAnnotatorWithEntitySubWordEmbeddings() throws Exception { - final JCas jCas = 
JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_STORE_EMBEDDINGS, ENTITIES, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); String text = "Knockdown of SUB1 homolog by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; jCas.setDocumentText(text); @@ -179,7 +209,7 @@ private double l2Norm(double[] vector) { @Test public void testAnnotatorWithAllEmbeddings() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_STORE_EMBEDDINGS, FlairNerAnnotator.StoreEmbeddings.ALL, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); String text = "Knockdown of SUB1 homolog by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; jCas.setDocumentText(text); @@ -214,7 +244,7 @@ private void addSentences(JCas jCas) { @Test public void testAnnotator2() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", 
"de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); // The sentence detection and tokenization was done by the jcore-j[st]bd-biomedical-english JCoRe project components, using the executable (java -jar) command line artifact created when building the components. String text = "Synergistic lethal effect between hydrogen peroxide and neocuproine ( 2,9-dimethyl 1,10-phenanthroline ) in Escherichia coli .\n" + @@ -240,8 +270,8 @@ public void testAnnotator2() throws Exception { } @Test - public void testAnnotatorOnOffsetIsseDocument() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); + public void testAnnotatorOnOffsetIssueDocument() throws Exception { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "1681975.xmi").toString()), jCas.getCas()); @@ -259,7 +289,7 @@ public void testAnnotatorOnOffsetIsseDocument() throws Exception { @Test public void testEmbeddings2() throws Exception { - final JCas jCas = 
JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt", FlairNerAnnotator.PARAM_STORE_EMBEDDINGS, ENTITIES); // The sentence detection and tokenization was done by the jcore-j[st]bd-biomedical-english JCoRe project components, using the executable (java -jar) command line artifact created when building the components. String text = "We show that tal controls gene expression and tissue folding in Drosophila , thus acting as a link between patterning and morphogenesis .\n tal function is mediated by several 33-nucleotide-long open reading frames ( ORFs )"; diff --git a/jcore-flair-ner-ae/src/test/resources/1681975.xmi b/jcore-flair-ner-ae/src/test/resources/1681975.xmi index 467d07936..04b9b74fa 100644 --- a/jcore-flair-ner-ae/src/test/resources/1681975.xmi +++ b/jcore-flair-ner-ae/src/test/resources/1681975.xmi @@ -1 +1,5 @@ -1681975 \ No newline at end of file + +1681975 \ No newline at end of file diff --git a/jcore-flair-token-embedding-ae/component.meta b/jcore-flair-token-embedding-ae/component.meta index 82dc90b84..da21368ce 100644 --- a/jcore-flair-token-embedding-ae/component.meta +++ b/jcore-flair-token-embedding-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-flair-token-embedding-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Flair Token Embedding Annotator" } diff --git a/jcore-flair-token-embedding-ae/pom.xml b/jcore-flair-token-embedding-ae/pom.xml index aa197c601..18e2e2a77 100644 --- a/jcore-flair-token-embedding-ae/pom.xml +++ 
b/jcore-flair-token-embedding-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -30,11 +30,11 @@ de.julielab java-stdio-ipc - 1.0.1 + 1.0.3 - junit - junit + org.junit.jupiter + junit-jupiter-engine com.google.code.gson diff --git a/jcore-flair-token-embedding-ae/src/main/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotator.java b/jcore-flair-token-embedding-ae/src/main/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotator.java index d41381bc9..a268d48fd 100644 --- a/jcore-flair-token-embedding-ae/src/main/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotator.java +++ b/jcore-flair-token-embedding-ae/src/main/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotator.java @@ -4,6 +4,7 @@ import de.julielab.ipc.javabridge.Options; import de.julielab.ipc.javabridge.ResultDecoders; import de.julielab.ipc.javabridge.StdioBridge; +import de.julielab.java.utilities.IOStreamUtilities; import de.julielab.jcore.types.EmbeddingVector; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; @@ -30,6 +31,8 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; @ResourceMetaData(name = "JCoRe Flair Token Embedding Annotator", description = "Adds the Flair compatible embedding vectors to the token annotations.") @TypeCapability(inputs = {"de.julielab.jcore.types.Sentence", "de.julielab.jcore.types.Token"}, outputs = {"de.julielab.jcore.types.EmbeddingVector"}) @@ -37,7 +40,7 @@ public class FlairTokenEmbeddingAnnotator extends JCasAnnotator_ImplBase { public static final String PARAM_EMBEDDING_PATH = "EmbeddingPath"; public static final String PARAM_COMPUTATION_FILTER = "ComputationFilter"; - public static final String PARAM_EMBEDDING_SOURCE = "EmbeddingSource"; + public static final String PARAM_EMBEDDING_SOURCE = "EmbeddingSource"; public static final String PARAM_PYTHON_EXECUTABLE = "PythonExecutable"; 
private final static Logger log = LoggerFactory.getLogger(FlairTokenEmbeddingAnnotator.class); /** @@ -48,9 +51,9 @@ public class FlairTokenEmbeddingAnnotator extends JCasAnnotator_ImplBase { private String embeddingPath; @ConfigurationParameter(name = PARAM_COMPUTATION_FILTER, mandatory = false, description = "This parameter may be set to a fully qualified annotation type. If given, only for documents containing at least one annotation of this type embeddings will be retrieved from the computing flair python script. However, for contextualized embeddings, all embedding vectors are computed anyway and the the I/O cost is minor in comparison to the embedding computation. Thus, setting this parameter will most probably only result in small time savings.") private String computationFilter; - @ConfigurationParameter(name=PARAM_EMBEDDING_SOURCE, mandatory = false, description = "The value of this parameter will be set to the source feature of the EmbeddingVector annotation instance created on the tokens. If left blank, the value of the " + PARAM_EMBEDDING_PATH + " will be used.") + @ConfigurationParameter(name = PARAM_EMBEDDING_SOURCE, mandatory = false, description = "The value of this parameter will be set to the source feature of the EmbeddingVector annotation instance created on the tokens. If left blank, the value of the " + PARAM_EMBEDDING_PATH + " will be used.") private String embeddingSource; - @ConfigurationParameter(name=PARAM_PYTHON_EXECUTABLE, mandatory = false, description = "The path to the python executable. Required is a python verion >=3.6.") + @ConfigurationParameter(name = PARAM_PYTHON_EXECUTABLE, mandatory = false, description = "The path to the python executable. 
Required is a python version >=3.6.") private String pythonExecutable; private StdioBridge flairBridge; private Gson gson; @@ -68,9 +71,9 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization computationFilter = (String) aContext.getConfigParameterValue(PARAM_COMPUTATION_FILTER); embeddingSource = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_EMBEDDING_SOURCE)).orElse(embeddingPath); - Optional pythonExecutableOpt = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_PYTHON_EXECUTABLE)); + Optional pythonExecutableOpt = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_PYTHON_EXECUTABLE)); if (!pythonExecutableOpt.isPresent()) { - log.debug("No python executable given in the component descriptor, trying to read PYTHON environment variable." ); + log.debug("No Python executable given in the component descriptor, trying to read PYTHON environment variable."); final String pythonExecutableEnv = System.getenv("PYTHON"); if (pythonExecutableEnv != null) { pythonExecutable = pythonExecutableEnv; @@ -80,9 +83,35 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization pythonExecutable = pythonExecutableOpt.get(); log.info("Python executable: {} (from descriptor)", pythonExecutable); } + List pythonCommands = List.of("python3", "python3.6", "python36", "python3.7", "python37", "python"); + for (int i = 0; i < pythonCommands.size() && pythonExecutable == null; i++) { + String currentPythonExecutable = pythonCommands.get(i); + log.debug("Trying Python executable: {}", currentPythonExecutable); + try { + try { + Process exec = new ProcessBuilder(List.of(currentPythonExecutable, "--version")).redirectErrorStream(true).start(); + List pythonOutput = IOStreamUtilities.getLinesFromInputStream(exec.getInputStream()); + int exitCode = exec.waitFor(); + if (exitCode == 0 && !pythonOutput.isEmpty()) { + String versionLine = pythonOutput.get(0); + Matcher m = 
Pattern.compile("3\\..*$").matcher(versionLine); + if (m.find()) { + pythonExecutable = currentPythonExecutable; + log.info("Found Python {} with command {}.", m.group(), pythonExecutable); + } + } + } catch (IOException e) { + log.trace("Python command {} does not exist. Trying the next.", currentPythonExecutable); + } + } catch (InterruptedException e) { + log.error("Error why trying to call python.", e); + throw new ResourceInitializationException(e); + } + } if (pythonExecutable == null) { - pythonExecutable = "python3.6"; - log.info("Python executable: {} (default)", pythonExecutable); + String msg = String.format("Could not find Python 3.x installation. The following commands were tried: %s. Please make Python 3.x available under one of those commands or specify the Python executable explicitly in the component descriptor.", String.join(", ", pythonCommands)); + log.error(msg); + throw new ResourceInitializationException(new IllegalArgumentException(msg)); } try { @@ -183,7 +212,7 @@ private String constructEmbeddingRequest(JCas aJCas, List tokenToAddEmbed } ++tokenIndex; } - sentenceTextSb.deleteCharAt(sentenceTextSb.length()-1); + sentenceTextSb.deleteCharAt(sentenceTextSb.length() - 1); Map sentenceAndIndices = new HashMap<>(); sentenceAndIndices.put("sentence", sentenceTextSb.toString()); sentenceAndIndices.put("tokenIndicesToReturn", tokenIndicesToSet); diff --git a/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/desc/jcore-flair-token-embedding-ae.xml b/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/desc/jcore-flair-token-embedding-ae.xml index 81db110e0..343cbce3b 100644 --- a/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/desc/jcore-flair-token-embedding-ae.xml +++ b/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/desc/jcore-flair-token-embedding-ae.xml @@ -6,7 +6,7 @@ JCoRe Flair Token Embedding Annotator Adds the Flair compatible embedding 
vectors to the token annotations. - 2.5.1-SNAPSHOT + 2.6.0 EmbeddingPath diff --git a/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py b/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py index a262f84af..43095851a 100644 --- a/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py +++ b/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py @@ -1,15 +1,11 @@ -import os -from flair.models import SequenceTagger +import json +import sys +import time from flair.data import Sentence -from typing import List - -from flair.embeddings import WordEmbeddings, CharacterEmbeddings, BytePairEmbeddings, FlairEmbeddings, BertEmbeddings, ELMoEmbeddings from flair.embeddings import StackedEmbeddings - -import sys -import json +from flair.embeddings import WordEmbeddings, CharacterEmbeddings, BytePairEmbeddings, FlairEmbeddings, BertEmbeddings, \ + ELMoEmbeddings from struct import * -import time def decodeString(buffer): diff --git a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/EmbeddingScriptTest.java b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/EmbeddingScriptTest.java index ee2ff04ae..d62ad9b4e 100644 --- a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/EmbeddingScriptTest.java +++ b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/EmbeddingScriptTest.java @@ -5,8 +5,8 @@ import de.julielab.ipc.javabridge.ResultDecoders; import de.julielab.ipc.javabridge.StdioBridge; import org.assertj.core.data.Offset; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.HashMap; @@ -20,11 +20,11 @@ public class EmbeddingScriptTest { private static final String SCRIPT_PATH = 
"src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py"; private static String pythonCommand; - @BeforeClass + @BeforeAll public static void setup() { pythonCommand = System.getenv("PYTHON"); if (pythonCommand == null) - pythonCommand = "python3.6"; + pythonCommand = "python"; } @Test @@ -49,11 +49,7 @@ public void testPythonEmbeddingScriptSimple() throws Exception { final double[][] vectors = response.map(ResultDecoders.decodeVectors).findAny().get(); bridge.stop(); - assertThat(vectors).hasSize(10); - for (double[] vector : vectors) { - // The vectors should all have a dimensionality of 1024 - assertThat(vector.length).isEqualTo(1024); - } + assertThat(vectors).hasDimensions(10, 1024); // Those values were output using print(token.embedding.numpy(), file=sys.stderr) in the script assertThat(vectors[0][0]).isCloseTo(1.8812446e-01, Offset.offset(0.000001)); @@ -86,11 +82,7 @@ public void testPythonEmbeddingScriptSpecificVectorsResponse() throws Exception final double[][] vectors = response.map(ResultDecoders.decodeVectors).findAny().get(); bridge.stop(); - assertThat(vectors).hasSize(2); - for (int i = 0; i < vectors.length; i++) { - // The vectors should all have a dimensionality of 1024 - assertThat(vectors[i].length).isEqualTo(1024); - } + assertThat(vectors).hasDimensions(2, 1024); // Those values were output using print(token.embedding.numpy(), file=sys.stderr) in the script assertThat(vectors[0][0]).isCloseTo(-0.16511102, Offset.offset(0.000001)); @@ -128,7 +120,7 @@ public void testPythonEmbeddingScriptMultipleSentences() throws Exception { final double[][] vectors = response.map(ResultDecoders.decodeVectors).findAny().get(); bridge.stop(); - assertThat(vectors).hasSize(12); + assertThat(vectors.length).isEqualTo(12); } } diff --git a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java 
b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java index 200bb491c..f6ef8acce 100644 --- a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java +++ b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java @@ -8,7 +8,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Collection; @@ -18,6 +18,7 @@ * Unit tests for jcore-flair-token-embedding-ae. */ public class FlairTokenEmbeddingAnnotatorTest { + @Test public void testEmbeddingAnnotator() throws Exception { final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); @@ -29,7 +30,8 @@ public void testEmbeddingAnnotator() throws Exception { addTokens(jCas); final String embeddingPath = "flair:src/test/resources/gene_small_best_lm.pt"; - final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", + FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath); engine.process(jCas); @@ -58,7 +60,9 @@ public void testEmbeddingAnnotatorWithFilterAnnotation() throws Exception { new Gene(jCas, 75, 91).addToIndexes(); final String embeddingPath = "flair:src/test/resources/gene_small_best_lm.pt"; - final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath, FlairTokenEmbeddingAnnotator.PARAM_COMPUTATION_FILTER, "de.julielab.jcore.types.Gene"); + final AnalysisEngine engine = 
AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", + FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath, + FlairTokenEmbeddingAnnotator.PARAM_COMPUTATION_FILTER, "de.julielab.jcore.types.Gene"); engine.process(jCas); diff --git a/jcore-flow-controllers/component.meta b/jcore-flow-controllers/component.meta new file mode 100644 index 000000000..bbae688c0 --- /dev/null +++ b/jcore-flow-controllers/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "flowcontroller" + ], + "description": "Flow controllers can be used to control the route a (J)CAS takes through an aggregate analysis engine. This project contains Flow Controllers developed at the JULIE Lab.", + "descriptors": [ + { + "category": "flowcontroller", + "location": "de.julielab.jcore.flow.annotationdefined.desc.jcore-annotation-defined-flowcontroller" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-flow-controllers", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe Flow Controllers" +} diff --git a/jcore-flow-controllers/pom.xml b/jcore-flow-controllers/pom.xml new file mode 100644 index 000000000..1bd6ede1d --- /dev/null +++ b/jcore-flow-controllers/pom.xml @@ -0,0 +1,70 @@ + + + + jcore-base + de.julielab + 2.6.0 + + 4.0.0 + + jcore-flow-controllers + + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-types-version} + + + org.junit.jupiter + junit-jupiter-engine + + + ch.qos.logback + logback-classic + provided + + + org.assertj + assertj-core + test + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + org.jetbrains + annotations + 21.0.1 + compile + + + de.julielab + jcore-descriptor-creator + + + + JCoRe Flow Controllers + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-flow-controllers + Flow controllers can be used to control the route a (J)CAS takes through an 
aggregate analysis engine. + This project contains Flow Controllers developed at the JULIE Lab. + + + + BSD-2-Clause + https://opensource.org/licenses/BSD-2-Clause + + + \ No newline at end of file diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java new file mode 100644 index 000000000..c945ef0eb --- /dev/null +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java @@ -0,0 +1,89 @@ +package de.julielab.jcore.flow.annotationdefined; + +import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.analysis_engine.metadata.FixedFlow; +import org.apache.uima.analysis_engine.metadata.FlowConstraints; +import org.apache.uima.flow.FinalStep; +import org.apache.uima.flow.JCasFlow_ImplBase; +import org.apache.uima.flow.SimpleStep; +import org.apache.uima.flow.Step; +import org.apache.uima.jcas.JCas; +import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; +import java.util.Set; +import java.util.stream.Collectors; + +/** + *

Returns steps according to an existing {@link ToVisit} annotation of the CAS or, if not present, the default aggregate flow.

+ *

This is, for example, used by the XMLDBMultiplier to let CASes skip large parts of the pipeline when + * the currently read document already exists in the database.

+ */ +public class AnnotationDefinedFlow extends JCasFlow_ImplBase { + private final static Logger log = LoggerFactory.getLogger(AnnotationDefinedFlow.class); + private String[] toVisitKeys; + private String[] fixedFlow; + private int currentPos; + private String docId; + + /** + *

Creates a flow that follows the entries in {@link ToVisit#getDelegateKeys()} of toVisit or, if + * toVisit is null, falls back to the default fixed flow.

+ *

If toVisit is not null but the delegateKeys are null or empty, no component in the aggregate using this flow will process the respective CAS.

+ * + * @param toVisit An annotation containing the keys of the delegate AEs to visit. May be null which case the default fixed flow will be used. + * @param flowConstraints The default fixed flow of the aggregate analysis engine. + * @param jCas + * @throws AnalysisEngineProcessException If flowConstraints is not a fixed flow. + */ + public AnnotationDefinedFlow(@Nullable ToVisit toVisit, FlowConstraints flowConstraints, JCas jCas) throws AnalysisEngineProcessException { + if (!(flowConstraints instanceof FixedFlow)) + throw new AnalysisEngineProcessException(new IllegalArgumentException("This flow requires the FixedFlow to determine the default processing order. However, the flow constraints are of type " + flowConstraints.getClass().getCanonicalName())); + this.fixedFlow = ((FixedFlow) flowConstraints).getFixedFlow(); + // We have the following cases: + // 1. There are given keys to visit, use them. + // 2. There are no keys given but the ToVisit annotation is not null, skip all components. + // 3. There is not ToVisit annotation at all, use the default fixed flow. + if (log.isTraceEnabled()) { + docId = JCoReTools.getDocId(jCas); + if (toVisit != null) { + String[] delegateKeys = toVisit.getDelegateKeys() != null ? toVisit.getDelegateKeys().toArray() : null; + log.trace("Found ToVisit annotation for document {} with the following component keys: {}", docId, delegateKeys); + } else { + log.trace("Got no ToVisit annotation for document {}, the CAS is routed through the aggregate in the default order.", docId); + } + } + if (toVisit != null && toVisit.getDelegateKeys() != null) { + // filter for delegates actually contained in the current AAE. + Set knownKeys = Arrays.stream(this.fixedFlow).collect(Collectors.toSet()); + toVisitKeys = Arrays.stream(toVisit.getDelegateKeys().toArray()).filter(knownKeys::contains).toArray(String[]::new); + } else if (toVisit != null) + toVisitKeys = new String[0]; + else + toVisitKeys = null; + this.currentPos = 0; + } + + /** + *

Routes the CAS to the next component defined by the CAS's {@link ToVisit} annotation or, + * if ToVisit was not found, to the next component as defined by the default fixed flow.

+ * + * @return The next component to visit or the next default flow component. + */ + @Override + public Step next() { + // If toVisitKeys was not given, we just use the fixedFlow. + if ((toVisitKeys == null && currentPos < fixedFlow.length) || (toVisitKeys != null && currentPos < toVisitKeys.length)) { + String nextAEKey = toVisitKeys != null ? toVisitKeys[currentPos] : fixedFlow[currentPos]; + ++currentPos; + log.trace("Next component key to visit for document {}: {}", docId, nextAEKey); + return new SimpleStep(nextAEKey); + } + log.trace("Flow finished for document {}.", docId); + return new FinalStep(); + } +} diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java new file mode 100644 index 000000000..c6c016e45 --- /dev/null +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java @@ -0,0 +1,25 @@ +package de.julielab.jcore.flow.annotationdefined; + +import de.julielab.jcore.types.casflow.ToVisit; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.flow.Flow; +import org.apache.uima.flow.JCasFlowController_ImplBase; +import org.apache.uima.jcas.JCas; + +/** + *

Routes CASes through an aggregate analysis engine according to the {@link ToVisit} annotation present in the CAS.

+ *

If there is no ToVisit annotation, the default (fixed) flow will be used. Thus, the fixed flow constraint + * must be set on the aggregate engine.

+ */ +@ResourceMetaData(name = "JCoRe Annotation Defined Flow Controller", description = "This flow controller relies on an annotation of type ToVisit to be present in the CAS. If there is no such annotation, the default fixed flow of the aggregate engine using this flow controller is used. Otherwise, the names of the components to pass the CAS to are taken from the annotation. If the annotation exists but defines to components to be visited by the CAS, no components are visited at all.", vendor = "JULIE Lab, Germany", version = "placeholder") +public class AnnotationDefinedFlowController extends JCasFlowController_ImplBase { + @Override + public Flow computeFlow(JCas jCas) throws AnalysisEngineProcessException { + boolean exists = JCasUtil.exists(jCas, ToVisit.class); + ToVisit toVisit = exists ? JCasUtil.selectSingle(jCas, ToVisit.class) : null; + // When toVisit is null, the default, fixed flow is used. + return new AnnotationDefinedFlow(toVisit, getContext().getAggregateMetadata().getFlowConstraints(), jCas); + } +} diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/FixedInnerFlow.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/FixedInnerFlow.java new file mode 100644 index 000000000..eeae85f0a --- /dev/null +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/FixedInnerFlow.java @@ -0,0 +1,40 @@ +package de.julielab.jcore.flow.annotationdefined; + +import org.apache.uima.flow.FinalStep; +import org.apache.uima.flow.JCasFlow_ImplBase; +import org.apache.uima.flow.SimpleStep; +import org.apache.uima.flow.Step; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + *

This flow is supposed to route the output CASes of the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplier} in + * a fixed, sequential manner through the aggregate engine. It just skips the first delegate - the multiplier itself - then continues with the rest.

+ */ +public class FixedInnerFlow extends JCasFlow_ImplBase { + private final static Logger log = LoggerFactory.getLogger(FixedInnerFlow.class); + private int currentPosition; + private String[] fixedFlow; + + public FixedInnerFlow(String[] fixedFlow) { + this.fixedFlow = fixedFlow; + this.currentPosition = 0; + } + + public Step next() { + Step step = null; + for (; currentPosition < fixedFlow.length && step == null; currentPosition++) { + String aeKey = fixedFlow[currentPosition]; + // The first analysis engine is the multiplier + if (currentPosition > 0) { + log.trace("Inner next AE is: " + aeKey); + step = new SimpleStep(aeKey); + } + } + if (step == null) { + // no appropriate AEs to call - end of flow + log.trace("Inner flow Complete."); + } + return step == null ? new FinalStep() : step; + } +} diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonFlowController.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonFlowController.java new file mode 100644 index 000000000..bdbf88c9c --- /dev/null +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonFlowController.java @@ -0,0 +1,117 @@ +//package de.julielab.jcore.flow.annotationdefined; +// +//import de.julielab.costosys.configuration.FieldConfig; +//import de.julielab.costosys.dbconnection.CoStoSysConnection; +//import de.julielab.costosys.dbconnection.DataBaseConnector; +//import de.julielab.jcore.reader.db.DBReader; +//import de.julielab.jcore.types.casmultiplier.RowBatch; +//import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +//import org.apache.uima.cas.FeatureStructure; +//import org.apache.uima.fit.descriptor.ConfigurationParameter; +//import org.apache.uima.fit.descriptor.ResourceMetaData; +//import org.apache.uima.fit.util.JCasUtil; +//import org.apache.uima.flow.Flow; +//import org.apache.uima.flow.FlowControllerContext; +//import 
org.apache.uima.flow.JCasFlowController_ImplBase; +//import org.apache.uima.jcas.JCas; +//import org.apache.uima.jcas.cas.StringArray; +//import org.apache.uima.resource.ResourceInitializationException; +//import org.slf4j.Logger; +//import org.slf4j.LoggerFactory; +// +//import java.io.FileNotFoundException; +//import java.sql.ResultSet; +//import java.sql.SQLException; +//import java.util.*; +//import java.util.stream.Collectors; +// +///** +// *

 Prerequisites

+// *

 Expects a jCas as output by the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplierReader}, i.e. the CAS +// * should contain a {@link de.julielab.jcore.types.casmultiplier.RowBatch} annotation. Then, it retrieves the sha256 hashes for +// * the passed documents from the database.

+// */ +//@ResourceMetaData(name = "JCoRe Hash Comparison Flow Controller", description = "This flow controller aims to skip processing for CASes that already exist in the database and haven't changed with regards to a newly read version. For this purpose, the sha256 hash of the CAS document text is compared to the the existing hash in the database for the same document ID. If the hashes match, the text is the same and, thus, the annotations will be the same.") +//public class HashComparisonFlowController extends JCasFlowController_ImplBase { +// public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; +// public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; +// private final static Logger log = LoggerFactory.getLogger(HashComparisonFlowController.class); +// @ConfigurationParameter(name = DBReader.PARAM_COSTOSYS_CONFIG_NAME, description = "Path to the CoStoSys configuration XML file that specifies the database this pipeline writes to, i.e. the same file that the DB XMI Writer is using. If there is no DB Writer in use, this flow controller is not applicable.") +// private String costosysConfig; +// @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, description = "Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all component except the DBCheckpointAE to mark the document as processed.") +// private String documentItemToHash; +// @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, description = "String parameter indicating the name of the " + +// "table where the XMI data will be stored. 
The name must be schema qualified.") +// private String docTableParamValue; +// +// private DataBaseConnector dbc; +// +// @Override +// public void initialize(FlowControllerContext aContext) throws ResourceInitializationException { +// this.costosysConfig = (String) aContext.getConfigParameterValue(DBReader.PARAM_COSTOSYS_CONFIG_NAME); +// this.documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); +// try { +// dbc = new DataBaseConnector(this.costosysConfig); +// } catch (FileNotFoundException e) { +// log.error("Could not create the CoStoSys DatabaseConnector:", e); +// throw new ResourceInitializationException(e); +// } +// } +// +// @Override +// public Flow computeFlow(JCas jCas) throws AnalysisEngineProcessException { +// RowBatch rowBatch; +// try { +// rowBatch = JCasUtil.selectSingle(jCas, RowBatch.class); +// } catch (IllegalArgumentException e) { +// log.error("Could not select the RowBatch annotation from the JCas:", e); +// throw new AnalysisEngineProcessException(e); +// } +// Map id2hash = fetchCurrentHashesFromDatabase(rowBatch); +// return new HashComparisonOuterFlow(id2hash, documentItemToHash, getContext().getAggregateMetadata().getFlowConstraints()); +// } +// +// /** +// *

Fetches the hashes of the currently stored documents in the database.

+// * @param rowBatch The annotation specifying which documents should be fetched by the multiplier and then be processed by the aggregate. +// * @return A map from a string representation of the RowBatches document IDs to the hashes for the respective IDs. +// * @throws AnalysisEngineProcessException If the SQL request fails. +// */ +// private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { +// String dataTable = dbc.getNextDataTable(rowBatch.getTableName()); +// String hashColumn = documentItemToHash + "_sha256"; +// // Extract the document IDs in this RowBatch. The IDs could be composite keys. +// List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); +// Iterator documentIDsIt = rowBatch.getIdentifiers().iterator(); +// while (documentIDsIt.hasNext()) { +// StringArray pkArray = (StringArray) documentIDsIt.next(); +// documentIds.add(pkArray.toStringArray()); +// } +// Map id2hash = new HashMap<>(documentIds.size()); +// // This is the map we want to fill that lets us look up the hash of the document text by document ID. +// String sql = null; +// // Query the database for the document IDs in the current RowBatch and retrieve hashes. 
+// try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { +// FieldConfig activeTableFieldConfiguration = dbc.getActiveTableFieldConfiguration(); +// String idQuery = documentIds.stream() +// .map(key -> Arrays.stream(key).map(part -> "%s='" + part + '"').toArray(String[]::new)) +// .map(activeTableFieldConfiguration::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) +// .collect(Collectors.joining(" OR ")); +// sql = String.format("SELECT %s,%s FROM %s WHERE %s", activeTableFieldConfiguration.getPrimaryKeyString(), hashColumn, dataTable, idQuery); +// ResultSet rs = conn.createStatement().executeQuery(sql); +// while (rs.next()) { +// StringBuilder pkSb = new StringBuilder(); +// for (int i = 0; i < activeTableFieldConfiguration.getPrimaryKey().length; i++) +// pkSb.append(rs.getString(i)).append(','); +// // Remove training comma +// pkSb.deleteCharAt(pkSb.length()); +// String hash = rs.getString(activeTableFieldConfiguration.getPrimaryKey().length); +// id2hash.put(pkSb.toString(), hash); +// } +// } catch (SQLException e) { +// log.error("Could not retrieve hashes from the database. 
SQL query was {}:", sql, e); +// throw new AnalysisEngineProcessException(e); +// } +// return id2hash; +// } +//} diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonOuterFlow.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonOuterFlow.java new file mode 100644 index 000000000..896b52892 --- /dev/null +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonOuterFlow.java @@ -0,0 +1,72 @@ +//package de.julielab.jcore.flow.annotationdefined; +// +//import org.apache.commons.codec.binary.Base64; +//import org.apache.commons.codec.digest.DigestUtils; +//import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +//import org.apache.uima.analysis_engine.metadata.FixedFlow; +//import org.apache.uima.analysis_engine.metadata.FlowConstraints; +//import org.apache.uima.flow.*; +//import org.apache.uima.jcas.JCas; +//import org.slf4j.Logger; +//import org.slf4j.LoggerFactory; +// +//import java.util.Map; +// +///** +// *

Note: This flow can only be used in an aggregate analysis engine where the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplier} is the first component.

+// *

This flow is created by the {@link HashComparisonFlowController} and routes the CAS that was filled by the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplierReader}. +// * This CAS contains an instance of {@link de.julielab.jcore.types.casmultiplier.RowBatch} which contains the information which documents should be read +// * from which database table.

+// *

Within this flow, the reader CAS is passed to the multiplier, the first component. For CASes created by the multiplier, +// * the method {@link #newCasProduced(JCas, String)} is called for which a new flow concerning the processing order of the +// * multiplier-created CASes within the aggregate is determined.

+// */ +//public class HashComparisonOuterFlow extends JCasFlow_ImplBase { +// private final static Logger log = LoggerFactory.getLogger(HashComparisonOuterFlow.class); +// private String[] fixedFlow; +// private int currentPosition; +// private Map id2hash; +// private String documentItemToHash; +// +// public HashComparisonOuterFlow(Map id2hash, String documentItemToHash, FlowConstraints flowConstraints) throws AnalysisEngineProcessException { +// this.id2hash = id2hash; +// this.documentItemToHash = documentItemToHash; +// if (!(flowConstraints instanceof FixedFlow)) { +// throw new AnalysisEngineProcessException(new IllegalArgumentException("This flow requires the original FixedFlow to know the order of the delegate engines but the given flow is of type " + flowConstraints.getClass())); +// } +// FixedFlow fixedFlow = (FixedFlow) flowConstraints; +// this.fixedFlow = fixedFlow.getFixedFlow(); +// this.currentPosition = 0; +// } +// +// @Override +// protected Flow newCasProduced(JCas newCas, String producedBy) throws AnalysisEngineProcessException { +// String newHash = getHash(newCas); +// return new FixedInnerFlow(fixedFlow); +// } +// +// private String getHash(JCas newCas) { +// final String documentText = newCas.getDocumentText(); +// final byte[] sha = DigestUtils.sha256(documentText.getBytes()); +// return Base64.encodeBase64String(sha); +// } +// +// public Step next() { +// Step step = null; +// for (; currentPosition < fixedFlow.length && step == null; currentPosition++) { +// String aeKey = fixedFlow[currentPosition]; +// +// // The outer flow only passes the CAS to the CAS multiplier. The multiplier creates more CASes which +// // are then passed to newCasProduced() and are then routed by the InnerFlow. 
+// if (currentPosition == 0) { +// log.trace("Outer next AE is: " + aeKey); +// step = new SimpleStep(aeKey); +// } +// } +// if (step == null) { +// // no appropriate AEs to call - end of flow +// log.trace("Outer flow Complete."); +// } +// return step == null ? new FinalStep() : step; +// } +//} diff --git a/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml b/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml new file mode 100644 index 000000000..b64a02723 --- /dev/null +++ b/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml @@ -0,0 +1,24 @@ + + + org.apache.uima.java + de.julielab.jcore.flow.annotationdefined.AnnotationDefinedFlowController + + JCoRe Annotation Defined Flow Controller + This flow controller relies on an annotation of type ToVisit to be present in the CAS. If there is no such annotation, the default fixed flow of the aggregate engine using this flow controller is used. Otherwise, the names of the components to pass the CAS to are taken from the annotation. If the annotation exists but defines to components to be visited by the CAS, no components are visited at all. 
+ placeholder + JULIE Lab, Germany + + + + + + + + + + false + true + false + + + \ No newline at end of file diff --git a/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java b/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java new file mode 100644 index 000000000..228e94a49 --- /dev/null +++ b/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java @@ -0,0 +1,110 @@ +package de.julielab.jcore.flow.annotationdefined; + +import de.julielab.jcore.types.Token; +import de.julielab.jcore.types.casflow.ToVisit; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.FlowControllerFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.flow.FlowControllerDescription; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.junit.jupiter.api.Test; + +import static java.util.Arrays.asList; +import static org.assertj.core.api.Assertions.assertThat; +public class AnnotationDefinedFlowControllerTest { + @Test + public void testFlowControllerSingleKey() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-casflow-types"); + ToVisit toVisit = new ToVisit(jCas); + StringArray toVisitKeys = new StringArray(jCas, 1); + toVisitKeys.set(0, "TestAE 2"); + toVisit.setDelegateKeys(toVisitKeys); + toVisit.addToIndexes(); + + 
AnalysisEngine aae = createTestAAE(); + + aae.process(jCas); + + FSIterator it = jCas.getAnnotationIndex(Token.type).iterator(); + assertThat(it).toIterable().extracting(Token::getComponentId).containsExactly("TestAE 2"); + } + + @Test + public void testFlowControllerNoKey() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-casflow-types"); + ToVisit toVisit = new ToVisit(jCas); + StringArray toVisitKeys = new StringArray(jCas, 0); + toVisit.setDelegateKeys(toVisitKeys); + toVisit.addToIndexes(); + + AnalysisEngine aae = createTestAAE(); + + aae.process(jCas); + + FSIterator it = jCas.getAnnotationIndex(Token.type).iterator(); + assertThat(it).isExhausted(); + } + + @Test + public void testFlowControllerNullKey() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-casflow-types"); + ToVisit toVisit = new ToVisit(jCas); + toVisit.addToIndexes(); + + AnalysisEngine aae = createTestAAE(); + + aae.process(jCas); + + FSIterator it = jCas.getAnnotationIndex(Token.type).iterator(); + assertThat(it).isExhausted(); + } + + @Test + public void testFlowControllerNoVisitAnnotation() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-casflow-types"); + + AnalysisEngine aae = createTestAAE(); + + aae.process(jCas); + + FSIterator it = jCas.getAnnotationIndex(Token.type).iterator(); + assertThat(it).toIterable().extracting(Token::getComponentId).containsExactly("TestAE 1", "TestAE 2", "TestAE 3"); + } + + private AnalysisEngine createTestAAE() throws ResourceInitializationException { + FlowControllerDescription flowControllerDescription = FlowControllerFactory.createFlowControllerDescription(AnnotationDefinedFlowController.class); + AnalysisEngineDescription testAeDesc1 = 
AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", "TestAE 1"); + AnalysisEngineDescription testAeDesc2 = AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", "TestAE 2"); + AnalysisEngineDescription testAeDesc3 = AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", "TestAE 3"); + AnalysisEngineDescription aaeWithFlowController = AnalysisEngineFactory.createEngineDescription(asList(testAeDesc1, testAeDesc2, testAeDesc3), asList("TestAE 1", "TestAE 2", "TestAE 3"), null, null, + flowControllerDescription); + AnalysisEngine aae = AnalysisEngineFactory.createEngine(aaeWithFlowController); + return aae; + } + + public static class TestAE extends JCasAnnotator_ImplBase { + @ConfigurationParameter(name = "name") + private String name; + + @Override + public void initialize(UimaContext context) { + name = (String) context.getConfigParameterValue("name"); + } + + @Override + public void process(JCas jCas) { + // Indicate that this jCas was processed by this component. + Token token = new Token(jCas); + token.setComponentId(name); + token.addToIndexes(); + } + } +} diff --git a/jcore-gnormplus-ae/LICENSE b/jcore-gnormplus-ae/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-gnormplus-ae/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-gnormplus-ae/README.md b/jcore-gnormplus-ae/README.md new file mode 100644 index 000000000..7e8752172 --- /dev/null +++ b/jcore-gnormplus-ae/README.md @@ -0,0 +1,34 @@ +# JCoRe GNormPlus Annotator + +**Descriptor Path**: +``` +de.julielab.jcore.ae.gnp.desc.jcore-gnormplus-ae +``` + +Wrapper for the JULIE Lab variant of the GNormPlus gene ID mapper. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? 
diff --git a/jcore-gnormplus-ae/component.meta b/jcore-gnormplus-ae/component.meta new file mode 100644 index 000000000..ae9063b04 --- /dev/null +++ b/jcore-gnormplus-ae/component.meta @@ -0,0 +1,37 @@ +{ + "categories": [ + "ae", + "multiplier" + ], + "description": "Wrapper for the JULIE Lab variant of the GNormPlus gene ID mapper.", + "descriptors": [ + { + "category": "multiplier", + "location": "de.julielab.jcore.multiplier.gnp.desc.jcore-gnormplus-pmc-db-multiplier" + }, + { + "category": "multiplier", + "location": "de.julielab.jcore.multiplier.gnp.desc.jcore-gnormplus-bioc-multiplier" + }, + { + "category": "multiplier", + "location": "de.julielab.jcore.multiplier.gnp.desc.jcore-gnormplus-xmi-db-multiplier" + }, + { + "category": "multiplier", + "location": "de.julielab.jcore.multiplier.gnp.desc.jcore-gnormplus-xml-db-multiplier" + }, + { + "category": "ae", + "location": "de.julielab.jcore.ae.gnp.desc.jcore-gnormplus-ae" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-gnormplus-ae", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe GNormPlus Annotator" +} diff --git a/jcore-gnormplus-ae/pom.xml b/jcore-gnormplus-ae/pom.xml new file mode 100644 index 000000000..ab8d4d208 --- /dev/null +++ b/jcore-gnormplus-ae/pom.xml @@ -0,0 +1,85 @@ + + + + 4.0.0 + jcore-gnormplus-ae + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0 + + + + + + + de.julielab + jcore-gnp-bioc-writer + 2.6.0 + + + de.julielab + jcore-gnp-bioc-reader + 2.6.0 + + + de.julielab + jcore-xmi-db-reader + 2.6.0 + + + de.julielab + jcore-xml-db-reader + 2.6.0 + + + de.julielab + jcore-pmc-db-reader + 2.6.0 + + + de.julielab + julielab-gnormplus + 1.0.0 + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-types-version} + + + de.julielab + jcore-descriptor-creator + + + org.junit.jupiter + junit-jupiter-engine + + + JCoRe GNormPlus Annotator + + JULIE Lab Jena, 
Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-gnormplus-ae + Wrapper for the JULIE Lab variant of the GNormPlus gene ID mapper. + + + BSD 2-Clause + https://opensource.org/licenses/BSD-2-Clause + + + diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusAnnotator.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusAnnotator.java new file mode 100644 index 000000000..10d01c157 --- /dev/null +++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusAnnotator.java @@ -0,0 +1,123 @@ +package de.julielab.jcore.ae.gnp; + +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.BioCDocument; +import de.julielab.jcore.consumer.gnp.BioCDocumentPopulator; +import de.julielab.jcore.reader.BioCCasPopulator; +import de.julielab.jcore.types.Gene; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Optional; + +@ResourceMetaData(name = "JCoRe GNormPlus Annotator", description = "Wrapper for the JULIE Lab variant of the GNormPlus gene ID mapper.", vendor = "JULIE Lab Jena, Germany") +@TypeCapability(inputs = {}, outputs = {"de.julielab.jcore.types.ConceptMention", "de.julielab.jcore.types.Organism"}) +public class GNormPlusAnnotator extends JCasAnnotator_ImplBase { + + public static final String PARAM_ADD_GENES = "AddGenes"; + public static final String 
DESC_GENE_TYPE_NAME = "The UIMA type denoting gene annotations that should be written into the BioC format when the " + PARAM_ADD_GENES + " parameter is set to true."; + public static final String PARAM_GENE_TYPE_NAME = "GeneTypeName"; + public static final String DESC_ADD_GENES = "If set to true, all Gene annotations in the CAS will be added to the BioC documents. The default type used is de.julielab.jcore.types.Gene. This can be changed with the " + PARAM_GENE_TYPE_NAME + " parameter."; + public static final String PARAM_GNP_SETUP_FILE = "GNormPlusSetupFile"; + public static final String PARAM_FOCUS_SPECIES = "FocusSpecies"; + public static final String PARAM_OUTPUT_DIR = "OutputDirectory"; + public static final String DESC_GNP_SETUP_FILE = "File path or class path resource path to the setup.txt file for GNormPlus. If not specified, a default setup file is loaded that expects the Dictionary/ directory directly under the working directory, performs gene recognition with the CRF and thus expects the GNormPlus CRF directory directly under the working directory and maps the found genes to NCBI gene IDs for all organisms."; + public static final String DESC_FOCUS_SPECIES = "If given, all gene mentions are assigned to this NCBI taxonomy ID, i.e. species recognition is omitted."; + public static final String DESC_OUTPUT_DIR = "Optional. If specified, the GNormPlus output files in BioC format will be saved to the given directory. 
In this way, this component can be used directly as a BioC XML writer through the GNormPlus algorithm."; + private final static Logger log = LoggerFactory.getLogger(GNormPlusAnnotator.class); + @ConfigurationParameter(name = PARAM_ADD_GENES, mandatory = false, defaultValue = "false", description = DESC_ADD_GENES) + private boolean addGenes; + @ConfigurationParameter(name = PARAM_GENE_TYPE_NAME, mandatory = false, defaultValue = "de.julielab.jcore.types.Gene", description = DESC_GENE_TYPE_NAME) + private String geneTypeName; + @ConfigurationParameter(name = PARAM_GNP_SETUP_FILE, mandatory = false, description = DESC_GNP_SETUP_FILE) + private String setupFile; + @ConfigurationParameter(name = PARAM_FOCUS_SPECIES, mandatory = false, description = DESC_FOCUS_SPECIES) + private String focusSpecies; + @ConfigurationParameter(name = PARAM_OUTPUT_DIR, mandatory = false, description = DESC_OUTPUT_DIR) + private String outputDirectory; + + private BioCDocumentPopulator bioCDocumentPopulator; + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. 
+ */ + @Override + public void initialize(final UimaContext aContext) throws ResourceInitializationException { + addGenes = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_GENES)).orElse(false); + geneTypeName = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_GENE_TYPE_NAME)).orElse(Gene.class.getCanonicalName()); + setupFile = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_GNP_SETUP_FILE)).orElse("/de/julielab/jcore/ae/gnp/config/setup_do_ner.txt"); + focusSpecies = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_FOCUS_SPECIES)).orElse(""); + outputDirectory = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_OUTPUT_DIR)).orElse(""); + + try { + GNormPlusProcessing.initializeGNormPlus(setupFile, focusSpecies); + } catch (IOException e) { + log.error("Could not find resource {}", setupFile); + throw new ResourceInitializationException(e); + } + try { + bioCDocumentPopulator = new BioCDocumentPopulator(addGenes, geneTypeName); + } catch (ClassNotFoundException e) { + log.error("Gene annotation class {} could not be found.", geneTypeName, e); + throw new ResourceInitializationException(e); + } + + try { + if (!outputDirectory.isBlank()) + Files.createDirectories(Path.of(outputDirectory)); + } catch (IOException e) { + log.error("Could not create the output directory {}", outputDirectory); + throw new ResourceInitializationException(e); + } + } + + /** + * This method is called for each document going through the component. This + * is where the actual work happens. 
+ */ + @Override + public void process(final JCas aJCas) throws AnalysisEngineProcessException { + final BioCDocument bioCDocument = bioCDocumentPopulator.populate(aJCas); + BioCCollection bioCCollection = GNormPlusProcessing.createEmptyJulieLabBioCCollection(); + bioCCollection.addDocument(bioCDocument); + String outputDirectory = this.outputDirectory; + final Path outputFilePath = GNormPlusProcessing.processWithGNormPlus(bioCCollection, outputDirectory); + + try { + final BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(outputFilePath); + bioCCasPopulator.populateWithNextDocument(aJCas, true); + } catch (XMLStreamException | IOException e) { + log.error("Could not read GNormPlus output file {}"); + throw new AnalysisEngineProcessException(e); + } +// try { +// Files.delete(filePath); +// } catch (IOException e) { +// log.error("Could not delete temporary file {}", filePath); +// throw new AnalysisEngineProcessException(e); +// } + try { + if (outputDirectory.isBlank() && Files.exists(outputFilePath)) + Files.delete(outputFilePath); + } catch (IOException e) { + log.error("Could not delete temporary file {}", outputFilePath); + throw new AnalysisEngineProcessException(e); + } + + } + + +} diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java new file mode 100644 index 000000000..2c131183b --- /dev/null +++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java @@ -0,0 +1,71 @@ +package de.julielab.jcore.ae.gnp; + +import GNormPluslib.GNormPlus; +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.io.BioCCollectionWriter; +import de.julielab.java.utilities.FileUtilities; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import 
java.io.InputStream; +import java.nio.file.FileAlreadyExistsException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Date; + +public class GNormPlusProcessing { + private final static Logger log = LoggerFactory.getLogger(GNormPlusProcessing.class); + + public static synchronized void initializeGNormPlus(String setupFileResourcePath, String focusSpecies) throws IOException { + if (!GNormPlus.initialized) { + final InputStream setupFileStream = FileUtilities.findResource(setupFileResourcePath); + if (setupFileStream == null) + throw new IOException("Could not find resource as file or classpath resource " + setupFileResourcePath); + GNormPlus.loadConfiguration(setupFileStream, focusSpecies); + GNormPlus.loadResources(focusSpecies, System.currentTimeMillis()); + } + } + + public static BioCCollection createEmptyJulieLabBioCCollection() { + final BioCCollection bioCCollection = new BioCCollection(); + bioCCollection.setDate(new Date().toString()); + bioCCollection.setEncoding("UTF-8"); + bioCCollection.setKey("BioC.key"); + bioCCollection.setSource("JULIE Lab GNormPlus"); + return bioCCollection; + } + + /** + * @param bioCCollection + * @param outputDirectory + * @return The path of the GNormPlus output file. + * @throws AnalysisEngineProcessException + */ + public static Path processWithGNormPlus(BioCCollection bioCCollection, String outputDirectory) throws AnalysisEngineProcessException { + String collectionId = "collection_including_" + bioCCollection.getDocument(0).getID(); + final Path filePath = Path.of("jcore-gnp-tmp", collectionId + ".xml"); + final Path outputFilePath = Path.of(outputDirectory.isBlank() ? "tmp" : outputDirectory, collectionId + "processed.xml"); + try { + try { + if (!Files.exists(filePath.getParent())) + Files.createDirectory(filePath.getParent()); + } catch (FileAlreadyExistsException e) { + // OK, so another process created it, not a big deal. 
+ } + if (!Files.exists(outputFilePath.getParent())) + Files.createDirectories(outputFilePath.getParent()); + try (BioCCollectionWriter w = new BioCCollectionWriter(filePath)) { + w.writeCollection(bioCCollection); + } + GNormPlus.processFile(filePath.toString(), filePath.getFileName().toString(), outputFilePath.toString(), System.currentTimeMillis(), "Test"); + Files.delete(filePath); + } catch (IOException | XMLStreamException e) { + log.error("Could not process document {}", collectionId); + throw new AnalysisEngineProcessException(e); + } + return outputFilePath; + } +} diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusBioCMultiplier.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusBioCMultiplier.java new file mode 100644 index 000000000..5e8eebab4 --- /dev/null +++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusBioCMultiplier.java @@ -0,0 +1,93 @@ +package de.julielab.jcore.multiplier.gnp; + +import de.julielab.jcore.ae.gnp.GNormPlusAnnotator; +import de.julielab.jcore.consumer.gnp.BioCDocumentPopulator; +import de.julielab.jcore.reader.GNormPlusFormatMultiplier; +import de.julielab.jcore.types.Gene; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Optional; + +import static de.julielab.jcore.ae.gnp.GNormPlusAnnotator.DESC_FOCUS_SPECIES; + +@ResourceMetaData(name = "JCoRe GNormPlus BioC Multiplier", description = "A CAS multiplier to be used with the GNormPlus BioC Format multiplier 
reader. It wraps the JULIE Lab variant of the GNormPlus gene ID mapper. It is a multiplier because this enables batch-processing of documents with GNormPlus which makes the processing more efficient.", vendor = "JULIE Lab Jena, Germany") +@TypeCapability(inputs = {}, outputs = {"de.julielab.jcore.types.ConceptMention", "de.julielab.jcore.types.Organism"}) +public class GNormPlusBioCMultiplier extends GNormPlusFormatMultiplier { + public static final String PARAM_ADD_GENES = GNormPlusAnnotator.PARAM_ADD_GENES; + public static final String PARAM_GENE_TYPE_NAME = GNormPlusAnnotator.PARAM_GENE_TYPE_NAME; + public static final String PARAM_OUTPUT_DIR = GNormPlusAnnotator.PARAM_OUTPUT_DIR; + public static final String PARAM_GNP_SETUP_FILE = GNormPlusAnnotator.PARAM_GNP_SETUP_FILE; + public static final String PARAM_FOCUS_SPECIES = GNormPlusAnnotator.PARAM_FOCUS_SPECIES; + private final static Logger log = LoggerFactory.getLogger(GNormPlusBioCMultiplier.class); + @ConfigurationParameter(name = PARAM_ADD_GENES, mandatory = false, defaultValue = "false", description = GNormPlusAnnotator.DESC_ADD_GENES) + private boolean addGenes; + @ConfigurationParameter(name = PARAM_GNP_SETUP_FILE, mandatory = false, description = GNormPlusAnnotator.DESC_GNP_SETUP_FILE) + private String setupFile; + @ConfigurationParameter(name = PARAM_GENE_TYPE_NAME, mandatory = false, defaultValue = "de.julielab.jcore.types.Gene", description = GNormPlusAnnotator.DESC_GENE_TYPE_NAME) + private String geneTypeName; + @ConfigurationParameter(name = PARAM_OUTPUT_DIR, mandatory = false, description = GNormPlusAnnotator.DESC_OUTPUT_DIR) + private String outputDirectory; + @ConfigurationParameter(name = PARAM_FOCUS_SPECIES, mandatory = false, description = DESC_FOCUS_SPECIES) + private String focusSpecies; + + private BioCDocumentPopulator bioCDocumentPopulator; + + private GNormPlusMultiplierLogic multiplierLogic; + + @Override + public void initialize(UimaContext aContext) throws 
ResourceInitializationException { + super.initialize(aContext); + addGenes = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_GENES)).orElse(false); + geneTypeName = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_GENE_TYPE_NAME)).orElse(Gene.class.getCanonicalName()); + try { + bioCDocumentPopulator = new BioCDocumentPopulator(addGenes, geneTypeName); + } catch (ClassNotFoundException e) { + log.error("Gene annotation class {} could not be found.", geneTypeName, e); + throw new ResourceInitializationException(e); + } catch (Throwable t) { + log.error("Could not create BioCDocumentPopulator instance", t); + throw new ResourceInitializationException(t); + } + try { + multiplierLogic = new GNormPlusMultiplierLogic(aContext, bioCDocumentPopulator, () -> { + try { + return super.hasNext(); + } catch (AnalysisEngineProcessException e) { + log.error("Error when calling hasNext() of the base multiplier"); + throw new RuntimeException(e); + } + }, () -> { + try { + return (JCas) super.next(); + } catch (AnalysisEngineProcessException e) { + log.error("Error when calling next() of the base multiplier."); + throw new RuntimeException(e); + } + }, () -> getEmptyJCas(), + false); + } catch (IOException e) { + log.error("Could not initialize GNormPlus", e); + throw new ResourceInitializationException(e); + } + } + + @Override + public boolean hasNext() { + return multiplierLogic.hasNext(); + } + + @Override + public AbstractCas next() throws AnalysisEngineProcessException { + return multiplierLogic.next(); + } +} diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java new file mode 100644 index 000000000..39c59e7be --- /dev/null +++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java @@ -0,0 +1,153 @@ +package 
de.julielab.jcore.multiplier.gnp; + +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.BioCDocument; +import de.julielab.jcore.ae.gnp.GNormPlusProcessing; +import de.julielab.jcore.consumer.gnp.BioCDocumentPopulator; +import de.julielab.jcore.reader.BioCCasPopulator; +import de.julielab.jcore.types.ext.DBProcessingMetaData; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.cas.impl.XmiCasDeserializer; +import org.apache.uima.cas.impl.XmiCasSerializer; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; + +import javax.xml.stream.XMLStreamException; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.function.Supplier; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +import static de.julielab.jcore.ae.gnp.GNormPlusAnnotator.*; + +public class GNormPlusMultiplierLogic { + private final static Logger log = LoggerFactory.getLogger(GNormPlusMultiplierLogic.class); + private BioCDocumentPopulator bioCDocumentPopulator; + private BioCCasPopulator bioCCasPopulator; + private String outputDirectory; + private Supplier baseMultiplierHasNext; + private Supplier baseMultiplierNext; + private Supplier multiplierGetEmptyCas; + private int currentCollectionIndex; + private int currentBiocResultCollectionIndex; + private List cachedCasData; + private boolean skipUnchangedDocuments; + + public GNormPlusMultiplierLogic(UimaContext aContext, BioCDocumentPopulator bioCDocumentPopulator, Supplier 
baseMultiplierHasNext, Supplier baseMultiplierNext, Supplier multiplierGetEmptyCas, boolean skipUnchangedDocuments) throws IOException { + this.skipUnchangedDocuments = skipUnchangedDocuments; + String setupFile = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_GNP_SETUP_FILE)).orElse("/de/julielab/jcore/ae/gnp/config/setup_do_ner.txt"); + String focusSpecies = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_FOCUS_SPECIES)).orElse(""); + outputDirectory = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_OUTPUT_DIR)).orElse(""); + this.bioCDocumentPopulator = bioCDocumentPopulator; + this.baseMultiplierHasNext = baseMultiplierHasNext; + this.baseMultiplierNext = baseMultiplierNext; + this.multiplierGetEmptyCas = multiplierGetEmptyCas; + cachedCasData = new ArrayList<>(); + currentCollectionIndex = 0; + currentBiocResultCollectionIndex = 0; + + GNormPlusProcessing.initializeGNormPlus(setupFile, focusSpecies); + } + + public AbstractCas next() throws AnalysisEngineProcessException { + try { + // Process the incoming documents batch-wise (this is why we use a multiplier here so we have access + // to whole batches). This checks if we still have processed documents or if we need to process the next + // batch. + if (bioCCasPopulator == null || bioCCasPopulator.documentsLeftInCollection() == 0) { + currentCollectionIndex = 0; + currentBiocResultCollectionIndex = 0; + final BioCCollection gnormPlusInputCollection = GNormPlusProcessing.createEmptyJulieLabBioCCollection(); + // We first retrieve the whole current batch from the super multiplier and serialize the CASes + // to XMI. We do that because we only have one CAS at a time and, thus, must store the data + // of the whole batch. We can then later deserialize the documents and add the GNP annotations to it. 
+ // This allows batch-processing within GNP which reduces file writes and reads (GNP internally + // writes a lot of temporary files that contain all the documents given to it in one single batch file). + cachedCasData.clear(); + while (baseMultiplierHasNext.get()) { + final JCas jCas = baseMultiplierNext.get(); + final boolean isDocumentHashUnchanged = JCasUtil.selectSingle(jCas, DBProcessingMetaData.class).getIsDocumentHashUnchanged(); + // skip document if it is unchanged and skipping is enabled + if (!(isDocumentHashUnchanged && skipUnchangedDocuments)) { + final BioCDocument bioCDocument = bioCDocumentPopulator.populate(jCas); + gnormPlusInputCollection.addDocument(bioCDocument); + } else { + log.trace("Document with ID {} already exists in the XMI database table with unchanged text contents, skipping GNormPlus processing.", JCoReTools.getDocId(jCas)); + } + try (final ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + try (final GZIPOutputStream os = new GZIPOutputStream(baos)) { + XmiCasSerializer.serialize(jCas.getCas(), os); + } + cachedCasData.add(baos.toByteArray()); + jCas.release(); + } catch (IOException | SAXException e) { + log.error("Error when serializing CAS data for caching purposes."); + throw new AnalysisEngineProcessException(e); + } + } + // now process the whole batch with GNP + if (gnormPlusInputCollection.getDocmentCount() > 0) { + log.trace("Processing {} documents with GNormPlus.", gnormPlusInputCollection.getDocmentCount()); + final Path outputFilePath = GNormPlusProcessing.processWithGNormPlus(gnormPlusInputCollection, outputDirectory); + try { + bioCCasPopulator = new BioCCasPopulator(outputFilePath); + // delete the GNP output if we don't want to keep it + if (outputDirectory.isBlank()) { + Files.delete(outputFilePath); + } + } catch (XMLStreamException | IOException e) { + log.error("Could not read GNormPlus output from {}", outputFilePath); + throw new AnalysisEngineProcessException(e); + } + } + } + // Now we have a 
batch of documents processed with GNP. Get the next document from the cache and + // add the GNP annotations to it. + byte[] currentCasData = cachedCasData.get(currentCollectionIndex); + final JCas jCas = multiplierGetEmptyCas.get(); + try (InputStream is = new GZIPInputStream(new ByteArrayInputStream(currentCasData))) { + XmiCasDeserializer.deserialize(is, jCas.getCas()); + } catch (SAXException | IOException e) { + log.error("Could not deserialize cached CAS data"); + throw new AnalysisEngineProcessException(e); + } + final boolean isDocumentHashUnchanged = JCasUtil.selectSingle(jCas, DBProcessingMetaData.class).getIsDocumentHashUnchanged(); + // If the document is unchanged and we skip unchanged documents, we do not have a GNormPlus result for this + // document, skip. + if (!(isDocumentHashUnchanged && skipUnchangedDocuments)) { + bioCCasPopulator.populateWithNextDocument(jCas, true); + bioCCasPopulator.clearDocument(currentBiocResultCollectionIndex++); + } + cachedCasData.set(currentCollectionIndex, null); + ++currentCollectionIndex; + + return jCas; + } catch (AnalysisEngineProcessException e) { + log.error("Error while retrieving or processing data for/with GNormPlus", e); + throw e; + } + } + + public boolean hasNext() { + try { + return bioCCasPopulator != null && bioCCasPopulator.documentsLeftInCollection() > 0 || baseMultiplierHasNext.get(); + } catch (Throwable t) { + log.error("Could not determine hasNext()", t); + throw t; + } + } +} diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusPMCDBMultiplier.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusPMCDBMultiplier.java new file mode 100644 index 000000000..85a16c211 --- /dev/null +++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusPMCDBMultiplier.java @@ -0,0 +1,96 @@ +package de.julielab.jcore.multiplier.gnp; + +import de.julielab.jcore.ae.gnp.GNormPlusAnnotator; +import 
de.julielab.jcore.consumer.gnp.BioCDocumentPopulator; +import de.julielab.jcore.multiplier.pmc.PMCDBMultiplier; +import de.julielab.jcore.types.Gene; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Optional; + +import static de.julielab.jcore.ae.gnp.GNormPlusAnnotator.DESC_FOCUS_SPECIES; + +@ResourceMetaData(name = "JCoRe GNormPlus PMC Database Multiplier", description = "A CAS multiplier to be used with the DB PMC multiplier reader in place of the DB PMC multiplier. It wraps the JULIE Lab variant of the GNormPlus gene ID mapper. 
It is a multiplier because this enables batch-processing of documents with GNormPlus which makes the processing more efficient.", vendor = "JULIE Lab Jena, Germany") +@TypeCapability(inputs = {}, outputs = {"de.julielab.jcore.types.ConceptMention", "de.julielab.jcore.types.Organism"}) +public class GNormPlusPMCDBMultiplier extends PMCDBMultiplier { + public static final String PARAM_ADD_GENES = GNormPlusAnnotator.PARAM_ADD_GENES; + public static final String PARAM_GENE_TYPE_NAME = GNormPlusAnnotator.PARAM_GENE_TYPE_NAME; + public static final String PARAM_OUTPUT_DIR = GNormPlusAnnotator.PARAM_OUTPUT_DIR; + public static final String PARAM_GNP_SETUP_FILE = GNormPlusAnnotator.PARAM_GNP_SETUP_FILE; + public static final String PARAM_FOCUS_SPECIES = GNormPlusAnnotator.PARAM_FOCUS_SPECIES; + public static final String PARAM_SKIP_UNCHANGED_DOCUMENTS = "SkipUnchangedDocuments"; + private final static Logger log = LoggerFactory.getLogger(GNormPlusPMCDBMultiplier.class); + private static boolean shutdownHookInstalled = false; + @ConfigurationParameter(name = PARAM_ADD_GENES, mandatory = false, defaultValue = "false", description = GNormPlusAnnotator.DESC_ADD_GENES) + private boolean addGenes; + @ConfigurationParameter(name = PARAM_GNP_SETUP_FILE, mandatory = false, description = GNormPlusAnnotator.DESC_GNP_SETUP_FILE) + private String setupFile; + @ConfigurationParameter(name = PARAM_GENE_TYPE_NAME, mandatory = false, defaultValue = "de.julielab.jcore.types.Gene", description = GNormPlusAnnotator.DESC_GENE_TYPE_NAME) + private String geneTypeName; + @ConfigurationParameter(name = PARAM_OUTPUT_DIR, mandatory = false, description = GNormPlusAnnotator.DESC_OUTPUT_DIR) + private String outputDirectory; + @ConfigurationParameter(name = PARAM_FOCUS_SPECIES, mandatory = false, description = DESC_FOCUS_SPECIES) + private String focusSpecies; + @ConfigurationParameter(name = PARAM_SKIP_UNCHANGED_DOCUMENTS, mandatory = false, description = "Whether to omit GNormPlus processing on 
documents that already exist in the XMI database table and whose document text has not changed.") + private boolean skipUnchangedDocuments; + private BioCDocumentPopulator bioCDocumentPopulator; + private GNormPlusMultiplierLogic multiplierLogic; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + addGenes = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_GENES)).orElse(false); + geneTypeName = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_GENE_TYPE_NAME)).orElse(Gene.class.getCanonicalName()); + skipUnchangedDocuments = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_SKIP_UNCHANGED_DOCUMENTS)).orElse(false); + try { + bioCDocumentPopulator = new BioCDocumentPopulator(addGenes, geneTypeName); + } catch (ClassNotFoundException e) { + log.error("Gene annotation class {} could not be found.", geneTypeName, e); + throw new ResourceInitializationException(e); + } + try { + multiplierLogic = new GNormPlusMultiplierLogic(aContext, bioCDocumentPopulator, () -> super.hasNext(), () -> { + try { + return (JCas) super.next(); + } catch (AnalysisEngineProcessException e) { + log.error("Error when calling next() of the base multiplier."); + throw new RuntimeException(e); + } + }, () -> getEmptyJCas(), + skipUnchangedDocuments); + } catch (IOException e) { + log.error("Could not initialize GNormPlus", e); + throw new ResourceInitializationException(e); + } + } + + @Override + public boolean hasNext() { + try { + return multiplierLogic.hasNext(); + } catch (Throwable t) { + log.error("Error when checking hasNext() on multiplier", t); + } + return false; + } + + @Override + public AbstractCas next() throws AnalysisEngineProcessException { + try { + return multiplierLogic.next(); + } catch (Throwable t) { + log.error("Error when retrieving next multiplier CAS", t); + throw new AnalysisEngineProcessException(t); + } + } +} diff --git 
a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusXMLDBMultiplier.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusXMLDBMultiplier.java new file mode 100644 index 000000000..9d58264b1 --- /dev/null +++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusXMLDBMultiplier.java @@ -0,0 +1,96 @@ +package de.julielab.jcore.multiplier.gnp; + +import de.julielab.jcore.ae.gnp.GNormPlusAnnotator; +import de.julielab.jcore.consumer.gnp.BioCDocumentPopulator; +import de.julielab.jcore.reader.xml.XMLDBMultiplier; +import de.julielab.jcore.types.Gene; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Optional; + +import static de.julielab.jcore.ae.gnp.GNormPlusAnnotator.DESC_FOCUS_SPECIES; + +@ResourceMetaData(name = "JCoRe GNormPlus XML Database Multiplier", description = "A CAS multiplier to be used with the DB XML multiplier reader in place of the DB XML multiplier. It wraps the JULIE Lab variant of the GNormPlus gene ID mapper. 
It is a multiplier because this enables batch-processing of documents with GNormPlus which makes the processing more efficient.", vendor = "JULIE Lab Jena, Germany") +@TypeCapability(inputs = {}, outputs = {"de.julielab.jcore.types.ConceptMention", "de.julielab.jcore.types.Organism"}) +public class GNormPlusXMLDBMultiplier extends XMLDBMultiplier { + public static final String PARAM_ADD_GENES = GNormPlusAnnotator.PARAM_ADD_GENES; + public static final String PARAM_GENE_TYPE_NAME = GNormPlusAnnotator.PARAM_GENE_TYPE_NAME; + public static final String PARAM_OUTPUT_DIR = GNormPlusAnnotator.PARAM_OUTPUT_DIR; + public static final String PARAM_GNP_SETUP_FILE = GNormPlusAnnotator.PARAM_GNP_SETUP_FILE; + public static final String PARAM_FOCUS_SPECIES = GNormPlusAnnotator.PARAM_FOCUS_SPECIES; + public static final String PARAM_SKIP_UNCHANGED_DOCUMENTS = "SkipUnchangedDocuments"; + private final static Logger log = LoggerFactory.getLogger(GNormPlusXMLDBMultiplier.class); + private static boolean shutdownHookInstalled = false; + @ConfigurationParameter(name = PARAM_ADD_GENES, mandatory = false, defaultValue = "false", description = GNormPlusAnnotator.DESC_ADD_GENES) + private boolean addGenes; + @ConfigurationParameter(name = PARAM_GNP_SETUP_FILE, mandatory = false, description = GNormPlusAnnotator.DESC_GNP_SETUP_FILE) + private String setupFile; + @ConfigurationParameter(name = PARAM_GENE_TYPE_NAME, mandatory = false, defaultValue = "de.julielab.jcore.types.Gene", description = GNormPlusAnnotator.DESC_GENE_TYPE_NAME) + private String geneTypeName; + @ConfigurationParameter(name = PARAM_OUTPUT_DIR, mandatory = false, description = GNormPlusAnnotator.DESC_OUTPUT_DIR) + private String outputDirectory; + @ConfigurationParameter(name = PARAM_FOCUS_SPECIES, mandatory = false, description = DESC_FOCUS_SPECIES) + private String focusSpecies; + @ConfigurationParameter(name = PARAM_SKIP_UNCHANGED_DOCUMENTS, mandatory = false, description = "Whether to omit GNormPlus processing on 
documents that already exist in the XMI database table and whose document text has not changed.") + private boolean skipUnchangedDocuments; + private BioCDocumentPopulator bioCDocumentPopulator; + private GNormPlusMultiplierLogic multiplierLogic; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + addGenes = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_GENES)).orElse(false); + geneTypeName = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_GENE_TYPE_NAME)).orElse(Gene.class.getCanonicalName()); + skipUnchangedDocuments = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_SKIP_UNCHANGED_DOCUMENTS)).orElse(false); + try { + bioCDocumentPopulator = new BioCDocumentPopulator(addGenes, geneTypeName); + } catch (ClassNotFoundException e) { + log.error("Gene annotation class {} could not be found.", geneTypeName, e); + throw new ResourceInitializationException(e); + } + try { + multiplierLogic = new GNormPlusMultiplierLogic(aContext, bioCDocumentPopulator, () -> super.hasNext(), () -> { + try { + return (JCas) super.next(); + } catch (AnalysisEngineProcessException e) { + log.error("Error when calling next() of the base multiplier."); + throw new RuntimeException(e); + } + }, () -> getEmptyJCas(), + skipUnchangedDocuments); + } catch (IOException e) { + log.error("Could not initialize GNormPlus", e); + throw new ResourceInitializationException(e); + } + } + + @Override + public boolean hasNext() { + try { + return multiplierLogic.hasNext(); + } catch (Throwable t) { + log.error("Error when checking hasNext() on multiplier", t); + } + return false; + } + + @Override + public AbstractCas next() throws AnalysisEngineProcessException { + try { + return multiplierLogic.next(); + } catch (Throwable t) { + log.error("Error when retrieving next multiplier CAS", t); + throw new AnalysisEngineProcessException(t); + } + } +} diff --git 
a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusXmiDBMultiplier.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusXmiDBMultiplier.java new file mode 100644 index 000000000..e90905e83 --- /dev/null +++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusXmiDBMultiplier.java @@ -0,0 +1,92 @@ +package de.julielab.jcore.multiplier.gnp; + +import de.julielab.jcore.ae.gnp.GNormPlusAnnotator; +import de.julielab.jcore.consumer.gnp.BioCDocumentPopulator; +import de.julielab.jcore.reader.xmi.XmiDBMultiplier; +import de.julielab.jcore.types.Gene; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Optional; + +import static de.julielab.jcore.ae.gnp.GNormPlusAnnotator.DESC_FOCUS_SPECIES; + +@ResourceMetaData(name = "JCoRe GNormPlus XMI Database Multiplier", description = "A CAS multiplier to be used with the DB XMI multiplier reader in place of the DB XMI multiplier. It wraps the JULIE Lab variant of the GNormPlus gene ID mapper. 
It is a multiplier because this enables batch-processing of documents with GNormPlus which makes the processing more efficient.", vendor = "JULIE Lab Jena, Germany") +@TypeCapability(inputs = {}, outputs = {"de.julielab.jcore.types.ConceptMention", "de.julielab.jcore.types.Organism"}) +public class GNormPlusXmiDBMultiplier extends XmiDBMultiplier { + public static final String PARAM_ADD_GENES = GNormPlusAnnotator.PARAM_ADD_GENES; + public static final String PARAM_GENE_TYPE_NAME = GNormPlusAnnotator.PARAM_GENE_TYPE_NAME; + public static final String PARAM_OUTPUT_DIR = GNormPlusAnnotator.PARAM_OUTPUT_DIR; + public static final String PARAM_GNP_SETUP_FILE = GNormPlusAnnotator.PARAM_GNP_SETUP_FILE; + public static final String PARAM_FOCUS_SPECIES = GNormPlusAnnotator.PARAM_FOCUS_SPECIES; + private final static Logger log = LoggerFactory.getLogger(GNormPlusXmiDBMultiplier.class); + private static boolean shutdownHookInstalled = false; + @ConfigurationParameter(name = PARAM_ADD_GENES, mandatory = false, defaultValue = "false", description = GNormPlusAnnotator.DESC_ADD_GENES) + private boolean addGenes; + @ConfigurationParameter(name = PARAM_GNP_SETUP_FILE, mandatory = false, description = GNormPlusAnnotator.DESC_GNP_SETUP_FILE) + private String setupFile; + @ConfigurationParameter(name = PARAM_GENE_TYPE_NAME, mandatory = false, defaultValue = "de.julielab.jcore.types.Gene", description = GNormPlusAnnotator.DESC_GENE_TYPE_NAME) + private String geneTypeName; + @ConfigurationParameter(name = PARAM_OUTPUT_DIR, mandatory = false, description = GNormPlusAnnotator.DESC_OUTPUT_DIR) + private String outputDirectory; + @ConfigurationParameter(name = PARAM_FOCUS_SPECIES, mandatory = false, description = DESC_FOCUS_SPECIES) + private String focusSpecies; + private BioCDocumentPopulator bioCDocumentPopulator; + private GNormPlusMultiplierLogic multiplierLogic; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + 
super.initialize(aContext); + addGenes = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_GENES)).orElse(false); + geneTypeName = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_GENE_TYPE_NAME)).orElse(Gene.class.getCanonicalName()); + try { + bioCDocumentPopulator = new BioCDocumentPopulator(addGenes, geneTypeName); + } catch (ClassNotFoundException e) { + log.error("Gene annotation class {} could not be found.", geneTypeName, e); + throw new ResourceInitializationException(e); + } + try { + multiplierLogic = new GNormPlusMultiplierLogic(aContext, bioCDocumentPopulator, () -> super.hasNext(), () -> { + try { + return (JCas) super.next(); + } catch (AnalysisEngineProcessException e) { + log.error("Error when calling next() of the base multiplier."); + throw new RuntimeException(e); + } + }, () -> getEmptyJCas(), + false); + } catch (IOException e) { + log.error("Could not initialize GNormPlus", e); + throw new ResourceInitializationException(e); + } + } + + @Override + public boolean hasNext() { + try { + return multiplierLogic.hasNext(); + } catch (Throwable t) { + log.error("Error when checking hasNext() on multiplier", t); + } + return false; + } + + @Override + public AbstractCas next() throws AnalysisEngineProcessException { + try { + return multiplierLogic.next(); + } catch (Throwable t) { + log.error("Error when retrieving next multiplier CAS", t); + throw new AnalysisEngineProcessException(t); + } + } +} diff --git a/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/ae/gnp/config/setup_do_ner.txt b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/ae/gnp/config/setup_do_ner.txt new file mode 100644 index 000000000..73009c799 --- /dev/null +++ b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/ae/gnp/config/setup_do_ner.txt @@ -0,0 +1,32 @@ +#===Annotation +#Attribution setting: +#FocusSpecies = Taxonomy ID +# All: All species +# 9606: Human +# 4932: yeast +# 7227: Fly +# 10090: Mouse +# 10116: 
Rat +# 7955: Zebrafish +# 3702: Arabidopsis thaliana +#open: True +#close: False + +[Focus Species] + FocusSpecies = All + FilterAntibody = True +[Dictionary & Model] + DictionaryFolder = Dictionary + GNRModel = Dictionary/GNR.Model + SCModel = Dictionary/SimConcept.Model +[Modules] + SpeciesRecognition = True + GeneRecognition = True + SpeciesAssignment = True + GeneNormalization = True +[Others] + GeneIDMatch = False + HomologeneID = False + Normalization2Protein = False + ShowUnNormalizedMention = False + DeleteTmp = True \ No newline at end of file diff --git a/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/ae/gnp/desc/jcore-gnormplus-ae.xml b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/ae/gnp/desc/jcore-gnormplus-ae.xml new file mode 100644 index 000000000..ae82b1037 --- /dev/null +++ b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/ae/gnp/desc/jcore-gnormplus-ae.xml @@ -0,0 +1,86 @@ + + + org.apache.uima.java + true + de.julielab.jcore.ae.gnp.GNormPlusAnnotator + + JCoRe GNormPlus Annotator + Wrapper for the JULIE Lab variant of the GNormPlus gene ID mapper. + 2.6.0 + JULIE Lab Jena, Germany + + + AddGenes + If set to true, all Gene annotations in the CAS will be added to the BioC documents. The default type used is de.julielab.jcore.types.Gene. This can be changed with the GeneTypeName parameter. + Boolean + false + false + + + GeneTypeName + The UIMA type denoting gene annotations that should be written into the BioC format when the AddGenes parameter is set to true. + String + false + false + + + GNormPlusSetupFile + File path or class path resource path to the setup.txt file for GNormPlus. If not specified, a default setup file is loaded that expects the Dictionary/ directory directly under the working directory, performs gene recognition with the CRF and thus expects the GNormPlus CRF directory directly under the working directory and maps the found genes to NCBI gene IDs for all organisms. 
+ String + false + false + + + FocusSpecies + If given, all gene mentions are assigned to this NCBI taxonomy ID, i.e. species recognition is omitted. + String + false + false + + + OutputDirectory + Optional. If specified, the GNormPlus output files in BioC format will be saved to the given directory. In this way, this component can be used directly as a BioC XML writer through the GNormPlus algorithm. + String + false + false + + + + + AddGenes + + false + + + + GeneTypeName + + de.julielab.jcore.types.Gene + + + + + + + + + + + + + + + + de.julielab.jcore.types.ConceptMention + de.julielab.jcore.types.Organism + + + + + + true + true + false + + + \ No newline at end of file diff --git a/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-bioc-multiplier.xml b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-bioc-multiplier.xml new file mode 100644 index 000000000..ba7fc2a1c --- /dev/null +++ b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-bioc-multiplier.xml @@ -0,0 +1,100 @@ + + + org.apache.uima.java + true + de.julielab.jcore.multiplier.gnp.GNormPlusBioCMultiplier + + JCoRe GNormPlus BioC Multiplier + A CAS multiplier to be used with the GNormPlus BioC Format multiplier reader. It wraps the JULIE Lab variant of the GNormPlus gene ID mapper. It is a multiplier because this enables batch-processing of documents with GNormPlus which makes the processing more efficient. + 2.6.0 + JULIE Lab Jena, Germany + + + AddGenes + If set to true, all Gene annotations in the CAS will be added to the BioC documents. The default type used is de.julielab.jcore.types.Gene. This can be changed with the GeneTypeName parameter. + Boolean + false + false + + + GNormPlusSetupFile + File path or class path resource path to the setup.txt file for GNormPlus. 
If not specified, a default setup file is loaded that expects the Dictionary/ directory directly under the working directory, performs gene recognition with the CRF and thus expects the GNormPlus CRF directory directly under the working directory and maps the found genes to NCBI gene IDs for all organisms. + String + false + false + + + GeneTypeName + The UIMA type denoting gene annotations that should be written into the BioC format when the AddGenes parameter is set to true. + String + false + false + + + OutputDirectory + Optional. If specified, the GNormPlus output files in BioC format will be saved to the given directory. In this way, this component can be used directly as a BioC XML writer through the GNormPlus algorithm. + String + false + false + + + FocusSpecies + If given, all gene mentions are assigned to this NCBI taxonomy ID, i.e. species recognition is omitted. + String + false + false + + + CostosysConfigFile + Path to the CoStoSys configuration file that is used by the XMI DB writer in the same pipeline, if any. The XMI DB writer requires information about the XMI documents that are already in the database and should be updated with new annotations. The current highest XMI ID must be known to avoid ID collisions. To obtain the ID, it must be received from the database beforehand. This allows to retrieve the information batch wise instead of one-by-one which would be much slower. + String + false + false + + + DocumentsTable + Required to retrieve the max XMI ID for use by the XMI DB writer. The schema-qualified name of the XMI document table that the XMI DB writer will write annotations into. 
+ String + false + false + + + + + AddGenes + + false + + + + GeneTypeName + + de.julielab.jcore.types.Gene + + + + + + + + + + + + + + + + de.julielab.jcore.types.ConceptMention + de.julielab.jcore.types.Organism + + + + + + true + true + true + + + \ No newline at end of file diff --git a/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-pmc-db-multiplier.xml b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-pmc-db-multiplier.xml new file mode 100644 index 000000000..29212813a --- /dev/null +++ b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-pmc-db-multiplier.xml @@ -0,0 +1,143 @@ + + + org.apache.uima.java + true + de.julielab.jcore.multiplier.gnp.GNormPlusPMCDBMultiplier + + JCoRe GNormPlus PMC Database Multiplier + A CAS multiplier to be used with the DB PMC multiplier reader in place of the DB PMC multiplier. It wraps the JULIE Lab variant of the GNormPlus gene ID mapper. It is a multiplier because this enables batch-processing of documents with GNormPlus which makes the processing more efficient. + 2.6.0 + JULIE Lab Jena, Germany + + + AddGenes + If set to true, all Gene annotations in the CAS will be added to the BioC documents. The default type used is de.julielab.jcore.types.Gene. This can be changed with the GeneTypeName parameter. + Boolean + false + false + + + GNormPlusSetupFile + File path or class path resource path to the setup.txt file for GNormPlus. If not specified, a default setup file is loaded that expects the Dictionary/ directory directly under the working directory, performs gene recognition with the CRF and thus expects the GNormPlus CRF directory directly under the working directory and maps the found genes to NCBI gene IDs for all organisms. 
+ String + false + false + + + GeneTypeName + The UIMA type denoting gene annotations that should be written into the BioC format when the AddGenes parameter is set to true. + String + false + false + + + OutputDirectory + Optional. If specified, the GNormPlus output files in BioC format will be saved to the given directory. In this way, this component can be used directly as a BioC XML writer through the GNormPlus algorithm. + String + false + false + + + FocusSpecies + If given, all gene mentions are assigned to this NCBI taxonomy ID, i.e. species recognition is omitted. + String + false + false + + + SkipUnchangedDocuments + Whether to omit GNormPlus processing on documents that already exist in the XMI database table and whose document text has not changed. + Boolean + false + false + + + OmitBibliographyReferences + If set to true, references to the bibliography are omitted from the CAS text. + Boolean + false + false + + + AddShaHash + For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS is directly routed to the components specified in the ToVisitKeys parameter, skipping all other components. Note that this only works with AAEs where the first component is an 'AnnotationControlledFlow'. + String + false + false + + + DocumentTable + For use with AnnotationDefinedFlowController. String parameter indicating the name of the table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. 
The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController. + String + false + false + + + DocumentTableSchema + For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the DocumentTable parameter - adheres to. Only the primary key part is required for hash value retrieval. + String + false + false + + + ToVisitKeys + For use with AnnotationDefinedFlowController. The delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. This is, however, the task of the AnnotationDefinedFlowController. + String + true + false + + + TruncateAtSize + The maximum number of characters allowed in the document text. Characters exceeding this size are discarded. This can be necessary when large documents cannot be handled by subsequent components in the pipeline. Defaults to Integer.MAX_VALUE. + Integer + false + false + + + + + AddGenes + + false + + + + GeneTypeName + + de.julielab.jcore.types.Gene + + + + OmitBibliographyReferences + + false + + + + + + + + + + + + + + + + + + de.julielab.jcore.types.ConceptMention + de.julielab.jcore.types.Organism + + + + + + true + true + true + + + \ No newline at end of file diff --git a/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-xmi-db-multiplier.xml b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-xmi-db-multiplier.xml new file mode 100644 index 000000000..cc3c750ea --- /dev/null +++ b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-xmi-db-multiplier.xml @@ -0,0 +1,108 @@ + + + org.apache.uima.java + true + de.julielab.jcore.multiplier.gnp.GNormPlusXmiDBMultiplier + + JCoRe GNormPlus XMI Database Multiplier + A CAS multiplier to be used with the DB XMI multiplier reader. 
It wraps the JULIE Lab variant of the GNormPlus gene ID mapper. It is a multiplier because this enables batch-processing of documents with GNormPlus which makes the processing more efficient. + 2.6.0 + JULIE Lab Jena, Germany + + + AddGenes + If set to true, all Gene annotations in the CAS will be added to the BioC documents. The default type used is de.julielab.jcore.types.Gene. This can be changed with the GeneTypeName parameter. + Boolean + false + false + + + GNormPlusSetupFile + File path or class path resource path to the setup.txt file for GNormPlus. If not specified, a default setup file is loaded that expects the Dictionary/ directory directly under the working directory, performs gene recognition with the CRF and thus expects the GNormPlus CRF directory directly under the working directory and maps the found genes to NCBI gene IDs for all organisms. + String + false + false + + + GeneTypeName + The UIMA type denoting gene annotations that should be written into the BioC format when the AddGenes parameter is set to true. + String + false + false + + + OutputDirectory + Optional. If specified, the GNormPlus output files in BioC format will be saved to the given directory. In this way, this component can be used directly as a BioC XML writer through the GNormPlus algorithm. + String + false + false + + + FocusSpecies + If given, all gene mentions are assigned to this NCBI taxonomy ID, i.e. species recognition is omitted. + String + false + false + + + LogFinalXmi + For debugging purposes. If set to true, before parsing the final XMI data assembled from the annotation modules, it is printed to console. + Boolean + false + false + + + TruncateAtSize + Specify size in bytes of the XMI sofa string, i.e. the document text. If the text surpasses that size, the document is not populated from XMI but given some placeholder information. This can be necessary when large documents cannot be handled by subsequent components in the pipeline. 
+ Integer + false + false + + + + + AddGenes + + false + + + + GeneTypeName + + de.julielab.jcore.types.Gene + + + + LogFinalXmi + + false + + + + + + + + + + + + + + + + + + de.julielab.jcore.types.ConceptMention + de.julielab.jcore.types.Organism + + + + + + true + true + true + + + \ No newline at end of file diff --git a/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-xml-db-multiplier.xml b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-xml-db-multiplier.xml new file mode 100644 index 000000000..f300ca78c --- /dev/null +++ b/jcore-gnormplus-ae/src/main/resources/de/julielab/jcore/multiplier/gnp/desc/jcore-gnormplus-xml-db-multiplier.xml @@ -0,0 +1,151 @@ + + + org.apache.uima.java + true + de.julielab.jcore.multiplier.gnp.GNormPlusXMLDBMultiplier + + JCoRe GNormPlus XML Database Multiplier + A CAS multiplier to be used with the DB XML multiplier reader in place of the DB XML multiplier. It wraps the JULIE Lab variant of the GNormPlus gene ID mapper. It is a multiplier because this enables batch-processing of documents with GNormPlus which makes the processing more efficient. + 2.6.0 + JULIE Lab Jena, Germany + + + AddGenes + If set to true, all Gene annotations in the CAS will be added to the BioC documents. The default type used is de.julielab.jcore.types.Gene. This can be changed with the GeneTypeName parameter. + Boolean + false + false + + + GNormPlusSetupFile + File path or class path resource path to the setup.txt file for GNormPlus. If not specified, a default setup file is loaded that expects the Dictionary/ directory directly under the working directory, performs gene recognition with the CRF and thus expects the GNormPlus CRF directory directly under the working directory and maps the found genes to NCBI gene IDs for all organisms. 
+ String + false + false + + + GeneTypeName + The UIMA type denoting gene annotations that should be written into the BioC format when the AddGenes parameter is set to true. + String + false + false + + + OutputDirectory + Optional. If specified, the GNormPlus output files in BioC format will be saved to the given directory. In this way, this component can be used directly as a BioC XML writer through the GNormPlus algorithm. + String + false + false + + + FocusSpecies + If given, all gene mentions are assigned to this NCBI taxonomy ID, i.e. species recognition is omitted. + String + false + false + + + SkipUnchangedDocuments + Whether to omit GNormPlus processing on documents that already exist in the XMI database table and whose document text has not changed. + Boolean + false + false + + + RowMapping + In case that the CoStoSys active table schema specified more than two columns to be retrieved, the other columns need a mapping into the CAS.A mapping item has the following form: <column index>=<uima type>#<type feature>:<feature datatype>:defaultValue where the defaultValue is optional. Example: 2=de.julielab.jules.types.max_xmi_id#id:int:0 maps the content of the third (index 2, zero-based) retrieved column (may also belong to an additional table!) to feature "id" of the type "d.j.j.t.max_xmi_id" which is an int. In case there is no value returned from the database for a document, use a 0 as default. + String + true + false + + + MappingFile + An XML mapping file following the specification required by the jcore-xml-mapper. The mapping file specifies how contents from an XML docuent are to be brought into the CAS. + String + false + true + + + AddShaHash + For use with AnnotationDefinedFlowController and XMIDBWriter. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. 
Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS is directly routed to the components specified in the ToVisitKeys parameter, skipping all other components. Note that this only works with AAEs where the first component is an 'AnnotationControlledFlow'. Additionally, the DBProcessingMetaData#hasDocumentHashChanged is set. This can be used by the XMIDBWriter to omit the reset of mirror subsets when updating the base document when the actual CAS text stayed the same. + String + false + false + + + DocumentTable + For use with AnnotationDefinedFlowController. String parameter indicating the name of the table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController. + String + false + false + + + DocumentTableSchema + For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the DocumentTable parameter - adheres to. Only the primary key part is required for hash value retrieval. + String + false + false + + + ToVisitKeys + For use with AnnotationDefinedFlowController. Specifies the delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. The task of the AnnotationDefinedFlowController is then to read those annotations and route the CAS accordingly. + String + true + false + + + AddToVisitKeys + Toggles the creation of annotations for the AnnotationDefinedFlowController. Only needed when such a flow controller is used in the pipeline. For details, see the description of ToVisitKeys. 
+ Boolean + false + false + + + AddUnchangedDocumentTextFlag + Toggles the addition of the 'document text is unchanged' flag. The value of this flag is determined via a SHA256 hash of the CAS document text. When DocumentTable and DocumentTableSchema are specified, the hash value of the document in storage is retrieved and compared to the current value. The flag is then set with respect to the comparison result. + Boolean + false + false + + + + + AddGenes + + false + + + + GeneTypeName + + de.julielab.jcore.types.Gene + + + + + + + + + + + + + + + + + + de.julielab.jcore.types.ConceptMention + de.julielab.jcore.types.Organism + + + + + + true + true + true + + + \ No newline at end of file diff --git a/jcore-gnormplus-ae/src/test/java/de/julielab/jcore/ae/gnp/GNormPlusAnnotatorTest.java b/jcore-gnormplus-ae/src/test/java/de/julielab/jcore/ae/gnp/GNormPlusAnnotatorTest.java new file mode 100644 index 000000000..8b56aafe8 --- /dev/null +++ b/jcore-gnormplus-ae/src/test/java/de/julielab/jcore/ae/gnp/GNormPlusAnnotatorTest.java @@ -0,0 +1,20 @@ + +package de.julielab.jcore.ae.gnp; + +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Unit tests for jcore-gnormplus-ae. + * + */ +public class GNormPlusAnnotatorTest{ + private final static Logger log = LoggerFactory.getLogger(GNormPlusAnnotatorTest.class); + + @Test + public void testAnnotator() { + // TODO + } +} diff --git a/jcore-gnp-bioc-reader/BioC.dtd b/jcore-gnp-bioc-reader/BioC.dtd new file mode 100644 index 000000000..8bd0d55ca --- /dev/null +++ b/jcore-gnp-bioc-reader/BioC.dtd @@ -0,0 +1,158 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/jcore-gnp-bioc-reader/LICENSE b/jcore-gnp-bioc-reader/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-gnp-bioc-reader/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-gnp-bioc-reader/README.md b/jcore-gnp-bioc-reader/README.md new file mode 100644 index 000000000..7947f772a --- /dev/null +++ b/jcore-gnp-bioc-reader/README.md @@ -0,0 +1,34 @@ +# JCoRe GNormPlus BioC Reader + +**Descriptor Path**: +``` +de.julielab.jcore.reader.desc.jcore-bnp-bioc-reader +``` + +A reader for the BioC format used by GNormPlus. Reads the text and the annotations, both species and genes. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. 
Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? diff --git a/jcore-gnp-bioc-reader/component.meta b/jcore-gnp-bioc-reader/component.meta new file mode 100644 index 000000000..de865415a --- /dev/null +++ b/jcore-gnp-bioc-reader/component.meta @@ -0,0 +1,25 @@ +{ + "categories": [ + "multiplier", + "reader" + ], + "description": "A reader for the BioC format used by GNormPlus. Reads the text and the annotations, both species and genes.", + "descriptors": [ + { + "category": "multiplier", + "location": "de.julielab.jcore.reader.desc.jcore-bnp-bioc-multiplier" + }, + { + "category": "reader", + "location": "de.julielab.jcore.reader.desc.jcore-bnp-bioc-multiplier-reader" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-gnp-bioc-reader", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe GNormPlus BioC Reader" +} diff --git a/jcore-gnp-bioc-reader/pom.xml b/jcore-gnp-bioc-reader/pom.xml new file mode 100644 index 000000000..01593d1c7 --- /dev/null +++ b/jcore-gnp-bioc-reader/pom.xml @@ -0,0 +1,69 @@ + + + + 4.0.0 + jcore-gnp-bioc-reader + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0 + + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-descriptor-creator + + + com.pengyifan.bioc + pengyifan-bioc + 1.0.3 + + + de.julielab + jcore-types + ${jcore-types-version} + + + org.junit.jupiter + junit-jupiter-engine + + + org.assertj + assertj-core + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + test + + + de.julielab + costosys + 1.5.2 + + + JCoRe GNormPlus BioC Reader + + JULIE Lab 
Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-bnp-bioc-reader + A reader for the BioC format used by GNormPlus. Reads the text and the annotations, both species and + genes. + + diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java new file mode 100644 index 000000000..0f72a4581 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -0,0 +1,318 @@ +package de.julielab.jcore.reader; + +import com.pengyifan.bioc.*; +import com.pengyifan.bioc.io.BioCCollectionReader; +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.jcore.types.*; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.cas.StringArray; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.nio.file.Path; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Reads a BioC collection from file and adds the species and gene annotations from its documents to a JCases. + */ +public class BioCCasPopulator { + + private final static Logger log = LoggerFactory.getLogger(BioCCasPopulator.class); + private final BioCCollection bioCCollection; + private Map maxXmiIdMap; + private Map sofaMaps; + private int pos; + + /** + * This constructor is used when the GNormPlusMultiplier/Reader is used to read files that directly correspond to + * JeDIS database documents and should be written back into the database. Then we need some information about + * the database and the state of the document. 
+ * @param biocCollectionPath The BioC documents to read that have equivalents in the JeDIS database. + * @param costosysConfiguration The CoStoSys configuration to connect to the JeDIS database. + * @param documentsTable The name of the database table that stores the documents. + * @throws XMLStreamException + * @throws IOException + * @throws SQLException + */ + public BioCCasPopulator(Path biocCollectionPath, Path costosysConfiguration, String documentsTable) throws XMLStreamException, IOException, SQLException { + this(biocCollectionPath); + if (costosysConfiguration != null) { + maxXmiIdMap = new HashMap<>(); + sofaMaps = new HashMap<>(); + DataBaseConnector dbc = new DataBaseConnector(costosysConfiguration.toString()); + try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + retrieveXmiMetaData(documentsTable, dbc, conn); + } + } + pos = 0; + } + + /** + * This constructor is used when GNormPlus BioC files - or only the contained annotatoins - should be read into a CAS without the need to synchronize to a JeDIS database. + * @param biocCollectionPath The BioC documents to read that have equivalents in the JeDIS database. 
+ * @throws XMLStreamException + * @throws IOException + */ + public BioCCasPopulator(Path biocCollectionPath) throws XMLStreamException, IOException { + try (BioCCollectionReader bioCCollectionReader = new BioCCollectionReader(biocCollectionPath)) { + bioCCollection = bioCCollectionReader.readCollection(); + } + } + + private void retrieveXmiMetaData(String documentsTable, DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + log.debug("Retrieving the max XMI IDs for the current BioC collection of size {} from the database.", bioCCollection.getDocmentCount()); + Statement stmt = conn.createStatement(); + StringBuilder maxIdQueryBuilder = new StringBuilder(); + if (dbc.getActiveTableFieldConfiguration().getPrimaryKey().length > 1) + throw new IllegalArgumentException("The primary key of the active field schema '" + dbc.getActiveTableFieldConfiguration().getName() + "' is a compound key. Compound primary keys are currently not supported in this component."); + String pkString = dbc.getActiveTableFieldConfiguration().getPrimaryKeyString(); + maxIdQueryBuilder.append("SELECT ").append(pkString).append(",max_xmi_id,sofa_mapping FROM ").append(documentsTable).append(" WHERE ").append(pkString).append(" in ").append("("); + for (BioCDocument document : bioCCollection.getDocuments()) { + String docId = document.getID(); + maxIdQueryBuilder.append("'").append(docId).append("'").append(","); + } + // remove trailing comma + maxIdQueryBuilder.deleteCharAt(maxIdQueryBuilder.length() - 1); + maxIdQueryBuilder.append(")"); + String maxIdQuery = maxIdQueryBuilder.toString(); + ResultSet rs = stmt.executeQuery(maxIdQuery); + while (rs.next()) { + maxXmiIdMap.put(rs.getString(1), rs.getInt(2)); + sofaMaps.put(rs.getString(1), rs.getString(3)); + } + if (log.isTraceEnabled()) { + log.trace("XMI ID sample: {}", maxXmiIdMap.entrySet().stream().limit(10).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); + log.trace("Sofa map sample: {}", 
sofaMaps.entrySet().stream().limit(10).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); + } + log.debug("Obtained {} max XMI IDs.", maxXmiIdMap.size()); + } + + public void populateWithNextDocument(JCas jCas) { + populateWithNextDocument(jCas, false); + } + + /** + * Populate the given CAS either with the complete contents of the next BioC document or only with its annotations. + * @param jCas The CAS to add data to. Can be empty when it should be populated with the BioC document text or it already may have a text when it only should be filled with the annotations of the BioC document. + * @param onlyAddAnnotations Whether to add only annotations from the next BioC document instead of its whole textual contents. + */ + public void populateWithNextDocument(JCas jCas, boolean onlyAddAnnotations) { + BioCDocument document = bioCCollection.getDocument(pos++); + if (!onlyAddAnnotations) { + setDocumentId(jCas, document); + setDocumentText(jCas, document); + setMaxXmiId(jCas, document); + } + Iterator allAnnotations = Stream.concat(document.getAnnotations().stream(), document.getPassages().stream().map(BioCPassage::getAnnotations).flatMap(Collection::stream)).iterator(); + for (BioCAnnotation annotation : (Iterable) () -> allAnnotations) { + Optional type = annotation.getInfon("type"); + if (!type.isPresent()) + throw new IllegalArgumentException("BioCDocument " + document.getID() + " has an annotation that does not specify its type: " + annotation); + try { + switch (type.get()) { + case "Gene": + addGeneAnnotation(annotation, jCas); + break; + case "FamilyName": + addFamilyAnnotation(annotation, jCas); + break; + case "Species": + addSpeciesAnnotation(annotation, jCas); + break; + } + } catch (MissingInfonException | IllegalArgumentException e) { + throw new IllegalArgumentException("BioCDocument " + document.getID() + " has an annotation issue; see cause exception.", e); + } + } + } + + + private void setMaxXmiId(JCas jCas, BioCDocument document) { 
+ if (maxXmiIdMap != null) { + Integer maxXmiId = maxXmiIdMap.get(document.getID()); + String mappingString = sofaMaps.get(document.getID()); + if (maxXmiId == null) + throw new IllegalStateException("No max XMI ID was obtained for the document with ID " + document.getID() + ". This means that this document is not already part of the database documents table. When adding annotations to existing database documents, make sure that all documents exist in the database already."); + XmiMetaData xmiMetaData = new XmiMetaData(jCas); + xmiMetaData.setMaxXmiId(maxXmiId); + String[] mappings = mappingString != null ? mappingString.split("\\|") : null; + StringArray mappingsArray = null; + if (mappings != null) { + mappingsArray = new StringArray(jCas, mappings.length); + for (int i = 0; i < mappings.length; i++) { + String mapping = mappings[i]; + mappingsArray.set(i, mapping); + } + } + if (mappingsArray != null) + xmiMetaData.setSofaIdMappings(mappingsArray); + xmiMetaData.addToIndexes(); + } + } + + private void setDocumentId(JCas jCas, BioCDocument document) { + Header h = new Header(jCas); + h.setDocId(document.getID()); + h.addToIndexes(); + } + + private void setDocumentText(JCas jCas, BioCDocument document) { + StringBuilder sb = new StringBuilder(); + // iterate over the passages and create the complete document text from their individual text elements + for (BioCPassage passage : document.getPassages()) { + int offset = passage.getOffset(); + // The offset of the passage must match its starting position in the StringBuilder or the annotation + // offsets won't match. We might need to fill up the StringBuilder to reach the given offset. 
+ while (sb.length() < offset) + sb.append(" "); + if (passage.getText().isPresent()) { + sb.append(passage.getText().get()); + Optional type = passage.getInfon("type"); + if (type.isPresent()) { + int passageEnd = offset + passage.getText().get().length(); + Zone passageAnnotation; + // The values in this switch are basically determined by the values created in the BioCDocumentPopulator in the jcore-gnp-bioc-writer project. + switch (type.get()) { + case "title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("document"); + break; + case "section_title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("section"); + break; + case "figure_title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("figure"); + break; + case "table_title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("table"); + break; + case "other_title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("other"); + break; + case "abstract": + passageAnnotation = new AbstractText(jCas, offset, passageEnd); + break; + case "paragraph": + passageAnnotation = new Paragraph(jCas, offset, passageEnd); + break; + case "figure": + case "table": + // for figures and tables we have actually no means to distinguish between captions and the actual object; mainly because the actual objects have so far not been part of the CAS documents; thus, this can only be a caption until the objects themselves are added + passageAnnotation = new Caption(jCas, offset, passageEnd); + ((Caption) passageAnnotation).setCaptionType(type.get()); + break; + default: + log.debug("Unhandled passage type {}", type.get()); + passageAnnotation = new Zone(jCas, offset, passageEnd); + break; + } + 
passageAnnotation.setComponentId(GNormPlusFormatMultiplier.class.getCanonicalName()); + passageAnnotation.addToIndexes(); + } + } + } + jCas.setDocumentText(sb.toString()); + } + + private void addSpeciesAnnotation(BioCAnnotation annotation, JCas jCas) throws MissingInfonException { + Optional taxId = annotation.getInfon("NCBI Taxonomy"); +// if (!taxId.isPresent()) +// throw new MissingInfonException("Species annotation does not specify its taxonomy ID: " + annotation); + // the "total location" is the span from the minimum location value to the maximum location value; + // for GNormPlus, there are no discontinuing annotations anyway + BioCLocation location = annotation.getTotalLocation(); + Organism organism = new Organism(jCas, location.getOffset(), location.getOffset() + location.getLength()); + if (taxId.isPresent()) { + ResourceEntry resourceEntry = new ResourceEntry(jCas, organism.getBegin(), organism.getEnd()); + resourceEntry.setSource("NCBI Taxonomy"); + resourceEntry.setComponentId(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + resourceEntry.setEntryId(taxId.get()); + FSArray resourceEntryList = new FSArray(jCas, 1); + resourceEntryList.set(0, resourceEntry); + organism.setResourceEntryList(resourceEntryList); + } + organism.addToIndexes(); + } + + private void addGeneAnnotation(BioCAnnotation annotation, JCas jCas) throws MissingInfonException { + Optional geneId = annotation.getInfon("NCBI Gene"); +// if (!geneId.isPresent()) +// throw new MissingInfonException("Gene annotation does not specify its gene ID: " + annotation); + // the "total location" is the span from the minimum location value to the maximum location value; + // for GNormPlus, there are no discontinuing annotations anyway + BioCLocation location = annotation.getTotalLocation(); + Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); + gene.setComponentId(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + 
gene.setSpecificType("Gene"); + if (geneId.isPresent()) { // one gene mention might have multiple IDs when there are ranges or enumerations, e.g. "IL2-5", "B7-1 and B7-2" or "B7-1/2" + String[] geneIds = geneId.get().split(";"); + FSArray resourceEntryList = new FSArray(jCas, geneIds.length); + for (int i = 0; i < geneIds.length; i++) { + ResourceEntry resourceEntry = new ResourceEntry(jCas, gene.getBegin(), gene.getEnd()); + // 9999 ist the GeNo score for exact matches; GNP only recognized exact dictionary matches and transfers + // their IDs to other forms under certain circumstances (abbreviations, for example) + resourceEntry.setConfidence("9999"); + resourceEntry.setSource("NCBI Gene"); + resourceEntry.setComponentId(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + resourceEntry.setEntryId(geneIds[i]); + resourceEntryList.set(i, resourceEntry); + } + gene.setResourceEntryList(resourceEntryList); + } + gene.addToIndexes(); + } + + private void addFamilyAnnotation(BioCAnnotation annotation, JCas jCas) { + // the "total location" is the span from the minimum location value to the maximum location value; + // for GNormPlus, there are no discontinuing annotations anyway + BioCLocation location; + try { + location = annotation.getTotalLocation(); + } catch (Exception e) { + // This handles a legacy issue: We modified GNormPlus to output FamilyName annotations. For some reason, + // FamilyNames can have zero length. This has been fixed but there is still old output that would + // cause an error at this point. Thus, when the offsets are invalid, skip the annotation. + return; + } + Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); + gene.setSpecificType("FamilyName"); + // e.g. 
NCBITaxonomyID:9606 + Optional focusSpecies = annotation.getInfon("FocusSpecies"); + if (focusSpecies.isPresent()) { + String taxId = focusSpecies.get().substring(15); + StringArray speciesArray = new StringArray(jCas, 1); + speciesArray.set(0, taxId); + gene.setSpecies(speciesArray); + } + gene.addToIndexes(); + } + + public int documentsLeftInCollection() { + return bioCCollection.getDocmentCount() - pos; + } + + public long getCollectionTextLength() { + return bioCCollection.getDocuments().stream().map(BioCDocument::getPassages).flatMap(Collection::stream).mapToInt(passage -> passage.getText().orElse("").length()).sum(); + } + + public int getNumDocumentsInCollection() { + return bioCCollection.getDocmentCount(); + } + + public void clearDocument(int index) { + bioCCollection.getDocuments().set(index, null); + } +} diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java new file mode 100644 index 000000000..1739e461e --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java @@ -0,0 +1,97 @@ +package de.julielab.jcore.reader; + +import de.julielab.jcore.types.casmultiplier.JCoReURI; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URI; +import java.nio.file.Path; +import java.text.DecimalFormat; +import 
java.util.Collection; +import java.util.Iterator; + +@ResourceMetaData(name = "JCoRe GNormPlus BioC Format Multiplier", description = "Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS.") +@TypeCapability(outputs = {"de.julielab.jcore.types.Gene", "de.julielab.jcore.types.Organism"}) +public class GNormPlusFormatMultiplier extends JCasMultiplier_ImplBase { + public static final String PARAM_COSTOSYS_CONFIG = "CostosysConfigFile"; + public static final String PARAM_XMI_DOCUMENTS_TABLE = "DocumentsTable"; + private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatMultiplier.class); + private Iterator currentUriBatch; + private BioCCasPopulator casPopulator; + private DecimalFormat df = new DecimalFormat(); + + @ConfigurationParameter(name = PARAM_COSTOSYS_CONFIG, mandatory = false, description = "Path to the CoStoSys configuration file that is used by the XMI DB writer in the same pipeline, if any. The XMI DB writer requires information about the XMI documents that are already in the database and should be updated with new annotations. The current highest XMI ID must be known to avoid ID collisions. To obtain the ID, it must be received from the database beforehand. This allows to retrieve the information batch wise instead of one-by-one which would be much slower.") + private String costosysConfiguration; + @ConfigurationParameter(name = PARAM_XMI_DOCUMENTS_TABLE, mandatory = false, description = "Required to retrieve the max XMI ID for use by the XMI DB writer. 
The schema-qualified name of the XMI document table that the XMI DB writer will write annotations into.") + private String documentsTable; + + private long lastTimeStamp; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + costosysConfiguration = (String) aContext.getConfigParameterValue(PARAM_COSTOSYS_CONFIG); + documentsTable = (String) aContext.getConfigParameterValue(PARAM_XMI_DOCUMENTS_TABLE); + if (costosysConfiguration == null ^ documentsTable == null) + throw new ResourceInitializationException(new IllegalArgumentException("Either both or none parameters must be defined: " + PARAM_COSTOSYS_CONFIG + ", " + PARAM_XMI_DOCUMENTS_TABLE)); + lastTimeStamp = 0; + } + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + try { + Collection jcoreUris = JCasUtil.select(jCas, JCoReURI.class); + if (log.isDebugEnabled()) + log.debug("Received batch of {} BioC XML URIs", jcoreUris.size()); + currentUriBatch = jcoreUris.stream().map(JCoReURI::getUri).map(URI::create).iterator(); + } catch (Throwable e) { + log.error("Unexpected error", e); + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public boolean hasNext() throws AnalysisEngineProcessException { + if ((casPopulator == null || casPopulator.documentsLeftInCollection() == 0) && currentUriBatch.hasNext()) { + URI nextUri = currentUriBatch.next(); + try { + if (log.isDebugEnabled() && lastTimeStamp != 0) { + long collectionTextLength = casPopulator.getCollectionTextLength(); + long passedMillis = System.currentTimeMillis() - lastTimeStamp; + log.debug("Last document batch of size {} processing time: {}s for text length of {} characters; that is {}ms per character.", casPopulator.getNumDocumentsInCollection(), passedMillis / 1000, collectionTextLength, df.format((double)passedMillis/collectionTextLength)); + } + lastTimeStamp = System.currentTimeMillis(); + casPopulator = new 
BioCCasPopulator(Path.of(nextUri), costosysConfiguration != null ? Path.of(costosysConfiguration) : null, documentsTable); + } catch (Exception e) { + log.error("Could not read from {}", nextUri, e); + throw new AnalysisEngineProcessException(e); + } + } + return casPopulator != null && casPopulator.documentsLeftInCollection() > 0; + } + + @Override + public AbstractCas next() throws AnalysisEngineProcessException { + if (hasNext()) { + JCas cas = getEmptyJCas(); + try { + casPopulator.populateWithNextDocument(cas); + return cas; + } catch (Exception e) { + log.error("Could not populate CAS with the next BioC document.", e); + throw new AnalysisEngineProcessException(e); + } + } + return null; + } +} diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java new file mode 100644 index 000000000..40c706594 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java @@ -0,0 +1,97 @@ +package de.julielab.jcore.reader; + +import de.julielab.jcore.types.casmultiplier.JCoReURI; +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.FileVisitOption; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.Optional; +import java.util.stream.Stream; + +@ResourceMetaData(name = "JCoRe 
GNormPlus Format Multiplier Reader", description = "A reader for the BioC XML format used by GNormPlus. Requires the matching multiplier.") +public class GNormPlusFormatMultiplierReader extends JCasCollectionReader_ImplBase { + + public static final String PARAM_INPUT_PATH = "InputPath"; + public static final String PARAM_RECURSIVE = "Recursive"; + public static final String PARAM_BATCH_SIZE = "BatchSize"; + private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatMultiplierReader.class); + @ConfigurationParameter(name = PARAM_INPUT_PATH, description = "Path to a directory or file to be read. In case of a directory, all files ending in .xml will be read.") + private String inputPathString; + @ConfigurationParameter(name = PARAM_RECURSIVE, mandatory = false, defaultValue = "true", description = "Whether to read also the subdirectories of the input directory, if the input path points to a directory.") + private boolean recursive; + @ConfigurationParameter(name = PARAM_BATCH_SIZE, mandatory = false, defaultValue = "20", description = "The number of XML file URI references to send to the CAS multipliers in each work assignment. Defaults to 20.") + private int batchSize; + private Iterator fileIterator; + private int completed; + + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. 
+ */ + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + super.initialize(context); + inputPathString = (String) context.getConfigParameterValue(PARAM_INPUT_PATH); + recursive = Optional.of((boolean) context.getConfigParameterValue(PARAM_RECURSIVE)).orElse(true); + try { + Path inputPath = Path.of(inputPathString); + Stream pathStream; + if (recursive) + pathStream = Files.walk(inputPath, FileVisitOption.FOLLOW_LINKS); + else + pathStream = Files.list(inputPath); + pathStream = pathStream.filter(p -> p.toString().toLowerCase().endsWith(".xml")); + fileIterator = pathStream.iterator(); + } catch (IOException e) { + log.error("Could not read the files of inputPath {}", inputPathString, e); + throw new ResourceInitializationException(e); + } + completed = 0; + } + + @Override + public void getNext(JCas jCas) throws CollectionException { + for (int i = 0; i < batchSize && fileIterator.hasNext(); i++) { + URI uri = fileIterator.next().toUri(); + try { + JCoReURI fileType = new JCoReURI(jCas); + fileType.setUri(uri.toString()); + fileType.addToIndexes(); + } catch (Exception e) { + log.error("Exception with URI: " + uri, e); + throw new CollectionException(e); + } + completed++; + if (completed % 10 == 0) { + log.debug("{} input files read", completed); + } + } + } + + + @Override + public Progress[] getProgress() { + return new Progress[]{new ProgressImpl(completed, -1, "documents")}; + } + + @Override + public boolean hasNext() { + return fileIterator.hasNext(); + } + +} diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/MissingInfonException.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/MissingInfonException.java new file mode 100644 index 000000000..59277495c --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/MissingInfonException.java @@ -0,0 +1,22 @@ +package de.julielab.jcore.reader; + +public class MissingInfonException extends Exception { + 
public MissingInfonException() { + } + + public MissingInfonException(String message) { + super(message); + } + + public MissingInfonException(String message, Throwable cause) { + super(message, cause); + } + + public MissingInfonException(Throwable cause) { + super(cause); + } + + public MissingInfonException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml new file mode 100644 index 000000000..947a31d8b --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml @@ -0,0 +1,59 @@ + + + org.apache.uima.java + de.julielab.jcore.reader.GNormPlusFormatMultiplierReader + + JCoRe GNormPlus Format Multiplier Reader + A reader for the BioC XML format used by GNormPlus. Requires the matching multiplier. + 2.6.0 + + + InputPath + Path to a directory or file to be read. In case of a directory, all files ending in .xml will be read. + String + false + true + + + Recursive + Whether to read also the subdirectories of the input directory, if the input path points to a directory. + Boolean + false + false + + + BatchSize + The number of XML file URI references to send to the CAS multipliers in each work assignment. Defaults to 20. 
+ Integer + false + false + + + + + Recursive + + true + + + + BatchSize + + 20 + + + + + + + + + + + + true + false + true + + + \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml new file mode 100644 index 000000000..bbbf18e18 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml @@ -0,0 +1,53 @@ + + + org.apache.uima.java + true + de.julielab.jcore.reader.GNormPlusFormatMultiplier + + JCoRe GNormPlus BioC Format Multiplier + Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS. + 2.6.0 + + + CostosysConfigFile + Path to the CoStoSys configuration file that is used by the XMI DB writer in the same pipeline, if any. The XMI DB writer requires information about the XMI documents that are already in the database and should be updated with new annotations. The current highest XMI ID must be known to avoid ID collisions. To obtain the ID, it must be received from the database beforehand. This allows to retrieve the information batch wise instead of one-by-one which would be much slower. + String + false + false + + + DocumentsTable + Required to retrieve the max XMI ID for use by the XMI DB writer. The schema-qualified name of the XMI document table that the XMI DB writer will write annotations into. 
+ String + false + false + + + + + + + + + + + + + + + + + + de.julielab.jcore.types.Gene + de.julielab.jcore.types.Organism + + + + + + true + true + true + + + \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java new file mode 100644 index 000000000..e1ffdc7e4 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java @@ -0,0 +1,100 @@ +package de.julielab.jcore.reader; + +import de.julielab.jcore.types.*; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.assertj.core.api.Condition; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.Collection; + +import static org.assertj.core.api.Assertions.assertThat; + +class BioCCasPopulatorTest { + + private JCas getJCas() throws Exception { + return JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + } + + @Test + public void populateWithNextDocument() throws Exception { + BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(Path.of("src", "test", "resources", "test-input-path", "bioc_collection_3.xml"), null, null); + assertThat(bioCCasPopulator.documentsLeftInCollection()).isEqualTo(2); + JCas jCas = getJCas(); + bioCCasPopulator.populateWithNextDocument(jCas); + + assertThat(jCas.getDocumentText()).startsWith("Langerin").endsWith("antigen-processing pathway."); + Title title = JCasUtil.selectSingle(jCas, Title.class); + assertThat(title).extracting(Title::getTitleType).isEqualTo("document"); + assertThat(title).extracting(Title::getCoveredText).isEqualTo("Langerin, a novel C-type lectin specific to 
Langerhans cells, is an endocytic receptor that induces the formation of Birbeck granules."); + AbstractText abstractText = JCasUtil.selectSingle(jCas, AbstractText.class); + assertThat(abstractText).extracting(AbstractText::getCoveredText).is(new Condition<>(s -> s.startsWith("We have identified"), "Abstract has an unexpected beginning")); + // this document does not have organisms, we check those for the second document in the collection below + Collection genes = JCasUtil.select(jCas, Gene.class); + assertThat(genes).hasSize(7); + for (Gene o : genes) { + assertThat(o.getResourceEntryList()).isNotNull(); + assertThat(o.getResourceEntryList()).hasSize(1); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getComponentId).isEqualTo(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getSource).isEqualTo("NCBI Gene"); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getEntryId).isNotNull(); + } + assertThat(genes).extracting(Gene::getCoveredText).contains("Langerin"); + + assertThat(bioCCasPopulator.documentsLeftInCollection()).isEqualTo(1); + jCas.reset(); + bioCCasPopulator.populateWithNextDocument(jCas); + assertThat(jCas.getDocumentText()).startsWith("BCAR1, a human homologue"); + + Collection organisms = JCasUtil.select(jCas, Organism.class); + assertThat(organisms).isNotEmpty(); + for (Organism o : organisms) { + assertThat(o.getResourceEntryList()).isNotNull(); + assertThat(o.getResourceEntryList()).hasSize(1); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getComponentId).isEqualTo(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getSource).isEqualTo("NCBI Taxonomy"); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getEntryId).isNotNull(); + } + assertThat(organisms).extracting(Organism::getCoveredText).contains("human", "patients", "rat", 
"retrovirus", "ZR-75-1"); + } + + @Test + public void addFamilyNames() throws Exception { + BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(Path.of("src", "test", "resources", "bioc_collection_0_0.xml"), null, null); + JCas jCas = getJCas(); + bioCCasPopulator.populateWithNextDocument(jCas); + + Collection genes = JCasUtil.select(jCas, Gene.class); + assertThat(genes).hasSize(23); + assertThat(genes).filteredOn(Gene::getSpecificType, "FamilyName").hasSize(5); + for (Gene o : genes) { + if (o.getSpecificType().equals("FamilyName")) { + assertThat(o.getSpecies(0)).isEqualTo("9606"); + } + } + } + + @Test + public void multipleGeneIds() throws Exception { + // Check that gene mentions with multiple IDs (enumerations, alternatives, ranges...) result in multiple ResourceEntries in a Gene annotation + BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(Path.of("src", "test", "resources", "multipleGeneIdsDocument.xml"), null, null); + JCas jCas = getJCas(); + bioCCasPopulator.populateWithNextDocument(jCas); + + Collection genes = JCasUtil.select(jCas, Gene.class); + boolean multipleIdGeneFound = false; + for (Gene o : genes) { + if (o.getBegin() == 805) { + multipleIdGeneFound = true; + FSArray resourceEntryList = o.getResourceEntryList(); + assertThat(resourceEntryList).hasSize(2); + assertThat(o.getResourceEntryList(0).getEntryId()).isEqualTo("12519"); + assertThat(o.getResourceEntryList(1).getEntryId()).isEqualTo("12524"); + } + } + assertThat(multipleIdGeneFound).isTrue(); + } +} \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReaderTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReaderTest.java new file mode 100644 index 000000000..b2ad2190e --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReaderTest.java @@ -0,0 +1,69 @@ + +package de.julielab.jcore.reader; + + +import 
de.julielab.jcore.types.casmultiplier.JCoReURI; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.Collection; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Unit tests for jcore-bnp-bioc-reader. + * @author + * + */ +public class GNormPlusFormatMultiplierReaderTest{ + + private JCas getCas() throws Exception { + return JCasFactory.createJCas("de.julielab.jcore.types.casmultiplier.jcore-uri-multiplier-types"); + } + @Test + public void testReader() throws Exception { + CollectionReader reader = CollectionReaderFactory.createReader(GNormPlusFormatMultiplierReader.class, GNormPlusFormatMultiplierReader.PARAM_INPUT_PATH, Path.of("src", "test", "resources", "test-input-path").toString()); + assertThat(reader.hasNext()).isTrue(); + JCas jCas = getCas(); + reader.getNext(jCas.getCas()); + Collection uris = JCasUtil.select(jCas, JCoReURI.class); + assertThat(uris).extracting(JCoReURI::getUri).map(Path::of).map(Path::getFileName).map(Path::toString).containsExactlyInAnyOrder("bioc_collection_2.xml", "bioc_collection_3.xml", "bioc_collection_0.xml", "bioc_collection_1.xml"); + assertThat(reader.hasNext()).isFalse(); + } + + @Test + public void testReader2() throws Exception { + // check that the non-recursive mode also works + CollectionReader reader = CollectionReaderFactory.createReader(GNormPlusFormatMultiplierReader.class, GNormPlusFormatMultiplierReader.PARAM_INPUT_PATH, Path.of("src", "test", "resources", "test-input-path").toString(), GNormPlusFormatMultiplierReader.PARAM_RECURSIVE, false); + assertThat(reader.hasNext()); + JCas jCas = getCas(); + reader.getNext(jCas.getCas()); + Collection uris = JCasUtil.select(jCas, JCoReURI.class); + 
assertThat(uris).extracting(JCoReURI::getUri).map(Path::of).map(Path::getFileName).map(Path::toString).containsExactlyInAnyOrder("bioc_collection_3.xml"); + assertThat(reader.hasNext()).isFalse(); + } + + @Test + public void testReader3() throws Exception { + // check that the batch size parameter works as intended + CollectionReader reader = CollectionReaderFactory.createReader(GNormPlusFormatMultiplierReader.class, GNormPlusFormatMultiplierReader.PARAM_INPUT_PATH, Path.of("src", "test", "resources", "test-input-path").toString(), GNormPlusFormatMultiplierReader.PARAM_BATCH_SIZE, 2); + assertThat(reader.hasNext()).isTrue(); + JCas jCas = getCas(); + reader.getNext(jCas.getCas()); + Collection uris = JCasUtil.select(jCas, JCoReURI.class); + assertThat(uris).hasSize(2); + assertThat(reader.hasNext()).isTrue(); + jCas.reset(); + // there should another batch available + reader.getNext(jCas.getCas()); + Collection uris2 = JCasUtil.select(jCas, JCoReURI.class); + assertThat(uris2).hasSize(2); + // now the reader should be exhausted + assertThat(reader.hasNext()).isFalse(); + } +} diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierTest.java new file mode 100644 index 000000000..a38744b34 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierTest.java @@ -0,0 +1,43 @@ +package de.julielab.jcore.reader; + +import de.julielab.jcore.types.casmultiplier.JCoReURI; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + 
+import static org.assertj.core.api.Assertions.assertThat; +class GNormPlusFormatMultiplierTest { + private JCas getCas() throws Exception { + return JCasFactory.createJCas("de.julielab.jcore.types.casmultiplier.jcore-uri-multiplier-types"); + } + + @Test + void process() throws Exception { + JCas cas = getCas(); + JCoReURI jCoReURI = new JCoReURI(cas); + jCoReURI.setUri(Path.of("src", "test", "resources", "test-input-path", "subdir1", "bioc_collection_0.xml").toUri().toString()); + jCoReURI.addToIndexes(); + + JCoReURI jCoReURI2 = new JCoReURI(cas); + jCoReURI2.setUri(Path.of("src", "test", "resources", "test-input-path", "subdir2", "bioc_collection_2.xml").toUri().toString()); + jCoReURI2.addToIndexes(); + + AnalysisEngine multiplier = AnalysisEngineFactory.createEngine(GNormPlusFormatMultiplier.class); + JCasIterator jCasIterator = multiplier.processAndOutputNewCASes(cas); + List docIds = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas multiplierCas = jCasIterator.next(); + docIds.add(JCoReTools.getDocId(multiplierCas)); + multiplierCas.release(); + } + assertThat(docIds).containsExactlyInAnyOrder("1378843", "10896916", "10722742", "1770008"); + } +} \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/bioc_collection_0_0.xml b/jcore-gnp-bioc-reader/src/test/resources/bioc_collection_0_0.xml new file mode 100644 index 000000000..46dc0e704 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/bioc_collection_0_0.xml @@ -0,0 +1,261 @@ + + + + JCoRe GNormPlus BioC Writer + Wed Mar 02 14:58:28 CET 2022 + PubTator.key + + 10885490 + + title + 0 + Decreased plasma cholesterol esterification and cholesteryl ester transfer in hypopituitary patients on glucocorticoid replacement therapy. + + 9606 + Species + + patients + + + + abstract + 140 + Cardiovascular risk is increased in hypopituitary patients. 
No data are available with respect to the effect of glucocorticoid replacement therapy on high density lipoproteins (HDL) metabolism in such patients. Plasma lecithin:cholesterol acyl transferase (LCAT), cholesteryl ester transfer protein (CETP) and phospholipid transfer protein (PLTP) are important determinants of HDL remodelling. The possible influence of conventional glucocorticoid replacement on plasma lipids, plasma LCAT, CETP and PLTP activity levels, as well as on plasma cholesterol esterification (EST) and cholesteryl ester transfer (CET) was evaluated in 24 consecutive hypopituitary patients (12 men and 12 women) with untreated growth hormone deficiency of whom 17 had adrenal insufficiency and were treated with cortisone acetate, 25 to 37.5 mg daily. Twenty-three patients were on stable levothyroxin therapy and 22 patients used sex steroids. Urinary excretion of cortisol and cortisone metabolites was higher (p<0.001) in glucocorticoid-treated patients. Body mass index (p<0.08) and fat mass (p<0.12) were not significantly different in patients receiving and not receiving glucocorticoids. Fasting blood glucose, plasma insulin and insulin resistance were similar in the groups. Plasma total (p<0.05) and very low+low density lipoprotein cholesterol (p<0.01) were lower in patients receiving glucocorticoids, whereas HDL cholesterol and plasma triglycerides were not different between patients treated and not treated with glucocorticoids. Plasma LCAT activity was 45% lower (p<0.02) and CETP activity was 34% lower (p<0.05) in patients on glucocorticoid treatment. Multiple regression analysis showed that these effects were independent of gender and fat mass. In glucocorticoid-receiving patients, plasma EST and CET were decreased by 80% (p<0.01) and by 58% (p<0.05), respectively. These changes were at least partly attributable to lower LCAT and CETP activity levels. 
In contrast, plasma PLTP activity was not different between patients with and without glucocorticoid treatment, suggesting that exogenous glucocorticoids exert a different regulatory effect on plasma CETP compared to PLTP. In conclusion, this preliminary study suggests that conventional glucocorticoid replacement in hypopituitary patients is associated with a decrease in plasma cholesterol esterification and cholesteryl ester transfer, indicating that these steps in HDL metabolism are impaired. Such abnormalities in HDL metabolism could be involved in increased cardiovascular risk in glucocorticoid-treated hypopituitary patients, despite a lack of deterioration in plasma lipids. + + 3931 + Gene + + lecithin:cholesterol acyl transferase + + + 3931 + Gene + + LCAT + + + 1071 + Gene + + cholesteryl ester transfer protein + + + 1071 + Gene + + CETP + + + 5360 + Gene + + phospholipid transfer protein + + + 5360 + Gene + + PLTP + + + 3931 + Gene + + LCAT + + + 1071 + Gene + + CETP + + + 5360 + Gene + + PLTP + + + 3630 + Gene + + insulin + + + 3630 + Gene + + insulin + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + 3931 + Gene + + LCAT + + + 1071 + Gene + + CETP + + + 3931 + Gene + + LCAT + + + 1071 + Gene + + CETP + + + 5360 + Gene + + PLTP + + + 1071 + Gene + + CETP + + + 5360 + Gene + + PLTP + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + men + + + 9606 + Species + + women + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + 
patients + + + 9606 + Species + + patients + + + + \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/multipleGeneIdsDocument.xml b/jcore-gnp-bioc-reader/src/test/resources/multipleGeneIdsDocument.xml new file mode 100644 index 000000000..1a26ceb19 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/multipleGeneIdsDocument.xml @@ -0,0 +1,136 @@ + + + + JCoRe GNormPlus BioC Writer + Wed Mar 02 14:58:28 CET 2022 + PubTator.key + + 16177354 + + title + 0 + Cellular mechanisms of the adjuvant activity of the flagellin component FljB of Salmonella enterica + Serovar Typhimurium to potentiate mucosal and systemic responses. + + + NCBITaxonomyID:90371 + FamilyName + + flagellin + + + 90371 + Species + + Salmonella enterica Serovar Typhimurium + + + + abstract + 166 + An expanding area of interest is the utilization of microbe-based components to augment mucosal and + systemic immune responses to target antigens. Thus, the aim of the present study was to assess if the + flagellin component FljB from Salmonella enterica serovar Typhimurium could act as a mucosal adjuvant + and then to determine the cellular mechanism(s) by which FljB mediates its adjuvant properties. To + determine if FljB could act as a mucosal adjuvant, mice were immunized by the intranasal (i.n.) route + with antigen alone or in conjunction with FljB. Additionally, we assessed how FljB affected the levels + of the costimulatory molecules B7-1 and B7-2 on dendritic cells by flow cytometry and determined the + functional role these costimulatory molecules played in the adjuvant properties of FljB in vivo. Mice + immunized by the i.n. route with antigen and FljB exhibited significantly elevated levels of mucosal and + systemic antibody and CD4(+)-T-cell responses compared to mice given antigen only. Stimulation of + dendritic cells in vitro with FljB resulted in a pronounced increase in the surface expression of B7-1 + and B7-2. 
The percentage of dendritic cells expressing B7-2 but not B7-1 increased significantly when + stimulated with FljB over a concentration range of 10 to 10,000 ng/ml. Immunization of wild-type and + B7-1, B7-2, and B7-1/2 knockout mice by the i.n. route revealed that the ability of FljB to increase + B7-2 expression is largely responsible for its adjuvant effect in vivo. These findings demonstrate that + FljB can act as an effective mucosal adjuvant and that its ability to enhance the level of B7-2 + expression is predominantly responsible for its adjuvant properties. + + + 12519;12524 + Gene + + B7-1 and B7-2 + + + 12519;12524 + Gene + + B7-1 and B7-2 + + + 12519;12524 + Gene + + B7-1/2 + + + 12524 + Gene + + B7-2 + + + 12524 + Gene + + B7-2 + + + 12524 + Gene + + B7-2 + + + 12519 + Gene + + B7-1 + + + 12519 + Gene + + B7-1 + + + 12524 + Gene + + B7-2 + + + NCBITaxonomyID:90371 + FamilyName + + flagellin + + + 90371 + Species + + Salmonella enterica serovar Typhimurium + + + 10090 + Species + + mice + + + 10090 + Species + + Mice + + + 10090 + Species + + mice + + + + \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/bioc_collection_3.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/bioc_collection_3.xml new file mode 100644 index 000000000..a874a1823 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/bioc_collection_3.xml @@ -0,0 +1 @@ +JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key10661407title0Langerin, a novel C-type lectin specific to Langerhans cells, is an endocytic receptor that induces the formation of Birbeck granules.50489GeneLangerinabstract135We have identified a type II Ca2+-dependent lectin displaying mannose-binding specificity, exclusively expressed by Langerhans cells (LC), and named Langerin. LC are uniquely characterized by Birbeck granules (BG), which are organelles consisting of superimposed and zippered membranes. 
Here, we have shown that Langerin is constitutively associated with BG and that antibody to Langerin is internalized into these structures. Remarkably, transfection of Langerin cDNA into fibroblasts created a compact network of membrane structures with typical features of BG. Langerin is thus a potent inducer of membrane superimposition and zippering leading to BG formation. Our data suggest that induction of BG is a consequence of the antigen-capture function of Langerin, allowing routing into these organelles and providing access to a nonclassical antigen-processing pathway.50489GeneLangerin50489GeneLangerin50489GeneLangerin50489GeneLangerin50489GeneLangerin50489GeneLangerin10639512title0BCAR1, a human homologue of the adapter protein p130Cas, and antiestrogen resistance in breast cancer cells.9564GeneBCAR19564Genep130Cas9606Specieshumanabstract109Treatment of breast cancer with the antiestrogen tamoxifen is effective in approximately one half of the patients with estrogen receptor-positive disease, but tumors recur frequently because of the development of metastases that are resistant to tamoxifen. We have previously shown that mutagenesis of human estrogen-dependent ZR-75-1 breast cancer cells by insertion of a defective retrovirus genome caused the cells to become antiestrogen resistant. In this study, we isolated and characterized the crucial gene at the breast cancer antiestrogen resistance 1 (BCAR1) locus. Transfer of the BCAR1 locus from retrovirus-mutated, antiestrogen-resistant cells to estrogen-dependent ZR-75-1 cells by cell fusion conferred an antiestrogen-resistant phenotype on the recipient cells. The complete coding sequence of BCAR1 was isolated by use of exon-trapping and complementary DNA (cDNA) library screening. Sequence analysis of human BCAR1 cDNA predicted a protein of 870 amino acids that was strongly homologous to rat p130Cas-adapter protein. 
Genomic analysis revealed that BCAR1 consists of seven exons and is located at chromosome 16q23.1. BCAR1 transcripts were detected in multiple human tissues and were similar in size to transcripts produced by retrovirus-mutated ZR-75-1 cells. Transfection of BCAR1 cDNA into ZR-75-1 cells again resulted in sustained cell proliferation in the presence of antiestrogens, confirming that BCAR1 was the responsible gene in the locus. Overexpression of the BCAR1 gene confers antiestrogen resistance on human ZR-75-1 breast cancer cells. Overexpression of BCAR1 in retrovirus-mutated cells appears to result from activation of the gene's promoter. The isolation and characterization of this gene open new avenues to elucidating mechanisms by which the growth of human breast cancer becomes independent of estrogen.9564Genebreast cancer antiestrogen resistance 19564GeneBCAR19564GeneBCAR19564GeneBCAR19564GeneBCAR125414Genep130Cas-adapter protein9564GeneBCAR19564GeneBCAR19564GeneBCAR19564GeneBCAR19564GeneBCAR19564GeneBCAR19606Speciespatients9606Specieshuman31931Speciesretrovirus9606Specieshuman10116Speciesrat9606Specieshuman31931Speciesretrovirus9606Specieshuman31931Speciesretrovirus9606Specieshuman9606SpeciesZR-75-19606SpeciesZR-75-19606SpeciesZR-75-19606SpeciesZR-75-19606SpeciesZR-75-1 \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml new file mode 100644 index 000000000..9c1283a15 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml @@ -0,0 +1,2 @@ +JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key1378843title0Cloning and expression of a cell surface receptor for advanced glycosylation end products of proteins.abstract103Advanced glycosylation end products of proteins (AGEs) are nonenzymatically glycosylated proteins which accumulate in vascular 
tissue in aging and at an accelerated rate in diabetes. A approximately 35-kDa polypeptide with a unique NH2-terminal sequence has been isolated from bovine lung and found to be present on the surface of endothelial cells where it mediates the binding of AGEs (receptor for advanced glycosylation end product or RAGE). Using an oligonucleotide probe based on the amino-terminal sequence of RAGE, an apparently full-length cDNA of 1.5 kilobases was isolated from a bovine lung cDNA library. This cDNA encoded a 394 amino acid mature protein comprised of the following putative domains: an extracellular domain of 332 amino acids, a single hydrophobic membrane spanning domain of 19 amino acids, and a carboxyl-terminal domain of 43 amino acids. A partial clone encoding the human counterpart of RAGE, isolated from a human lung library, was found to be approximately 90% homologous to the bovine molecule. Based on computer analysis of the amino acid sequence of RAGE and comparison with databases, RAGE is a new member of the immunoglobulin superfamily of cell surface molecules and shares significant homology with MUC 18, NCAM, and the cytoplasmic domain of CD20. Expression of the RAGE cDNA in 293 cells allowed them to bind 125I-AGE-albumin in a saturable and dose-dependent manner (Kd approximately 100 nM), blocked by antibody to RAGE. Western blots of 293 cells transfected with RAGE cDNA probed with anti-RAGE IgG demonstrated expression of immunoreactive protein compared to its absence in mock-transfected cells. 
These results suggest that RAGE functions as a cell surface receptor for AGEs, which could potentially mediate cellular effects of this class of glycosylated proteins.280986GeneRAGE280986GeneRAGE177GeneRAGE280986GeneRAGE280986GeneRAGE505653GeneCD20280986GeneRAGE280986GeneRAGE280986GeneRAGE280986GeneRAGE280986GeneRAGE9913Speciesbovine9913Speciesbovine9606Specieshuman9606Specieshuman9913Speciesbovine10896916title0Alpha(2) adrenoceptors regulate proliferation of human intestinal epithelial cells.150GeneAlpha(2) adrenoceptors9606Specieshumanabstract84Previous studies on rodents have suggested that catecholamines stimulate proliferation of the intestinal epithelium through activation of alpha(2) adrenoceptors located on crypt cells. The occurrence of this effect awaits demonstration in humans and the molecular mechanisms involved have not yet been elucidated. Here, we examined the effect of alpha(2) agonists on a clone of Caco2 cells expressing the human alpha(2A) adrenoceptor. Cells were transfected with a bicistronic plasmid containing the alpha2C10 and neomycin phosphotransferase genes. G418 resistant clones were assayed for receptor expression using radioligand binding. Receptor functionality was assessed by testing its ability to couple Gi proteins and to inhibit cAMP production. Mitogen activated protein kinase (MAPK) phosphorylation was followed by western blot, and cell proliferation was estimated by measuring protein and DNA content. Permanent transfection of Caco2 cells allowed us to obtain a clone (Caco2-3B) expressing alpha(2A) adrenoceptors at a density similar to that found in normal human intestinal epithelium. Caco2-3B retained morphological features and brush border enzyme expression characteristic of enterocytic differentiation. The receptor was coupled to Gi2/Gi3 proteins and its stimulation caused marked diminution of forskolin induced cAMP production. 
Treatment of Caco2-3B with UK14304 (alpha(2) agonist) induced a rapid increase in the phosphorylation state of MAPK, extracellular regulated protein kinase 1 (Erk1), and 2 (Erk2). This event was totally abolished in pertussis toxin treated cells and in the presence of kinase inhibitors (genistein or PD98059). It was unaffected by protein kinase C downregulation but correlated with a transient increase in Shc tyrosine phosphorylation. Finally, sustained exposure of Caco2-3B to UK14304 resulted in modest but significant acceleration of cell proliferation. None of these effects was observed in the parental cell line Caco2. The results obtained in the present study support a regulatory role for alpha(2) adrenoceptors in intestinal cell proliferation.150Genealpha(2) adrenoceptors150Genealpha(2A) adrenoceptor150Genealpha2C105595;5594;5595GeneMAPK5595;5594;5595GeneMAPK5595Geneextracellular regulated protein kinase 15595GeneErk15594GeneErk26464GeneShc150Genealpha(2) adrenoceptors9606Specieshumans9606Specieshuman9606Specieshuman9606SpeciesCaco29606SpeciesCaco2 \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml new file mode 100644 index 000000000..6676e8d34 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml @@ -0,0 +1,2 @@ +JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key10880510title0Human TREK2, a 2P domain mechano-sensitive K+ channel with multiple regulations by polyunsaturated fatty acids, lysophospholipids, and Gs, Gi, and Gq protein-coupled receptors.54207GeneTREK29606SpeciesHumanabstract177Mechano-sensitive and fatty acid-activated K(+) belong to the structural class of K(+) channel with two pore domains. Here, we report the isolation and the characterization of a novel member of this family. 
This channel, called TREK2, is closely related to TREK1 (78% of homology). Its gene is located on chromosome 14q31. TREK2 is abundantly expressed in pancreas and kidney and to a lower level in brain, testis, colon, and small intestine. In the central nervous system, TREK2 has a widespread distribution with the highest levels of expression in cerebellum, occipital lobe, putamen, and thalamus. In transfected cells, TREK2 produces rapidly activating and non-inactivating outward rectifier K(+) currents. The single-channel conductance is 100 picosiemens at +40 mV in 150 mm K(+). The currents can be strongly stimulated by polyunsaturated fatty acid such as arachidonic, docosahexaenoic, and linoleic acids and by lysophosphatidylcholine. The channel is also activated by acidification of the intracellular medium. TREK2 is blocked by application of intracellular cAMP. As with TREK1, TREK2 is activated by the volatile general anesthetics chloroform, halothane, and isoflurane and by the neuroprotective agent riluzole. TREK2 can be positively or negatively regulated by a variety of neurotransmitter receptors. Stimulation of the G(s)-coupled receptor 5HT4sR or the G(q)-coupled receptor mGluR1 inhibits channel activity, whereas activation of the G(i)-coupled receptor mGluR2 increases TREK2 currents. These multiple types of regulations suggest that TREK2 plays an important role as a target of neurotransmitter action.54207GeneTREK23776GeneTREK154207GeneTREK254207GeneTREK254207GeneTREK254207GeneTREK23776GeneTREK154207GeneTREK254207GeneTREK23360Gene5HT4sR2911GenemGluR114800GenemGluR254207GeneTREK254207GeneTREK210803599title0Enhanced growth of MCF-7 breast cancer cells overexpressing parathyroid hormone-related peptide.5744Geneparathyroid hormone-related peptide9606SpeciesMCF-7abstract97PTH-related peptide (PTHrP) is a secreted protein produced by breast cancer cells both in vivo and in vitro. 
Because of its structural similarity to PTH at the amino terminus, the two proteins interact with a common cell surface receptor, the PTH/PTHrP receptor. When overproduced by tumor cells, PTHrP enters the circulation, giving rise to the common paraneoplastic syndrome of humoral hypercalcemia of malignancy. Although initially discovered in malignancies, PTHrP is now known to be produced by most cells and tissues in the body. It acts as an autocrine and paracrine mediator of cell proliferation and differentiation, effects which are mediated via the PTH/PTHrP receptor. Recent evidence also has shown that, directly after translation, PTHrP is able to enter the nucleus and/or nucleolus and influence cell cycle progression and apoptosis. In this study, we have either overproduced PTHrP or inhibited endogenous PTHrP production in the breast cancer cell line, MCF-7. Overexpression of PTHrP was associated with an increase in mitogenesis, whereas inhibiting endogenous PTHrP production resulted in decreased cell proliferation. The overexpressed peptide targeted to the perinuclear space. In contrast, PTHrP interaction with the cell surface PTH/PTHrP receptor resulted in decreased cell proliferation in the same cell line. This latter effect is dependent on interaction with the receptor, in that exogenously added PTHrP moieties known not to interact with the receptor had no effect on cell growth. Furthermore, neutralization of added peptide with an anti-PTHrP antiserum completely abolished the growth inhibitory effects. In contrast, this antibody has no effect on the increased proliferation rate of the MCF-7 transfectants that overexpress PTHrP, compared with control cells. The net effect of autocrine/paracrine and intracrine effects of PTHrP in MCF-7 cells overproducing the peptide is accelerated cell growth. 
These findings have critical implications regarding the role of PTHrP in breast cancer, and they suggest that controlling PTHrP production in breast cancer may be useful therapeutically.5744GenePTH-related peptide5744GenePTHrP5741GenePTH5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP9606SpeciesMCF-7 \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml new file mode 100644 index 000000000..dc8927c84 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml @@ -0,0 +1,2 @@ +JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key10722742title0Mdm2 is a RING finger-dependent ubiquitin protein ligase for itself and p53.4193GeneMdm27157Genep53abstract77Mdm2 has been shown to regulate p53 stability by targeting the p53 protein for proteasomal degradation. We now report that Mdm2 is a ubiquitin protein ligase (E3) for p53 and that its activity is dependent on its RING finger. Furthermore, we show that Mdm2 mediates its own ubiquitination in a RING finger-dependent manner, which requires no eukaryotic proteins other than ubiquitin-activating enzyme (E1) and an ubiquitin-conjugating enzyme (E2). It is apparent, therefore, that Mdm2 manifests an intrinsic capacity to mediate ubiquitination. Mutation of putative zinc coordination residues abrogated this activity, as did chelation of divalent cations. After cation chelation, the full activity could be restored by addition of zinc. 
We further demonstrate that the degradation of p53 and Mdm2 in cells requires additional potential zinc-coordinating residues beyond those required for the intrinsic activity of Mdm2 in vitro. Replacement of the Mdm2 RING with that of another protein (Praja1) reconstituted ubiquitination and proteasomal degradation of Mdm2. However, this RING was ineffective in ubiquitination and proteasomal targeting of p53, suggesting that there may be specificity at the level of the RING in the recognition of heterologous substrates.4193GeneMdm27157Genep537157Genep534193GeneMdm27157Genep534193GeneMdm27318Geneubiquitin-activating enzyme (E1)4193GeneMdm27157Genep534193GeneMdm24193GeneMdm24193GeneMdm264219GenePraja14193GeneMdm27157Genep531770008title0Structural analysis and expression of human desmoglein: a cadherin-like component of the desmosome.1828;281131Genedesmoglein1000Genecadherin9606Specieshumanabstract100Desmosomes are adhesive cell junctions found in great abundance in tissues that experience mechanical stress. The transmembrane desmosomal glycoproteins have been proposed to play a role in cell adhesion; desmoglein I (DGI) is a major member of this class of desmosomal molecules. However, evidence supporting a role for DGI in cell adhesion or in the plaque is lacking. In order to begin to understand DGI function we have identified human cDNA clones encoding the entire mature polypeptide of 1000 amino acids. Our data suggest that like the bovine DGI molecule human DGI is highly related to the calcium-dependent class of cell adhesion molecules known as cadherins. Four related extracellular domains located in the amino-terminal domain of the molecule contain putative calcium binding sites originally identified in the cadherins. The highest degree of similarity between human N-cadherin and human DGI, and likewise between bovine DGI and human DGI, is greatest in the most amino-terminal extracellular domain. 
This suggests a conserved functional role for the extracellular domains, perhaps in calcium-mediated cell adhesion. The cytoplasmic portion of the molecule contains a cadherin-like region and, like bovine DGI, a carboxy-terminal tail that is not present in the cadherins, comprising three additional domains. One of these contains a novel repeating motif of 29 +/- 1 residues, first identified in bovine DGI. Each of the highly homologous repeating units is likely to consist of two beta-strands and two turns with special characteristics. Five amino acids that are identical in bovine and human DGI lie in the second of the two predicted beta-strands, and intriguingly contain putative target sites for protein kinase C. On the basis of structural analysis, a model predicting the disposition of human DGI domains in the desmosome is proposed. Northern analysis suggests that unlike bovine epidermis, which expresses a single mRNA of reported size approximately 7.6 kb, human foreskin and cultured keratinocytes display a complex pattern with bands of approximately 7.2, 4.0 and 3.0 kb. Each of these cross-hybridizing mRNAs is coordinately expressed in normal human keratinocytes in response to long-term culture and increased calcium.1828Genedesmoglein I1828GeneDGI1828GeneDGI1828GeneDGI281131GeneDGI1828GeneDGI1000GeneN-cadherin1828GeneDGI281131GeneDGI1828GeneDGI281131GeneDGI281131GeneDGI1828GeneDGI9606Specieshuman9913Speciesbovine9606Specieshuman9606Specieshuman9606Specieshuman9913Speciesbovine9606Specieshuman9913Speciesbovine9913Speciesbovine9913Speciesbovine9606Specieshuman9606Specieshuman9913Speciesbovine9606Specieshuman9606Specieshuman1828GeneDGI \ No newline at end of file diff --git a/jcore-gnp-bioc-writer/LICENSE b/jcore-gnp-bioc-writer/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-gnp-bioc-writer/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-gnp-bioc-writer/README.md b/jcore-gnp-bioc-writer/README.md new file mode 100644 index 000000000..6b6af0a20 --- /dev/null +++ b/jcore-gnp-bioc-writer/README.md @@ -0,0 +1,34 @@ +# JCoRe GNormPlus BioC Writer + +**Descriptor Path**: +``` +de.julielab.jcore.consumer.gnp.desc.jcore-gnp-bioc-writer +``` + +Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. 
Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? diff --git a/jcore-gnp-bioc-writer/component.meta b/jcore-gnp-bioc-writer/component.meta new file mode 100644 index 000000000..f8b942bf4 --- /dev/null +++ b/jcore-gnp-bioc-writer/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "consumer" + ], + "description": "Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus.", + "descriptors": [ + { + "category": "consumer", + "location": "de.julielab.jcore.consumer.gnp.desc.jcore-gnp-bioc-writer" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-gnp-bioc-writer", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe GNormPlus BioC Writer" +} diff --git a/jcore-gnp-bioc-writer/pom.xml b/jcore-gnp-bioc-writer/pom.xml new file mode 100644 index 000000000..12e5354c1 --- /dev/null +++ b/jcore-gnp-bioc-writer/pom.xml @@ -0,0 +1,62 @@ + + + + 4.0.0 + jcore-gnp-bioc-writer + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0 + + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-types-version} + + + com.pengyifan.bioc + pengyifan-bioc + 1.0.3 + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + org.junit.jupiter + junit-jupiter-engine + + + org.assertj + assertj-core + + + de.julielab + jcore-descriptor-creator + + + JCoRe GNormPlus BioC Writer + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-gnp-bioc-writer + Writes CAS documents into the BioC XML format used by the gene 
package de.julielab.jcore.consumer.gnp;

import com.pengyifan.bioc.BioCCollection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLStreamException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

/**
 * Writes a collection of BioC documents into a single file. That file is created within a subdirectory of
 * some base directory and changes over time to avoid overflowing directories.
 */
public class BioCCollectionWriter {
    private final static Logger log = LoggerFactory.getLogger(BioCCollectionWriter.class);
    // Maximum number of collection files to place into one subdirectory before opening a new one.
    private final int numFilesPerDir;
    // Base directory below which the numbered bioc_collections_<n> subdirectories are created.
    private final Path baseDir;
    // Subdirectory currently being filled; null before the first write and after a directory was "closed".
    private Path currentDir;
    // Number of files written into currentDir so far.
    private int numWrittenIntoCurrentDir;
    // Numeric suffix of currentDir; also embedded in the file names to keep them unique across directories.
    private int currentDirNum;

    /**
     * @param numFilesPerDir Number of collection files to write into one subdirectory before creating a new one.
     * @param baseDir        Base directory below which the collection subdirectories are created.
     */
    public BioCCollectionWriter(int numFilesPerDir, Path baseDir) {
        this.numFilesPerDir = numFilesPerDir;
        this.baseDir = baseDir;
    }

    /**
     * Serializes the given BioC collection into a new, uniquely named XML file below the base directory.
     * <p>
     * All instances synchronize on the class object, so multiple writers sharing the same base directory
     * will not choose the same target file name.
     *
     * @param collection The BioC collection to write.
     * @throws XMLStreamException If XML serialization fails.
     * @throws IOException        If the target directory or file cannot be created or written.
     */
    public void writeBioCCollection(BioCCollection collection) throws XMLStreamException, IOException {
        Path collectionFile;
        synchronized (BioCCollectionWriter.class) {
            // currentDir is either null at the very beginning or after a batch of documents has been written
            if (currentDir == null) {
                int i = 0;
                do {
                    currentDirNum = i++;
                    currentDir = Path.of(baseDir.toString(), "bioc_collections_" + currentDirNum);
                } while (Files.exists(currentDir));
            }
            // find the next free file name within the current directory
            int i = 0;
            do {
                collectionFile = Path.of(currentDir.toString(), "bioc_collection_" + currentDirNum + "_" + i++ + ".xml");
            } while (Files.exists(collectionFile));
            if (!Files.exists(collectionFile.getParent())) {
                log.debug("Creating BioC collection directory {}", collectionFile.getParent());
                Files.createDirectories(collectionFile.getParent());
            }
        }
        // try-with-resources flushes and closes the underlying stream even on errors; the previous code
        // never closed the writer, which could leak file handles and leave output truncated.
        try (com.pengyifan.bioc.io.BioCCollectionWriter writer = new com.pengyifan.bioc.io.BioCCollectionWriter(collectionFile)) {
            writer.writeCollection(collection);
        }
        synchronized (BioCCollectionWriter.class) {
            // "close" the current directory if the number of files for it has been reached; the counter is
            // updated under the same lock that guards the directory state to avoid lost updates (the
            // previous code mutated these fields without holding the lock).
            if (++numWrittenIntoCurrentDir >= numFilesPerDir) {
                currentDir = null;
                numWrittenIntoCurrentDir = 0;
            }
        }
    }
}
+ */ +public class BioCDocumentPopulator { + private final static Logger log = LoggerFactory.getLogger(BioCDocumentPopulator.class); + private boolean addGenes; + private Class geneTypeClass; + + public BioCDocumentPopulator(boolean addGenes, String geneTypeName) throws ClassNotFoundException { + this.addGenes = addGenes; + geneTypeClass = (Class) Class.forName(geneTypeName); + } + + public BioCDocument populate(JCas jCas) { + BioCDocument doc = new BioCDocument(JCoReTools.getDocId(jCas)); + AnnotationIndex zoneIndex = jCas.getAnnotationIndex(Zone.type); + int annotationId = 0; + for (Zone z : zoneIndex) { + // skip empty zones + if (z.getCoveredText().isBlank()) + continue; + BioCPassage p = null; + if (z instanceof Title) { + Title t = (Title) z; + String titleType; + String titleTypeString = t.getTitleType(); + if (titleTypeString == null) + titleTypeString = "other"; + switch (titleTypeString) { + case "document": + titleType = "title"; + break; + case "section": + titleType = "section_title"; + break; + case "figure": + titleType = "figure_title"; + break; + case "table": + titleType = "table_title"; + break; + case "abstractSection": + // abstract sections are part of the AbstractText which is handled below + titleType = "null"; + break; + case "other": + titleType = "other_title"; + default: + log.debug("Unhandled title type {}", titleTypeString); + titleType = "other_title"; + break; + } + if (titleType != null) { + p = getPassageForAnnotation(t); + p.putInfon("type", titleType); + doc.addPassage(p); + } + } else if (z instanceof AbstractText) { + AbstractText at = (AbstractText) z; + p = getPassageForAnnotation(at); + p.putInfon("type", "abstract"); + doc.addPassage(p); + } else if (z instanceof Paragraph) { + Paragraph pa = (Paragraph) z; + p = getPassageForAnnotation(pa); + p.putInfon("type", "paragraph"); + doc.addPassage(p); + } else if (z instanceof Caption) { + Caption c = (Caption) z; + p = getPassageForAnnotation(c); + if (c.getCaptionType() == 
null) + throw new IllegalArgumentException("The captionType feature is null for " + c); + p.putInfon("type", c.getCaptionType()); + doc.addPassage(p); + } + if (addGenes) { + annotationId = addGenesToPassage(jCas, z, p, annotationId); + } + } + return doc; + } + + private int addGenesToPassage(JCas jCas, Zone z, BioCPassage p, int annotationId) { + if (p != null) { + Iterable geneIt = JCasUtil.subiterate(jCas, geneTypeClass, z, false, true); + for (ConceptMention g : geneIt) { + BioCAnnotation annotation = new BioCAnnotation(String.valueOf(annotationId++)); + annotation.setText(g.getCoveredText()); + String type = "Gene"; + String specificType = g.getSpecificType() != null ? g.getSpecificType().toLowerCase() : null; + // 'familiy' is an entity name typo in the ProGene corpus + if (specificType != null && (specificType.contains("familiy") || specificType.contains("family") || specificType.contains("complex"))) + type = "FamilyName"; + else if (specificType != null && specificType.contains("domain")) + type = "DomainMotif"; + annotation.putInfon("type", type); + annotation.addLocation(new BioCLocation(g.getBegin(), g.getEnd() - g.getBegin())); + p.addAnnotation(annotation); + } + } + return annotationId; + } + + /** + * Creates a BioCPassage with offset and text corresponding to the passed annotation a. + * + * @param a The annotation to create a BioCPassage for. + * @return A BioCPassage corresponding to a in offset and text. + */ + private BioCPassage getPassageForAnnotation(Annotation a) { + BioCPassage p = new BioCPassage(); + p.setOffset(a.getBegin()); + // GNormPlus doesn't seem to handle newlines well. It resulted in missing annotations when testing if the + // output format is handled well by GNormPlus. 
+ p.setText(a.getCoveredText().replaceAll("\n", " ")); + p.putInfon("uimatype", a.getClass().getCanonicalName()); + return p; + } +} diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java new file mode 100644 index 000000000..df2803243 --- /dev/null +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java @@ -0,0 +1,103 @@ +package de.julielab.jcore.consumer.gnp; + +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.BioCDocument; +import de.julielab.jcore.types.Gene; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Path; +import java.util.Date; +import java.util.Optional; + +@ResourceMetaData(name = "JCoRe GNormPlus BioC Writer", description = "Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus.", vendor = "JULIE Lab Jena, Germany") +@TypeCapability(inputs = {}, outputs = {}) +public class GNormPlusFormatWriter extends JCasAnnotator_ImplBase { + + public static final String PARAM_NUM_DOCS_PER_FILE = "NumDocsPerFile"; + public static final String PARAM_NUM_FILES_PER_DIR = "NumFilesPerDir"; + public static final String PARAM_BASE_DIR = "BaseDirectory"; + public static final String PARAM_ADD_GENES = "AddGenes"; + public static final String PARAM_GENE_TYPE_NAME = "GeneTypeName"; + private final 
static Logger log = LoggerFactory.getLogger(GNormPlusFormatWriter.class); + @ConfigurationParameter(name = PARAM_NUM_DOCS_PER_FILE, description = "The number of documents (i.e. CASes) that should be written into a single BioC XML file.") + private int numDocsPerFile; + @ConfigurationParameter(name = PARAM_NUM_FILES_PER_DIR, description = "The number of files that should be put in a directory before a new one is created.") + private int numDocsPerDir; + @ConfigurationParameter(name = PARAM_BASE_DIR, description = "The base directory into which to create new directories that contain the actual BioC collection files.") + private String baseDirectory; + @ConfigurationParameter(name=PARAM_ADD_GENES, mandatory = false, defaultValue = "false", description = "If set to true, all Gene annotations in the CAS will be added to the BioC documents. The default type used is de.julielab.jcore.types.Gene. This can be changed with the " + PARAM_GENE_TYPE_NAME + " parameter.") + private boolean addGenes; + @ConfigurationParameter(name=PARAM_GENE_TYPE_NAME, mandatory = false, defaultValue = "de.julielab.jcore.types.Gene", description = "The UIMA type denoting gene annotations that should be written into the BioC format when the " + PARAM_ADD_GENES + " parameter is set to true.") + private String geneTypeName; + + private BioCDocumentPopulator bioCDocumentPopulator; + private BioCCollectionWriter bioCCollectionWriter; + private BioCCollection currentCollection; + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. 
+ */ + @Override + public void initialize(final UimaContext aContext) throws ResourceInitializationException { + numDocsPerFile = (int) aContext.getConfigParameterValue(PARAM_NUM_DOCS_PER_FILE); + numDocsPerDir = (int) aContext.getConfigParameterValue(PARAM_NUM_FILES_PER_DIR); + baseDirectory = (String) aContext.getConfigParameterValue(PARAM_BASE_DIR); + addGenes = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_GENES)).orElse(false); + geneTypeName = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_GENE_TYPE_NAME)).orElse(Gene.class.getCanonicalName()); + + try { + bioCDocumentPopulator = new BioCDocumentPopulator(addGenes, geneTypeName); + bioCCollectionWriter = new BioCCollectionWriter(numDocsPerDir, Path.of(baseDirectory)); + + currentCollection = new BioCCollection("UTF-8", "1.0", new Date().toString(), true, "JCoRe GNormPlus BioC Writer", "PubTator.key"); + } catch (ClassNotFoundException e) { + log.error("Gene annotation class {} could not be found.", geneTypeName, e); + throw new ResourceInitializationException(e); + } + } + + /** + * This method is called for each document going through the component. This + * is where the actual work happens. 
+ */ + @Override + public void process(final JCas jCas) throws AnalysisEngineProcessException { + try { + BioCDocument doc = bioCDocumentPopulator.populate(jCas); + if (doc.getPassageCount() > 0) + currentCollection.addDocument(doc); + if (currentCollection.getDocmentCount() >= numDocsPerFile) { + bioCCollectionWriter.writeBioCCollection(currentCollection); + currentCollection.clearDocuments(); + currentCollection.clearInfons(); + } + } catch (Exception e) { + log.error("Exception was raised for document {}", JCoReTools.getDocId(jCas)); + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException { + super.collectionProcessComplete(); + try { + System.out.println("CollectionProcessComplete: Writing BioC collection of size " + currentCollection.getDocmentCount()); + bioCCollectionWriter.writeBioCCollection(currentCollection); + } catch (Throwable e) { + log.error("Could not write final batch of BioCDocuments.", e); + throw new AnalysisEngineProcessException(e); + } + } +} + diff --git a/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml new file mode 100644 index 000000000..06a19dd75 --- /dev/null +++ b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml @@ -0,0 +1,83 @@ + + + org.apache.uima.java + true + de.julielab.jcore.consumer.gnp.GNormPlusFormatWriter + + JCoRe GNormPlus BioC Writer + Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus. + 2.6.0 + JULIE Lab Jena, Germany + + + NumDocsPerFile + The number of documents (i.e. CASes) that should be written into a single BioC XML file. + Integer + false + true + + + NumFilesPerDir + The number of files that should be put in a directory before a new one is created. 
+ Integer + false + true + + + BaseDirectory + The base directory into which to create new directories that contain the actual BioC collection files. + String + false + true + + + AddGenes + If set to true, all Gene annotations in the CAS will be added to the BioC documents. The default type used is de.julielab.jcore.types.Gene. This can be changed with the GeneTypeName parameter. + Boolean + false + false + + + GeneTypeName + The UIMA type denoting gene annotations that should be written into the BioC format when the AddGenes parameter is set to true. + String + false + false + + + + + AddGenes + + false + + + + GeneTypeName + + de.julielab.jcore.types.Gene + + + + + + + + + + + + + + + + + + + + true + true + false + + + \ No newline at end of file diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java new file mode 100644 index 000000000..dc4af2060 --- /dev/null +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java @@ -0,0 +1,106 @@ +package de.julielab.jcore.consumer.gnp; + +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.BioCDocument; +import com.pengyifan.bioc.io.BioCCollectionWriter; +import de.julielab.jcore.types.Gene; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayOutputStream; +import java.nio.charset.StandardCharsets; +import java.util.Date; + +import static org.assertj.core.api.Assertions.assertThat; +class BioCDocumentPopulatorTest { + @Test + public void populate() throws Exception { + BioCDocumentPopulator populator = new BioCDocumentPopulator(false, Gene.class.getCanonicalName()); + JCas jCas = TestDocumentGenerator.prepareCas(1); + BioCDocument biocDoc = populator.populate(jCas); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BioCCollection collection = new 
BioCCollection("UTF-8", "1.0", (new Date()).toString(), true, "jUnit Test", "PubTator.key"); + collection.addDocument(biocDoc); + BioCCollectionWriter collectionWriter = new BioCCollectionWriter(baos); + collectionWriter.writeCollection(collection); + String resultXml = baos.toString(StandardCharsets.UTF_8); + // Just check that the test text contents are there that are used in TestDocumentGenerator and that + // there are no duplicates + assertThat(resultXml).containsOnlyOnce("This is the title of document 1."); + assertThat(resultXml).containsOnlyOnce("title"); + // The abstract should be one single string + assertThat(resultXml).containsOnlyOnce("This abstract section belongs to document 1. There are certainly some results reported by document 1."); + assertThat(resultXml).containsOnlyOnce("INTRODUCTION"); + assertThat(resultXml).containsOnlyOnce("section_title"); + assertThat(resultXml).contains("paragraph"); + assertThat(resultXml).containsOnlyOnce("This is section 1, paragraph 1 of document 1."); + assertThat(resultXml).containsOnlyOnce("This is a second paragraph in the first section."); + assertThat(resultXml).containsOnlyOnce("table_title"); + assertThat(resultXml).containsOnlyOnce("Tab1."); + assertThat(resultXml).containsOnlyOnce("This is the table1 caption."); + } + + @Test + public void populateWithGenes() throws Exception { + BioCDocumentPopulator populator = new BioCDocumentPopulator(true, Gene.class.getCanonicalName()); + JCas jCas = TestDocumentGenerator.prepareCas(1); + new Gene(jCas, 0, 4).addToIndexes(); + new Gene(jCas, 87, 96).addToIndexes(); + BioCDocument biocDoc = populator.populate(jCas); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BioCCollection collection = new BioCCollection("UTF-8", "1.0", (new Date()).toString(), true, "jUnit Test", "PubTator.key"); + collection.addDocument(biocDoc); + BioCCollectionWriter collectionWriter = new BioCCollectionWriter(baos); + collectionWriter.writeCollection(collection); + String 
resultXml = baos.toString(StandardCharsets.UTF_8); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).contains("Gene"); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).containsOnlyOnce("This"); + + assertThat(resultXml).contains(""); + assertThat(resultXml).contains("Gene"); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).containsOnlyOnce("certainly"); + } + + @Test + public void populateWithGeneFamilies() throws Exception { + BioCDocumentPopulator populator = new BioCDocumentPopulator(true, Gene.class.getCanonicalName()); + JCas jCas = TestDocumentGenerator.prepareCas(1); + Gene gene = new Gene(jCas, 0, 4); + gene.setSpecificType("protein_familiy_or_group"); + gene.addToIndexes(); + BioCDocument biocDoc = populator.populate(jCas); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BioCCollection collection = new BioCCollection("UTF-8", "1.0", (new Date()).toString(), true, "jUnit Test", "PubTator.key"); + collection.addDocument(biocDoc); + BioCCollectionWriter collectionWriter = new BioCCollectionWriter(baos); + collectionWriter.writeCollection(collection); + String resultXml = baos.toString(StandardCharsets.UTF_8); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).contains("FamilyName"); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).containsOnlyOnce("This"); + } + + @Test + public void populateWithGeneFamilies2() throws Exception { + BioCDocumentPopulator populator = new BioCDocumentPopulator(true, Gene.class.getCanonicalName()); + JCas jCas = TestDocumentGenerator.prepareCas(1); + Gene gene = new Gene(jCas, 0, 4); + gene.setSpecificType("FamilyName"); + gene.addToIndexes(); + BioCDocument biocDoc = populator.populate(jCas); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BioCCollection collection = new BioCCollection("UTF-8", "1.0", (new Date()).toString(), true, "jUnit Test", "PubTator.key"); + collection.addDocument(biocDoc); + 
BioCCollectionWriter collectionWriter = new BioCCollectionWriter(baos); + collectionWriter.writeCollection(collection); + String resultXml = baos.toString(StandardCharsets.UTF_8); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).contains("FamilyName"); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).containsOnlyOnce("This"); + } +} \ No newline at end of file diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java new file mode 100644 index 000000000..1657961be --- /dev/null +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java @@ -0,0 +1,121 @@ +package de.julielab.jcore.consumer.gnp; + + +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.io.BioCCollectionReader; +import de.julielab.jcore.types.Title; +import de.julielab.jcore.types.pubmed.Header; +import org.apache.commons.io.FileUtils; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.Collectors; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Unit tests for jcore-gnp-bioc-writer. 
+ */ +public class GNormPlusFormatWriterTest { + + private static final Path BASEDIR = Path.of("src", "test", "resources", "testoutput"); + +// @AfterAll + public static void cleanFinally() { + FileUtils.deleteQuietly(BASEDIR.toFile()); + } + + @BeforeEach + public void cleanOutput() { + FileUtils.deleteQuietly(BASEDIR.toFile()); + } + + private AnalysisEngine getWriterInstance(int docsPerFile, int filesPerDir) throws ResourceInitializationException { + return AnalysisEngineFactory.createEngine(GNormPlusFormatWriter.class, GNormPlusFormatWriter.PARAM_BASE_DIR, BASEDIR.toString(), GNormPlusFormatWriter.PARAM_NUM_DOCS_PER_FILE, docsPerFile, GNormPlusFormatWriter.PARAM_NUM_FILES_PER_DIR, filesPerDir); + } + + @Test + public void process1() throws Exception { + // write a single document + JCas jCas = TestDocumentGenerator.prepareCas(1); + AnalysisEngine writer = getWriterInstance(1, 1); + writer.process(jCas); + writer.collectionProcessComplete(); + + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0_0.xml")).exists().isNotEmptyFile(); + } + + @Test + public void process2() throws Exception { + // write a single document + JCas jCas = TestDocumentGenerator.createTestJCas(); + AnalysisEngine writer = getWriterInstance(2, 3); + for (int i = 0; i < 15; ++i) { + TestDocumentGenerator.prepareCas(jCas, i); + writer.process(jCas); + jCas.reset(); + } + writer.collectionProcessComplete(); + + assertThat(Files.list(BASEDIR)).hasSize(3); + for (int i : List.of(0, 1, 2)) { + List fileIndices = i < 2 ? 
List.of(0, 1, 2) : List.of(0,1); + for (int j : fileIndices) { + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_"+i, "bioc_collection_"+i+"_"+j+".xml")).exists().isNotEmptyFile(); + } + } + // there should only be two files in the last directory + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_2", "bioc_collection_2_2.xml")).doesNotExist(); + + // the last file should only contain a single document + BioCCollectionReader reader = new BioCCollectionReader(Path.of(BASEDIR.toString(), "bioc_collections_2", "bioc_collection_2_1.xml")); + BioCCollection lastCollection = reader.readCollection(); + assertThat(lastCollection.getDocmentCount()).isEqualTo(1); + + } + + @Test + public void omitEmptyDocuments() throws Exception { + // GNormPlus doesn't handle documents well which do not have any passage. Then, at some later document in the same collection, array out of bounds exceptions appear. + // Make sure we just don't write empty documents. They wouldn't have any annotations anyway. 
+ JCas jCas = TestDocumentGenerator.createTestJCas(); + Header h = new Header(jCas); + h.setDocId("1"); + h.addToIndexes(); + AnalysisEngine writer = getWriterInstance(1, 1); + writer.process(jCas); + jCas.reset(); + jCas.setDocumentText("Hello."); + Header h2 = new Header(jCas); + h2.setDocId("2"); + h2.addToIndexes(); + Title title = new Title(jCas, 0, 6); + title.setTitleType("document"); + title.addToIndexes(); + writer.process(jCas); + writer.collectionProcessComplete(); + // assert that no empty documents were written into the collection + assertThat(Files.lines(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0_0.xml")).map(String::trim).collect(Collectors.joining())).doesNotContain(""); + assertThat(Files.lines(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0_0.xml")).map(String::trim).collect(Collectors.joining())).contains("2"); + } + + @Test + public void omitEmptyDocuments2() throws Exception { + // Additionally to not writing empty documents, we also don't want to write empty collections. This, too, causes out of bounds errors in GNormPlus. 
+ JCas jCas = TestDocumentGenerator.createTestJCas(); + Header h = new Header(jCas); + h.setDocId("1"); + h.addToIndexes(); + AnalysisEngine writer = getWriterInstance(1, 1); + writer.process(jCas); + // assert that no empty documents were written into the collection + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0_0.xml")).doesNotExist(); + } +} diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java new file mode 100644 index 000000000..17e13f984 --- /dev/null +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java @@ -0,0 +1,87 @@ +package de.julielab.jcore.consumer.gnp; + +import de.julielab.jcore.types.*; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; + +public class TestDocumentGenerator { + + public static JCas createTestJCas() throws UIMAException { + return JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-semantics-biology-types"); + } + + public static JCas prepareCas(int docId) throws UIMAException { + JCas jCas = createTestJCas(); + return prepareCas(jCas, docId); + } + + public static JCas prepareCas(JCas jCas, int docId) { + Header h = new de.julielab.jcore.types.pubmed.Header(jCas); + h.setDocId(String.valueOf(docId)); + h.addToIndexes(); + + StringBuilder sb = new StringBuilder(); + String ls = System.getProperty("line.separator"); + int currentBegin = sb.length(); + sb.append("This is the title of document ").append(docId).append("."); + Title t = new Title(jCas, currentBegin, sb.length()); + t.setTitleType("document"); + t.addToIndexes(); + currentBegin = sb.length(); + 
sb.append("This abstract section belongs to document ").append(docId).append("."); + AbstractSectionHeading ash1 = new AbstractSectionHeading(jCas); + ash1.setLabel("BACKGROUND"); + ash1.setTitleType("abstract"); + AbstractSection as1 = new AbstractSection(jCas, currentBegin, sb.length()); + as1.setAbstractSectionHeading(ash1); + sb.append(ls); + currentBegin = sb.length(); + sb.append("There are certainly some results reported by document ").append(docId).append("."); + AbstractSectionHeading ash2 = new AbstractSectionHeading(jCas); + ash2.setLabel("RESULTS"); + ash2.setTitleType("abstract"); + AbstractSection as2 = new AbstractSection(jCas, currentBegin, sb.length()); + as2.setAbstractSectionHeading(ash2); + AbstractText at = new AbstractText(jCas, as1.getBegin(), as2.getEnd()); + at.setStructuredAbstractParts(JCoReTools.addToFSArray(JCoReTools.addToFSArray(null, as1), as2)); + at.addToIndexes(); + sb.append(ls); + currentBegin = sb.length(); + sb.append("INTRODUCTION This is section 1, paragraph 1 of document ").append(docId).append("."); + SectionTitle st1 = new SectionTitle(jCas, currentBegin, currentBegin + 12); + st1.setTitleType("section"); + Section s1 = new Section(jCas, currentBegin, sb.length()); + st1.addToIndexes(); + s1.setSectionHeading(st1); + s1.addToIndexes(); + // paragraphs do not include the heading + Paragraph p11 = new Paragraph(jCas, s1.getBegin() + 13, s1.getEnd()); + p11.addToIndexes(); + currentBegin = sb.length(); + sb.append("This is a second paragraph in the first section."); + Paragraph p12 = new Paragraph(jCas, currentBegin, sb.length()); + p12.addToIndexes(); + currentBegin = sb.length(); + int objectBegin = sb.length(); + sb.append("Let this be table content."); + currentBegin = sb.length(); + sb.append("Tab1."); + Title tabTitle = new Title(jCas, currentBegin, sb.length()); + tabTitle.setTitleType("table"); + tabTitle.addToIndexes(); + currentBegin = sb.length(); + sb.append("This is the table1 caption."); + Caption tCap = new 
Caption(jCas, currentBegin, sb.length()); + tCap.setCaptionType("table"); + tCap.addToIndexes(); + Table tab = new Table(jCas, objectBegin, sb.length()); + tab.setObjectTitle(tabTitle); + tab.setObjectCaption(tCap); + tab.addToIndexes(); + tab.addToIndexes(); + jCas.setDocumentText(sb.toString()); + return jCas; + } +} diff --git a/jcore-iexml-consumer/component.meta b/jcore-iexml-consumer/component.meta index 621a4d340..6156d38c4 100644 --- a/jcore-iexml-consumer/component.meta +++ b/jcore-iexml-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-iexml-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe IEXML Consumer" } diff --git a/jcore-iexml-consumer/pom.xml b/jcore-iexml-consumer/pom.xml index 0cca60dfb..5337141d4 100644 --- a/jcore-iexml-consumer/pom.xml +++ b/jcore-iexml-consumer/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 Generates stand-off IEXML files as used in the Mantra challenge. @@ -74,11 +74,11 @@ de.julielab jcore-mantra-xml-types - 2.5.1-SNAPSHOT + 2.6.0 - junit - junit + org.junit.jupiter + junit-jupiter-engine
diff --git a/jcore-iexml-consumer/src/main/resources/de/julielab/jcore/consumer/iexml/desc/jcore-iexml-consumer.xml b/jcore-iexml-consumer/src/main/resources/de/julielab/jcore/consumer/iexml/desc/jcore-iexml-consumer.xml index 3d3cfbee2..7c2ac53ad 100644 --- a/jcore-iexml-consumer/src/main/resources/de/julielab/jcore/consumer/iexml/desc/jcore-iexml-consumer.xml +++ b/jcore-iexml-consumer/src/main/resources/de/julielab/jcore/consumer/iexml/desc/jcore-iexml-consumer.xml @@ -5,7 +5,7 @@ JCoRe IEXML Consumer - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-iexml-reader/component.meta b/jcore-iexml-reader/component.meta index eac29d502..5054c474a 100644 --- a/jcore-iexml-reader/component.meta +++ b/jcore-iexml-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-iexml-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe IEXML Reader" } diff --git a/jcore-iexml-reader/pom.xml b/jcore-iexml-reader/pom.xml index 94b02b301..20516c98e 100644 --- a/jcore-iexml-reader/pom.xml +++ b/jcore-iexml-reader/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -75,11 +75,11 @@ de.julielab jcore-mantra-xml-types - 2.5.1-SNAPSHOT + 2.6.0 - junit - junit + org.junit.jupiter + junit-jupiter-engine Reader for IEXML files as used in the Mantra project/challenge diff --git a/jcore-iexml-reader/src/main/resources/de/julielab/jcore/reader/iexml/desc/jcore-iexml-reader.xml b/jcore-iexml-reader/src/main/resources/de/julielab/jcore/reader/iexml/desc/jcore-iexml-reader.xml index 89f48191c..f26160c4b 100644 --- a/jcore-iexml-reader/src/main/resources/de/julielab/jcore/reader/iexml/desc/jcore-iexml-reader.xml +++ b/jcore-iexml-reader/src/main/resources/de/julielab/jcore/reader/iexml/desc/jcore-iexml-reader.xml @@ -5,7 +5,7 @@ JCoRe IEXML Reader - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-ign-reader/component.meta b/jcore-ign-reader/component.meta index 
9ea912d40..82c8dd0a4 100644 --- a/jcore-ign-reader/component.meta +++ b/jcore-ign-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-ign-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe IGN Reader" } diff --git a/jcore-ign-reader/pom.xml b/jcore-ign-reader/pom.xml index f1f2ebfd5..96be0ee0d 100644 --- a/jcore-ign-reader/pom.xml +++ b/jcore-ign-reader/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-ign-reader @@ -17,7 +17,7 @@ com.pengyifan.bioc pengyifan-bioc - 1.0.2 + 1.0.3 de.julielab @@ -35,8 +35,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-ign-reader/src/main/resources/de/julielab/jcore/reader/ign/desc/jcore-ign-reader.xml b/jcore-ign-reader/src/main/resources/de/julielab/jcore/reader/ign/desc/jcore-ign-reader.xml index 91d8abac5..e0cf416e5 100644 --- a/jcore-ign-reader/src/main/resources/de/julielab/jcore/reader/ign/desc/jcore-ign-reader.xml +++ b/jcore-ign-reader/src/main/resources/de/julielab/jcore/reader/ign/desc/jcore-ign-reader.xml @@ -7,7 +7,7 @@ The IGNReader reads IGN corpus files in BioC-format. There are XML files comprising the actual text (as well as passage and sentence annotations) and there are separate XML files comprising the annotations. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-ign-reader/src/test/java/de/julielab/jcore/reader/ign/IGNReaderTest.java b/jcore-ign-reader/src/test/java/de/julielab/jcore/reader/ign/IGNReaderTest.java index 11e48e537..e0fdec94c 100644 --- a/jcore-ign-reader/src/test/java/de/julielab/jcore/reader/ign/IGNReaderTest.java +++ b/jcore-ign-reader/src/test/java/de/julielab/jcore/reader/ign/IGNReaderTest.java @@ -19,11 +19,11 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Collection; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class IGNReaderTest { private static final String READER_DESCRIPTOR = "de.julielab.jcore.reader.ign.desc.jcore-ign-reader"; diff --git a/jcore-iob-consumer/.gitignore b/jcore-iob-consumer/.gitignore index 2960b6e4b..673a2bee7 100644 --- a/jcore-iob-consumer/.gitignore +++ b/jcore-iob-consumer/.gitignore @@ -1 +1 @@ -src/test/resources/iob-output/ +src/test/resources/iob-output/* diff --git a/jcore-iob-consumer/component.meta b/jcore-iob-consumer/component.meta index faa7e6b5e..e8ae24666 100644 --- a/jcore-iob-consumer/component.meta +++ b/jcore-iob-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-iob-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe CAS to IOB Consumer" } diff --git a/jcore-iob-consumer/pom.xml b/jcore-iob-consumer/pom.xml index 7625d1c8c..74fcec1f2 100644 --- a/jcore-iob-consumer/pom.xml +++ b/jcore-iob-consumer/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -34,14 +34,14 @@ 1.0.7 - junit - junit - - - commons-io - commons-io - test + org.junit.jupiter + junit-jupiter-engine + + + + + JCoRe CAS to IOB Consumer diff --git 
a/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumer.java b/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumer.java index c2e21d98d..de57edd6f 100644 --- a/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumer.java +++ b/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumer.java @@ -75,7 +75,7 @@ public class ToIOBConsumer extends JCasAnnotator_ImplBase { private final String PARAGRAPH_END_MARK = "PARAGRAPH_END_MARKER"; // there will be 2 empty lines for each sentence marker @ConfigurationParameter(name = PARAM_OUTFOLDER, description = "Path to folder where IOB-files should be written to.") String outFolder = null; - @ConfigurationParameter(name = PARAM_TYPE_PATH, mandatory = false, description = "The path of the UIMA types, e.g. \"de.julielab.jcore.\" (with terminating \".\"!). It is prepended to the class names in labelNameMethods. This parameter may be null which is equivalent to the empty String \"\".") + @ConfigurationParameter(name = PARAM_TYPE_PATH, mandatory = false, description = "The path of the UIMA types, e.g. \"de.julielab.jcore.types.\" (with terminating \".\"!). It is prepended to the class names in labelNameMethods. This parameter may be null which is equivalent to the empty String \"\".") String typePath = null; @ConfigurationParameter(name = PARAM_LABELS, mandatory = false, description = "The labels NOT to be exported into IOB format. Label does here not refer to an UIMA type but to the specific label aquired by the labelNameMethod.") String[] labels = null; @@ -84,9 +84,9 @@ public class ToIOBConsumer extends JCasAnnotator_ImplBase { int id = 1; @ConfigurationParameter(name = PARAM_MODE, mandatory = false, description = "This parameter determines whether the IOB or IO annotation schema should be used. 
The parameter defaults to IOB, the value is not case sensitive.", defaultValue = "IOB") private String mode = null; - @ConfigurationParameter(name = PARAM_LABEL_METHODS, description = "This is the primary parameter to define from which types IOB labels should be derived. The parameter expects pairs of UIMA-annotation-type-names and their corresponding method for extracting the annotation label. Format: <annotationName>[\\s=/\\\\|]<method Name>. The annotation name is fully qualified name of the UIMA type. For abbreviation purposes, the \"" + PARAM_TYPE_PATH + "\" parameter can be used to define a type prefix that will then be prepended to all UIMA type names given in this parameter. So, for example, the prefix \"de.julielab.jcore.types.\" will allow to use the \"specificType\" feature of the \"de.julielab.jcore.types.Gene\" type by providing \"Gene=getSpecificType\". If the name of the annotation class itself is to be being used as label, only the class name is expected: <annotationName> (here, again, applies the use of the \"" + PARAM_TYPE_PATH + "\" parameter). You also may specify a mix of pairs and single class names. If you give the name extracting method for a class and have also specified its superclass as a single class name, the given method is used rather than the superclass name.") + @ConfigurationParameter(name = PARAM_LABEL_METHODS, description = "This is the primary parameter to define from which types IOB labels should be derived. The parameter expects pairs of UIMA-annotation-type-names and their corresponding method for extracting the annotation label. Format: [\\s=/\\\\|]. The annotation name is fully qualified name of the UIMA type. For abbreviation purposes, the \"" + PARAM_TYPE_PATH + "\" parameter can be used to define a type prefix that will then be prepended to all UIMA type names given in this parameter. 
So, for example, the prefix \"de.julielab.jcore.types.\" will allow to use the \"specificType\" feature of the \"de.julielab.jcore.types.Gene\" type by providing \"Gene=getSpecificType\". If the name of the annotation class itself is to be being used as label, only the class name is expected: (here, again, applies the use of the \"" + PARAM_TYPE_PATH + "\" parameter). You also may specify a mix of pairs and single class names. If you give the name extracting method for a class and have also specified its superclass as a single class name, the given method is used rather than the superclass name.") private String[] labelNameMethods; - @ConfigurationParameter(name = PARAM_IOB_LABEL_NAMES, mandatory = false, description = "Pairs of label names in UIMA (aquired by the methods given in labelNameMethods) and the name the label is supposed to get in the outcoming IOB file. Format: <UIMA label name>[\\s=/\\\\|]<IOB label name>") + @ConfigurationParameter(name = PARAM_IOB_LABEL_NAMES, mandatory = false, description = "Pairs of label names in UIMA (aquired by the methods given in labelNameMethods) and the name the label is supposed to get in the outcoming IOB file. Format: [\\s=/\\\\|]<IOB label name>") private String[] iobLabelNames; @ConfigurationParameter(name = PARAM_ADD_POS, mandatory = false, description = "If set to true and if annotations of (sub-)type de.julielab.jcore.types.POSTag are present in the CAS, the PoS tags will be added to the output file as the second column. 
Defaults to false.") private Boolean addPos; @@ -117,7 +117,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept addPos = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_ADD_POS)).orElse(false); separator = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_COLUMN_SEPARATOR)).orElse("\t"); - separator = separator.replaceAll("\\\\t", "\t"); + separator = separator.replaceAll("\\\\t", "\t"); iobMarkSeparator = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_IOB_MARK_SEPARATOR)).orElse("_"); @@ -181,7 +181,6 @@ public void process(JCas jCas) { bw.newLine(); } else if (token.getText().equals("") || token.getText().equals(PARAGRAPH_END_MARK)) { bw.newLine(); - bw.newLine(); } else { final Stream.Builder sb = Stream.builder(); sb.accept(token.getText()); @@ -192,7 +191,8 @@ public void process(JCas jCas) { bw.newLine(); } } - + // newline at the very end; this makes it easy to concatenate multiple output IOB files into one larger file + bw.newLine(); if (bw != null) { bw.close(); } @@ -236,8 +236,8 @@ public IOToken[] convertToIOB(JCas jcas) { while (paragraphIter.hasNext()) { paragraphs.add((Paragraph) paragraphIter.next()); } + Paragraph dParagraph = null; if (paragraphs.isEmpty()) { - Paragraph dParagraph = null; try { dParagraph = (Paragraph) JCoReAnnotationTools.getAnnotationByClassName(jcas, Paragraph.class.getName()); } catch (ClassNotFoundException | SecurityException @@ -249,6 +249,7 @@ public IOToken[] convertToIOB(JCas jcas) { } dParagraph.setBegin(0); dParagraph.setEnd(jcas.getDocumentText().length()); + dParagraph.setComponentId(ToIOBConsumer.class.getCanonicalName()); dParagraph.addToIndexes(jcas); paragraphs.add(dParagraph); @@ -284,7 +285,7 @@ public IOToken[] convertToIOB(JCas jcas) { // if we are at the first token, we need to add a sentence break mark which is // later replaced by an empty line if (i == 0 && overallSentCount > 0) { - IOToken ioToken = null; + 
IOToken ioToken; //if (sentCount == 0) { if (currentParagraph != lastPara) { // add paragraph end before this sentence @@ -331,6 +332,10 @@ public IOToken[] convertToIOB(JCas jcas) { } } + // remove helper paragraph annotation + if (dParagraph != null) + dParagraph.removeFromIndexes(); + return ret; } @@ -385,6 +390,7 @@ private void tokenLabeling(TreeMap ioTokenMap, Iterator[] anno } + /** * @param ioTokenMap * @param label diff --git a/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/utils/UIMAUtils.java b/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/utils/UIMAUtils.java index 3e6affd02..fa06059f6 100644 --- a/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/utils/UIMAUtils.java +++ b/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/utils/UIMAUtils.java @@ -9,6 +9,7 @@ package de.julielab.jcore.consumer.cas2iob.utils; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; diff --git a/jcore-iob-consumer/src/main/resources/de/julielab/jcore/consumer/cas2iob/desc/jcore-iob-consumer.xml b/jcore-iob-consumer/src/main/resources/de/julielab/jcore/consumer/cas2iob/desc/jcore-iob-consumer.xml index a333e4aaf..3f8e58712 100644 --- a/jcore-iob-consumer/src/main/resources/de/julielab/jcore/consumer/cas2iob/desc/jcore-iob-consumer.xml +++ b/jcore-iob-consumer/src/main/resources/de/julielab/jcore/consumer/cas2iob/desc/jcore-iob-consumer.xml @@ -6,7 +6,7 @@ JCoRe IOB Writer This component help to write CAS entity or chunk annotations into a text file in IOB format. - 2.5.1-SNAPSHOT + 2.6.0 outFolder @@ -17,7 +17,7 @@ typePath - The path of the UIMA types, e.g. "de.julielab.jcore." (with terminating "."!). It is prepended to the class names in labelNameMethods. This parameter may be null which is equivalent to the empty String "". + The path of the UIMA types, e.g. 
"de.julielab.jcore.types." (with terminating "."!). It is prepended to the class names in labelNameMethods. This parameter may be null which is equivalent to the empty String "". String false false @@ -38,14 +38,14 @@ labelNameMethods - This is the primary parameter to define from which types IOB labels should be derived. The parameter expects pairs of UIMA-annotation-type-names and their corresponding method for extracting the annotation label. Format: &lt;annotationNAme&gt;[\s=/\\|]&lt;method Name&gt;. The annotation name is fully qualified name of the UIMA type. For abbreviation purposes, the "typePath" parameter can be used to define a type prefix that will then be prepended to all UIMA type names given in this parameter. So, for example, the prefix "de.julielab.jcore.types." will allow to use the "specificType" feature of the "de.julielab.jcore.types.Gene" type by providing "Gene=getSpecificType". If the name of the annotation class itself is to be being used as label, only the class name is expected: &lt;annotationName&gt; (here, again, applies the use of the "typePath" parameter). You also may specify a mix of pairs and single class names. If you give the name extracting method for a class and have also specified its superclass as a single class name, the given method is used rather than the superclass name. + This is the primary parameter to define from which types IOB labels should be derived. The parameter expects pairs of UIMA-annotation-type-names and their corresponding method for extracting the annotation label. Format: <annotationName>[\s=/\\|]<method Name>. The annotation name is fully qualified name of the UIMA type. For abbreviation purposes, the "typePath" parameter can be used to define a type prefix that will then be prepended to all UIMA type names given in this parameter. So, for example, the prefix "de.julielab.jcore.types." 
will allow to use the "specificType" feature of the "de.julielab.jcore.types.Gene" type by providing "Gene=getSpecificType". If the name of the annotation class itself is to be being used as label, only the class name is expected: <annotationName> (here, again, applies the use of the "typePath" parameter). You also may specify a mix of pairs and single class names. If you give the name extracting method for a class and have also specified its superclass as a single class name, the given method is used rather than the superclass name. String true true iobLabelNames - Pairs of label names in UIMA (aquired by the methods given in labelNameMethods) and the name the label is supposed to get in the outcoming IOB file. Format: &lt;UIMA label name&gt;[\s=/\\|]&lt;IOB label name&gt; + Pairs of label names in UIMA (aquired by the methods given in labelNameMethods) and the name the label is supposed to get in the outcoming IOB file. Format: <UIMA label name>[\s=/\\|]&lt;IOB label name&gt; String true false diff --git a/jcore-iob-consumer/src/test/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumerTest.java b/jcore-iob-consumer/src/test/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumerTest.java index fefa5975a..a7a8f111f 100644 --- a/jcore-iob-consumer/src/test/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumerTest.java +++ b/jcore-iob-consumer/src/test/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumerTest.java @@ -30,7 +30,7 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; @@ -47,14 +47,20 @@ public void testWriteIOB() throws Exception { final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types" , "de.julielab.jcore.types.jcore-document-meta-types"); 
- jCas.setDocumentText("BRCA influences cancer."); + jCas.setDocumentText("BRCA influences cancer. Our data suggest this."); - new Sentence(jCas, 0, jCas.getDocumentText().length()).addToIndexes(); + new Sentence(jCas, 0, 23).addToIndexes(); + new Sentence(jCas, 24, jCas.getDocumentText().length()).addToIndexes(); new Gene(jCas, 0, 4).addToIndexes(); new Token(jCas, 0, 4).addToIndexes(); new Token(jCas, 5, 15).addToIndexes(); new Token(jCas, 16, 22).addToIndexes(); new Token(jCas, 22, 23).addToIndexes(); + new Token(jCas, 24, 27).addToIndexes(); + new Token(jCas, 28, 32).addToIndexes(); + new Token(jCas, 33, 40).addToIndexes(); + new Token(jCas, 41, 45).addToIndexes(); + new Token(jCas, 45, 46).addToIndexes(); final String outputDir = "src/test/resources/iob-output"; final AnalysisEngine iobwriter = AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.cas2iob.desc.jcore-iob-consumer", @@ -68,7 +74,14 @@ public void testWriteIOB() throws Exception { assertThat(IOUtils.readLines(new FileInputStream(file), "UTF-8")).containsExactly("BRCA B_Gene", "influences O", "cancer O", - ". O"); + ". O", + "", + "Our O", + "data O", + "suggest O", + "this O", + ". O", + ""); } @Test @@ -115,7 +128,8 @@ public void testWriteIOBWithPos() throws Exception { assertThat(IOUtils.readLines(new FileInputStream(file), "UTF-8")).containsExactly("BRCA NN B-Gene", "influences VBZ O", "cancer NN O", - ". . O"); + ". . 
O", + ""); } } diff --git a/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/desc/ToIOBConsumerTest.xml b/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/desc/ToIOBConsumerTest.xml index 36199e77d..aef1816a7 100644 --- a/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/desc/ToIOBConsumerTest.xml +++ b/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/desc/ToIOBConsumerTest.xml @@ -5,7 +5,7 @@ ToIOBConsumerTest - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/types/TestTypeSystem.xml b/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/types/TestTypeSystem.xml index 7b3f82a25..05c84c7b1 100644 --- a/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/types/TestTypeSystem.xml +++ b/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/types/TestTypeSystem.xml @@ -2,7 +2,7 @@ TestTypeSystem including julie morpho-syntax and semantics -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-iob-consumer/src/test/resources/iob-output/1.iob b/jcore-iob-consumer/src/test/resources/iob-output/1.iob deleted file mode 100644 index 5d171bb3b..000000000 --- a/jcore-iob-consumer/src/test/resources/iob-output/1.iob +++ /dev/null @@ -1,4 +0,0 @@ -BRCA NN B-Gene -influences VBZ O -cancer NN O -. . 
O diff --git a/jcore-jedis-integration-tests/pom.xml b/jcore-jedis-integration-tests/pom.xml new file mode 100644 index 000000000..79aa622d6 --- /dev/null +++ b/jcore-jedis-integration-tests/pom.xml @@ -0,0 +1,64 @@ + + + + jedis-parent + de.julielab + 2.6.0 + ../jedis-parent + + 4.0.0 + + jcore-jedis-integration-tests + + + + de.julielab + jcore-xml-db-reader + 2.6.0 + + + de.julielab + jcore-xmi-db-writer + 2.6.0 + + + de.julielab + jcore-db-checkpoint-ae + 2.6.0 + + + de.julielab + jcore-flow-controllers + 2.6.0 + + + de.julielab + jcore-types + ${jcore-types-version} + test + + + de.julielab + costosys + + + ch.qos.logback + logback-classic + + + org.assertj + assertj-core + + + de.julielab + jcore-db-test-utilities + + + org.junit.jupiter + junit-jupiter-engine + + + + \ No newline at end of file diff --git a/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java b/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java new file mode 100644 index 000000000..63e967924 --- /dev/null +++ b/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java @@ -0,0 +1,264 @@ +package de.julielab.jcore.jedis.integrationtests; + +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.costosys.dbconnection.SubsetStatus; +import de.julielab.jcore.ae.checkpoint.DBCheckpointAE; +import de.julielab.jcore.consumer.xmi.XMIDBWriter; +import de.julielab.jcore.db.test.DBTestUtils; +import de.julielab.jcore.flow.annotationdefined.AnnotationDefinedFlowController; +import de.julielab.jcore.reader.db.DBMultiplierReader; +import de.julielab.jcore.reader.xml.XMLDBMultiplier; +import de.julielab.jcore.types.Annotation; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import 
org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.factory.*; +import org.apache.uima.flow.FlowControllerDescription; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.io.File; +import java.nio.file.Path; +import java.util.*; + +import static org.assertj.core.api.Assertions.assertThat; + +@Testcontainers +public class UpdateWithHashComparison { + private static final String SOURCE_XML_TABLE = "_data.source_xml_table"; + private static final String TARGET_XMI_TABLE = "_data_xmi.target_xmi_table"; + private static final String XML_SUBSET_TABLE = "test_subset"; + private static final String XMI_MIRROR_TABLE = "test_xmi_mirror"; + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); + private static String costosysConfigSourceTable; + private static String costosysConfigTargetTable; + /** + * The collection reader that feeds the XMLDBMultiplier the database rows to read. + */ + private static CollectionReader testCr; + /** + * The top-level aggregate containing the XMLDBMultiplier and two "child" aggregates, one for the analysis engines + * and one for the CAS consumers. In this test, the aggregate delegates are all realized by instances of {@link TestAnnotator}. 
+ */ + private static AnalysisEngine testAggregate; + private static JCas cas; + private static DataBaseConnector dbc; + private static List namesOfRunComponents = new ArrayList<>(); + private static Set idsOfProcessedDocuments = new LinkedHashSet<>(); + + @BeforeAll + public static void setup() throws Exception { + DBTestUtils.createAndSetHiddenConfig(Path.of("src", "test", "resources", "hiddenConfig").toString(), postgres); + + dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2017"); + costosysConfigSourceTable = DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); + costosysConfigTargetTable = DBTestUtils.createTestCostosysConfig("xmi_text", 1, postgres); + new File(costosysConfigSourceTable).deleteOnExit(); + new File(costosysConfigTargetTable).deleteOnExit(); + prepareSourceXMLTable(dbc); + dbc.defineMirrorSubset(XML_SUBSET_TABLE, SOURCE_XML_TABLE, true, "Test subset"); + assertThat(dbc.getNumRows(SOURCE_XML_TABLE)).isEqualTo(3); + createTestPipelineComponents(); + } + + @AfterAll + public static void shutdown() { + dbc.close(); + } + + private static void prepareSourceXMLTable(DataBaseConnector dbc) throws Exception { + dbc.createTable(SOURCE_XML_TABLE, "Test XML Table"); + dbc.importFromXMLFile(Path.of("src", "test", "resources", "pubmed21n1016_excerpt_original.xml.gz").toString(), SOURCE_XML_TABLE); + } + + /** + *

Creates test components in a structure that mimics the structure used by the jcore-pipeline-builder.

+ *

This consists of: + *

    + *
  1. a CollectionReader
  2. + *
  3. an AAE containing all other components: + *
      + *
    1. an optional CAS multiplier
    2. + *
    3. an aggregate containing all AEs
    4. + *
    5. an aggregate containing all CAS consumers
    6. + *
    + *
  4. + * The CAS consumers in this test consist of two "mock" CCs, a "real" XMI Writer and DB Checkpoint AE. + *
+ * We here want to test if we can successfully route the CAS through those inner AAEs when the multiplier adds + * the correct {@link de.julielab.jcore.types.casflow.ToVisit} annotation using a {@link de.julielab.jcore.flow.annotationdefined.AnnotationDefinedFlowController}. + *

+ */ + private static void createTestPipelineComponents() throws Exception { + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types", "de.julielab.jcore.types.jcore-xmi-splitter-types"); + + testCr = CollectionReaderFactory.createReader(DBMultiplierReader.class, + tsDesc, + DBMultiplierReader.PARAM_TABLE, XML_SUBSET_TABLE, + DBMultiplierReader.PARAM_RESET_TABLE, false, + DBMultiplierReader.PARAM_COSTOSYS_CONFIG_NAME, costosysConfigSourceTable, + // We set a batch size of 1 to have more refined testing. + // Otherwise, the multiplier would receive all 3 test documents at once and + // would process them all in one batch + DBMultiplierReader.PARAM_BATCH_SIZE, 1 + ); + + AnalysisEngineDescription testAe1 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestAE 1"); + AnalysisEngineDescription testAe2 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestAE 2"); + AnalysisEngineDescription testCc1 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestCC 1"); + AnalysisEngineDescription testCc2 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestCC 2"); + AnalysisEngineDescription xmiDbWriter = AnalysisEngineFactory.createEngineDescription(XMIDBWriter.class, + XMIDBWriter.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{"de.julielab.jcore.types.Annotation"}, + XMIDBWriter.PARAM_STORE_ALL, false, + XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true, + XMIDBWriter.PARAM_STORE_RECURSIVELY, false, + XMIDBWriter.PARAM_ADD_SHA_HASH, "document_text", + XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfigTargetTable, + 
XMIDBWriter.PARAM_UPDATE_MODE, true, + XMIDBWriter.PARAM_DO_GZIP, false + ); + AnalysisEngineDescription dbCheckpointAe = AnalysisEngineFactory.createEngineDescription(DBCheckpointAE.class, + DBCheckpointAE.PARAM_CHECKPOINT_NAME, "end", + DBCheckpointAE.PARAM_COSTOSYS_CONFIG, costosysConfigSourceTable, + DBCheckpointAE.PARAM_INDICATE_FINISHED, true + ); + + FlowControllerDescription flowControllerDescription = FlowControllerFactory.createFlowControllerDescription(AnnotationDefinedFlowController.class); + AnalysisEngineDescription aeAaeDesc = AnalysisEngineFactory.createEngineDescription(List.of(testAe1, testAe2), List.of("TestAE 1", "TestAE 2"), null, null, flowControllerDescription); + AnalysisEngineDescription ccAaeDesc = AnalysisEngineFactory.createEngineDescription(List.of(testCc1, testCc2, xmiDbWriter, dbCheckpointAe), List.of("TestCC 1", "TestCC 2", "XMI Writer", "Checkpoint Writer"), null, null, flowControllerDescription); + + AnalysisEngineDescription multiplierDescription = AnalysisEngineFactory.createEngineDescription(XMLDBMultiplier.class, + tsDesc, + XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "medlineMappingFile.xml").toString(), + // The core of this whole test: The components to be visited in case of matching hash codes. 
+ // We want to skip all components except the checkpoint writer that marks the document as + // "processed" in the XML subset table + XMLDBMultiplier.PARAM_TO_VISIT_KEYS, new String[]{"Checkpoint Writer"}, + // The next three parameters are required for the hash comparison + XMLDBMultiplier.PARAM_ADD_SHA_HASH, "document_text", + XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text"); + + testAggregate = AnalysisEngineFactory.createEngine(List.of(multiplierDescription, aeAaeDesc, ccAaeDesc), List.of("multiplier", "AeAAE", "CcAAE"), null, null); + + cas = JCasFactory.createJCas(tsDesc); + } + + @Test + public void testInitialProcessingProcessing() throws Exception { + assertThat(testCr.hasNext()); + while (testCr.hasNext()) { + testCr.getNext(cas.getCas()); + testAggregate.process(cas); + // Check that all components have been visited as expected from a normal, fixed flow + assertThat(namesOfRunComponents).containsExactly("TestAE 1", "TestAE 2", "TestCC 1", "TestCC 2"); + namesOfRunComponents.clear(); + cas.reset(); + } + testAggregate.collectionProcessComplete(); + assertThat(dbc.tableExists(TARGET_XMI_TABLE)); + // After this first processing, the XMI document table exists. We can now create a mirror on it. This is important + // because we want to see that the mirror is only reset for rows that have actually changed in subsequent tests. + dbc.defineMirrorSubset(XMI_MIRROR_TABLE, TARGET_XMI_TABLE, true, "The XMI test mirror table.", "xmi_text"); + // We mark the XMI mirror subset as completely processed. This simulates a state where the initial batch of + // documents has been completely processed, before the update comes in. + dbc.markAsProcessed(XMI_MIRROR_TABLE); + SubsetStatus status = dbc.status(XML_SUBSET_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED, DataBaseConnector.StatusElement.IN_PROCESS)); + // Check that all rows have been processed in the XML source subset table. 
+ assertThat(status.isProcessed).isEqualTo(3); + assertThat(status.inProcess).isEqualTo(0); + + assertThat(idsOfProcessedDocuments).hasSize(3); + // Check that there are actual IDs, not null string or something like that + for (String id : idsOfProcessedDocuments) + assertThat(id).matches("[0-9]+"); + } + + /** + * Adds its name to {@link #namesOfRunComponents}. + */ + public static class TestAnnotator extends JCasAnnotator_ImplBase { + @ConfigurationParameter(name = "name") + private String name; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + this.name = (String) aContext.getConfigParameterValue("name"); + } + + @Override + public void process(JCas jCas) { + assertThat(jCas.getDocumentText()).isNotBlank(); + namesOfRunComponents.add(name); + idsOfProcessedDocuments.add(JCoReTools.getDocId(jCas)); + new Annotation(jCas).addToIndexes(); + } + } + + @Nested + class AfterInitialProcessing { + @Test + public void updateXML() throws Exception { + dbc.updateFromXML(Path.of("src", "test", "resources", "pubmed21n1016_excerpt_partially_changed.xml.gz").toString(), SOURCE_XML_TABLE, true); + // The update contains all three originally imported XML documents. Only that the second has not been changed. + // But the XML mirror should have been reset completely. 
+ SubsetStatus status = dbc.status(XML_SUBSET_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED, DataBaseConnector.StatusElement.IN_PROCESS)); + // Check that the XML mirror subset has been reset due to the update + assertThat(status.isProcessed).isEqualTo(0); + assertThat(status.inProcess).isEqualTo(0); + } + + @Nested + class AfterUpdatingXML { + @Test + public void testOnlyNewDocumentsProcessed() throws Exception { + + testCr.reconfigure(); + testAggregate.reconfigure(); + assertThat(testCr.hasNext()).withFailMessage("The XML DB Collection reader does not report any non-processed rows.").isTrue(); + // Run the whole pipeline again. Only this time we only expect all the components run in a single case. + List allNamesOfRunComponents = new ArrayList<>(); + while (testCr.hasNext()) { + cas.reset(); + testCr.getNext(cas.getCas()); + testAggregate.process(cas); + // Check that all components have been visited as expected from a normal, fixed flow + allNamesOfRunComponents.addAll(namesOfRunComponents); + namesOfRunComponents.clear(); + cas.reset(); + } + testAggregate.collectionProcessComplete(); + // There should be only two components documents now that have visited all components + assertThat(allNamesOfRunComponents).containsExactly("TestAE 1", "TestAE 2", "TestCC 1", "TestCC 2", "TestAE 1", "TestAE 2", "TestCC 1", "TestCC 2"); + testAggregate.collectionProcessComplete(); + // Check again that all the XML documents have been processed. + SubsetStatus status = dbc.status(XML_SUBSET_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)); + // Check that all rows have been processed in the XML source subset table. + assertThat(status.isProcessed).isEqualTo(3); + + // Now the more interesting part: In the XMI mirror there should now be two unprocessed tables, namely + // the two documents with a changed document text. The unchanged document should still be marked as + // processed. 
+ SubsetStatus xmiMirrorStatus = dbc.status(XMI_MIRROR_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)); + // Check that all rows have been processed in the XML source subset table. + assertThat(xmiMirrorStatus.isProcessed).isEqualTo(1); + } + } + } +} diff --git a/jcore-jedis-integration-tests/src/test/resources/logback-test.xml b/jcore-jedis-integration-tests/src/test/resources/logback-test.xml new file mode 100644 index 000000000..e2ec34c57 --- /dev/null +++ b/jcore-jedis-integration-tests/src/test/resources/logback-test.xml @@ -0,0 +1,19 @@ + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + + + + \ No newline at end of file diff --git a/jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml b/jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml new file mode 100644 index 000000000..cd9892953 --- /dev/null +++ b/jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml @@ -0,0 +1,457 @@ + + + + /MedlineCitation/Article/ArticleTitle + + + /MedlineCitation/Article/Abstract + de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser + + + + /MedlineCitation/OtherAbstract + + + /MedlineCitation/Article/VernacularTitle + + + + + de.julielab.jcore.types.Title + + + 0 + + + + titleType + java.lang.String + + document + + + + + + + de.julielab.jcore.types.pubmed.AbstractText + + + + 2 + + + + abstractType + java.lang.String + + other + + + + + + de.julielab.jcore.types.Title + + + 3 + + + + titleType + java.lang.String + + document_vernacular + + + + + + + de.julielab.jcore.types.pubmed.Header + + + + /MedlineCitation/ArticleIdList/ArticleId[@IdType="doi"] + + doi + java.lang.String + + + /MedlineCitation/PMID + docId + java.lang.String + + + /MedlineCitation/@Status + citationStatus + java.lang.String + + + + /MedlineCitation/Article/Language + + language + java.lang.String + + de + + + en + + + es + + + fr + + + it + + + pt + + + eng + + + ger + + + fre + + + 
ita + + + other + + + + source + java.lang.String + + de.julielab.jcore.reader.xmlmapper.typeParser.SourceParser + + + + authors + + org.apache.uima.jcas.cas.FSArray + + de.julielab.jcore.reader.xmlmapper.typeParser.FSArrayParser + + true + + authorInfo + + de.julielab.jcore.types.AuthorInfo + + true + + + /MedlineCitation/Article/AuthorList/Author[LastName] + + + foreName + java.lang.String + ForeName + + + foreName + java.lang.String + FirstName + + + lastName + java.lang.String + LastName + + + initials + java.lang.String + Initials + + + affiliation + java.lang.String + + AffiliationInfo/Affiliation + + + + + + + org.apache.uima.jcas.cas.FSArray + + pubTypeList + true + + + de.julielab.jcore.types.Journal + + + /MedlineCitation/Article/PublicationTypeList/PublicationType + + Journal + true + + java.lang.String + name + . + + + java.lang.String + ISSN + + /MedlineCitation/Article/Journal/ISSN + + + + java.lang.String + Volume + + /MedlineCitation/Article/Journal/JournalIssue/Volume + + + + java.lang.String + Issue + + /MedlineCitation/Article/Journal/JournalIssue/Issue + + + + java.lang.String + Title + + /MedlineCitation/Article/Journal/Title + + + + java.lang.String + ShortTitle + + /MedlineCitation/MedlineJournalInfo/MedlineTA + + + + java.lang.String + nlmId + + /MedlineCitation/MedlineJournalInfo/NlmUniqueID + + + + java.lang.String + Pages + + /MedlineCitation/Article/Pagination/MedlinePgn + + + + true + + de.julielab.jcore.types.Date + + PubDate + + de.julielab.jcore.reader.xmlmapper.typeParser.PubDateParser + + + /MedlineCitation/Article/Journal/JournalIssue/PubDate + + + int + month + + + int + year + + + int + day + + + + + + org.apache.uima.jcas.cas.FSArray + otherIDs + true + + de.julielab.jcore.types.pubmed.OtherID + + /MedlineCitation/OtherID + true + + id + java.lang.String + . 
+ + + source + java.lang.String + @Source + + + + + + + de.julielab.jcore.types.pubmed.ManualDescriptor + + + /MedlineCitation/GeneSymbolList + GeneSymbolList + true + + org.apache.uima.jcas.cas.StringArray + + + + KeywordList + true + + org.apache.uima.jcas.cas.FSArray + + + Keyword + true + + /MedlineCitation/KeywordList/Keyword + + + de.julielab.jcore.types.Keyword + + + Name + . + java.lang.String + + + + + ChemicalList + true + + org.apache.uima.jcas.cas.FSArray + + + Chemical + true + + /MedlineCitation/ChemicalList/Chemical + + + de.julielab.jcore.types.Chemical + + + RegistryNumber + RegistryNumber + java.lang.String + + + NameOfSubstance + NameOfSubstance + java.lang.String + + + + + DBInfoList + true + + org.apache.uima.jcas.cas.FSArray + + + DBInfo + true + + /MedlineCitation/DataBankList/DataBank + + + de.julielab.jcore.types.DBInfo + + + Name + DataBankName + java.lang.String + + + AcList + + AccessionNumberList + + + true + + org.apache.uima.jcas.cas.StringArray + + + + + + MeSHList + true + + org.apache.uima.jcas.cas.FSArray + + + meshHeading + true + + /MedlineCitation/MeshHeadingList/MeshHeading + + + de.julielab.jcore.types.MeshHeading + + + DescriptorName + java.lang.String + DescriptorName + + + + DescriptorNameMajorTopic + + DescriptorName/@MajorTopicYN + boolean + + Y + true + + + N + false + + + + QualifierName + java.lang.String + QualifierName + + + + QualifierNameMajorTopic + + QualifierName/@MajorTopicYN + boolean + + Y + true + + + N + false + + + + + + \ No newline at end of file diff --git a/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_original.xml.gz b/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_original.xml.gz new file mode 100644 index 000000000..365b8d3e0 Binary files /dev/null and b/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_original.xml.gz differ diff --git 
a/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_partially_changed.xml.gz b/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_partially_changed.xml.gz new file mode 100644 index 000000000..ee6542535 Binary files /dev/null and b/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_partially_changed.xml.gz differ diff --git a/jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml b/jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml new file mode 100644 index 000000000..9a76854ae --- /dev/null +++ b/jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml @@ -0,0 +1,436 @@ + + + + /PubmedArticle/MedlineCitation/Article/ArticleTitle + + + /PubmedArticle/MedlineCitation/Article/Abstract + de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser + + + + /PubmedArticle/MedlineCitation/OtherAbstract + + + /PubmedArticle/MedlineCitation/Article/VernacularTitle + + + + + de.julielab.jcore.types.Title + + + 0 + + + + titleType + java.lang.String + + document + + + + + + + de.julielab.jcore.types.pubmed.AbstractText + + + + 2 + + + + abstractType + java.lang.String + + other + + + + + + de.julielab.jcore.types.Title + + + 3 + + + + titleType + java.lang.String + + document_vernacular + + + + + + + de.julielab.jcore.types.pubmed.Header + + + + /PubmedArticle/MedlineCitation/ArticleIdList/ArticleId[@IdType="doi"] + + doi + java.lang.String + + + /PubmedArticle/MedlineCitation/PMID + docId + java.lang.String + + + /PubmedArticle/MedlineCitation/@Status + citationStatus + java.lang.String + + + + /PubmedArticle/MedlineCitation/Article/Language + + language + java.lang.String + + de + + + en + + + es + + + fr + + + it + + + pt + + + eng + + + ger + + + fre + + + ita + + + other + + + + source + java.lang.String + + de.julielab.jcore.reader.xmlmapper.typeParser.SourceParser + + + + authors + + org.apache.uima.jcas.cas.FSArray + + 
de.julielab.jcore.reader.xmlmapper.typeParser.FSArrayParser + + true + + authorInfo + + de.julielab.jcore.types.AuthorInfo + + true + + + /PubmedArticle/MedlineCitation/Article/AuthorList/Author[LastName] + + + foreName + java.lang.String + ForeName + + + foreName + java.lang.String + FirstName + + + lastName + java.lang.String + LastName + + + initials + java.lang.String + Initials + + + affiliation + java.lang.String + + AffiliationInfo/Affiliation + + + + + + + org.apache.uima.jcas.cas.FSArray + + pubTypeList + true + + + de.julielab.jcore.types.Journal + + + /PubmedArticle/MedlineCitation/Article/PublicationTypeList/PublicationType + + Journal + true + + java.lang.String + name + . + + + java.lang.String + ISSN + + /PubmedArticle/MedlineCitation/Article/Journal/ISSN + + + + java.lang.String + Volume + + /PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Volume + + + + java.lang.String + Issue + + /PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Issue + + + + java.lang.String + Title + + /PubmedArticle/MedlineCitation/Article/Journal/Title + + + + java.lang.String + ShortTitle + + /PubmedArticle/MedlineCitation/MedlineJournalInfo/MedlineTA + + + + java.lang.String + nlmId + + /PubmedArticle/MedlineCitation/MedlineJournalInfo/NlmUniqueID + + + + java.lang.String + Pages + + /PubmedArticle/MedlineCitation/Article/Pagination/MedlinePgn + + + + true + + de.julielab.jcore.types.Date + + PubDate + + de.julielab.jcore.reader.xmlmapper.typeParser.PubDateParser + + + /PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/PubDate + + + int + month + + + int + year + + + int + day + + + + + + + + de.julielab.jcore.types.pubmed.ManualDescriptor + + + /PubmedArticle/MedlineCitation/GeneSymbolList + GeneSymbolList + true + + org.apache.uima.jcas.cas.StringArray + + + + KeywordList + true + + org.apache.uima.jcas.cas.FSArray + + + Keyword + true + + /PubmedArticle/MedlineCitation/KeywordList/Keyword + + + de.julielab.jcore.types.Keyword + + + Name + . 
+ java.lang.String + + + + + ChemicalList + true + + org.apache.uima.jcas.cas.FSArray + + + Chemical + true + + /PubmedArticle/MedlineCitation/ChemicalList/Chemical + + + de.julielab.jcore.types.Chemical + + + RegistryNumber + RegistryNumber + java.lang.String + + + NameOfSubstance + NameOfSubstance + java.lang.String + + + + + DBInfoList + true + + org.apache.uima.jcas.cas.FSArray + + + DBInfo + true + + /PubmedArticle/MedlineCitation/DataBankList/DataBank + + + de.julielab.jcore.types.DBInfo + + + Name + DataBankName + java.lang.String + + + AcList + + AccessionNumberList + + + true + + org.apache.uima.jcas.cas.StringArray + + + + + + MeSHList + true + + org.apache.uima.jcas.cas.FSArray + + + meshHeading + true + + /PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading + + + de.julielab.jcore.types.MeshHeading + + + DescriptorName + java.lang.String + DescriptorName + + + + DescriptorNameMajorTopic + + DescriptorName/@MajorTopicYN + boolean + + Y + true + + + N + false + + + + QualifierName + java.lang.String + QualifierName + + + + QualifierNameMajorTopic + + QualifierName/@MajorTopicYN + boolean + + Y + true + + + N + false + + + + + + \ No newline at end of file diff --git a/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml b/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml index 436c249b2..3d2f6c9fd 100644 --- a/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml +++ b/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml @@ -5,7 +5,7 @@ JCoRe JEmAS A UIMA-based implementation of the core functionality of JEmAS, the Jena Emotion Analysis System. 
- 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-jnet-ae/component.meta b/jcore-jnet-ae/component.meta index dbdfe4186..b39b004c5 100644 --- a/jcore-jnet-ae/component.meta +++ b/jcore-jnet-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-jnet-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe JNET AE" } diff --git a/jcore-jnet-ae/pom.xml b/jcore-jnet-ae/pom.xml index cfd5c716c..d67df3ba0 100644 --- a/jcore-jnet-ae/pom.xml +++ b/jcore-jnet-ae/pom.xml @@ -11,14 +11,15 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 + org.apache.maven.plugins maven-assembly-plugin - 2.4 + 3.3.0 jar-with-dependencies @@ -106,6 +107,12 @@ de.julielab uea-stemmer 0.1 + + + junit + junit + + de.julielab @@ -117,8 +124,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml b/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml index db23c98b2..f4b666e6d 100644 --- a/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml +++ b/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml @@ -6,7 +6,7 @@ JCoRe JNET AE - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java index cdfe60693..153d2714c 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java @@ -6,12 +6,12 @@ package de.julielab.jcore.ae.jnet.cli; -import org.junit.After; -import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertTrue; +import static 
org.junit.jupiter.api.Assertions.assertTrue; public class JNETApplicationTest { private static final String PREFIX = "src/test/resources/de/julielab/jcore/ae/jnet/cli/"; @@ -32,7 +32,7 @@ public class JNETApplicationTest { - @After + @AfterEach public void deleteModel() { File modelFile = new File(UNITTEST_MODEL_GZ); if (modelFile.exists()) diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java index f21a11d09..e05e6a6c1 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java @@ -2,7 +2,7 @@ import cc.mallet.types.Instance; import cc.mallet.types.InstanceList; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.InputStream; import java.util.ArrayList; diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java index 3031116d3..f551411fd 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java @@ -18,7 +18,6 @@ package de.julielab.jcore.ae.jnet.uima; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.cas.CAS; import org.apache.uima.cas.FSIterator; @@ -28,6 +27,7 @@ import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,6 +35,8 @@ import java.util.Iterator; import java.util.TreeSet; +import static org.junit.jupiter.api.Assertions.*; + /** * Please note that in the original test 
there were "GoodEntityMentions" and * "BadEntityMentions". Both types were only used for this test which caused @@ -45,7 +47,7 @@ * @author faessler * */ -public class ConsistencyPreservationTest extends TestCase { +public class ConsistencyPreservationTest { private static final Logger LOGGER = LoggerFactory.getLogger(ConsistencyPreservationTest.class); @@ -133,12 +135,14 @@ private void initJCas4DoAbbreviationBased(final JCas jcas) throws Exception { e5.addToIndexes(); } + @Test public void testConsistencyPreservation() throws Exception { final String modeString = ConsistencyPreservation.MODE_STRING + "," + ConsistencyPreservation.MODE_ACRO2FULL + "," + ConsistencyPreservation.MODE_FULL2ACRO; new ConsistencyPreservation(modeString); } + @Test public void testAcroMatch() throws Exception { final String modeString = ConsistencyPreservation.MODE_FULL2ACRO + "," + ConsistencyPreservation.MODE_ACRO2FULL; @@ -186,6 +190,7 @@ public void testAcroMatch() throws Exception { } + @Test public void testStringMatch() throws Exception { LOGGER.info("testStringMatch() - starting..."); final CAS cas = CasCreationUtils.createCas( @@ -229,6 +234,7 @@ public void testStringMatch() throws Exception { assertTrue(allOK); } + @Test public void testStringMatch2() throws Exception { // This test checks whether the consistence preservation algorithm // correctly detects already existing annotations even when there are @@ -269,6 +275,7 @@ public void testStringMatch2() throws Exception { assertEquals(3, count); } + @Test public void testStringMatch3() throws Exception { // This test checks whether the consistence preservation algorithm // correctly detects already existing annotations even when there are @@ -309,6 +316,7 @@ public void testStringMatch3() throws Exception { assertEquals(5, count); } + @Test public void testStringMatchTokenBoundaries() throws Exception { // This test checks whether the consistency preservation algorithm // sticks to token boundaries if the respective mode is 
on @@ -350,6 +358,7 @@ public void testStringMatchTokenBoundaries() throws Exception { assertEquals(1, count); } + @Test public void testStringMatchTokenBoundaries2() throws Exception { // Test for multi token entities String text = "This is BCA alpha. But we haven't annotated BCA alpha in all cases. Also not some other BCA."; @@ -430,7 +439,8 @@ else if (g.getSpecificType().equals("type2")) } assertEquals(2, oCount); } - + + @Test public void testStringMatchTokenBoundaries3() throws Exception { // Test for multi token entities with correct prefix but wrong ending String text = "Group 1. And Group B."; diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java index 44dd4e90d..e2143f3e9 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java @@ -20,7 +20,6 @@ import de.julielab.jcore.types.*; import de.julielab.jcore.utility.index.JCoReCoverIndex; import de.julielab.jnet.tagger.Unit; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -37,6 +36,7 @@ import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; import org.apache.uima.util.XMLParser; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -52,7 +52,9 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; -public class EntityAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class EntityAnnotatorTest { /** * Logger for this class @@ -66,12 +68,8 @@ public class EntityAnnotatorTest extends TestCase { private static final String 
ENTITY_ANNOTATOR_DESC = PREFIX+"EntityAnnotatorTest.xml"; private static final String NEGATIVE_LIST = PREFIX+"negativeList"; - @Override - protected void setUp() throws Exception { - super.setUp(); - // PropertyConfigurator.configure("src/test/java/log4j.properties"); - } + @Test public void testIgnoreLabel() throws ResourceInitializationException { // load AE @@ -124,6 +122,7 @@ public void testIgnoreLabel() throws ResourceInitializationException { /** * test whether Annotator can be initialized properly from given descriptor */ + @Test public void testInitialize() { LOGGER.debug("testInitialize()"); AnalysisEngine entityAnnotator = null; @@ -150,6 +149,7 @@ public void testInitialize() { * test whether process method runs successfully. Output must be checked by * a human manually */ + @Test public void testProcess() throws InvalidXMLException, ResourceInitializationException, IOException, SAXException, CASException, AnalysisEngineProcessException { LOGGER.debug("testProcess()"); @@ -176,6 +176,7 @@ public void testProcess() throws InvalidXMLException, ResourceInitializationExce * unit sentence and removing duplicates. Prediction is "simulated" (labels * are set). 
*/ + @Test public void testSimulatedProcess() throws IllegalAccessException, NoSuchFieldException, ResourceInitializationException, InvalidXMLException, IOException, CASException, SAXException { LOGGER.debug("testCreateUnitSentence() - starting"); @@ -280,6 +281,7 @@ else if (unit.getRep().equals("ceta")) * @throws IllegalAccessException * @throws IllegalArgumentException */ + @Test public void testWriteToCAS() throws SecurityException, NoSuchFieldException, ResourceInitializationException, InvalidXMLException, IOException, CASException, IllegalArgumentException, IllegalAccessException { LOGGER.debug("testWriteToCAS()"); diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java index 1b1ed323f..006328391 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java @@ -35,8 +35,8 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; import org.apache.uima.util.XMLSerializer; -import org.junit.After; -import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -47,7 +47,7 @@ import java.io.IOException; import java.nio.charset.Charset; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class MiniTestapp { @@ -61,7 +61,7 @@ public class MiniTestapp { private static final String ANNOTATOR_DESC = PREFIX + "EntityAnnotatorTest.xml"; - @After + @AfterEach public void clean() { if (new File(TEST_XMI_OUT).isFile()) { new File(TEST_XMI_OUT).delete(); diff --git a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml 
b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml index 12859863d..b07631439 100644 --- a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml +++ b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml @@ -6,7 +6,7 @@ EntityTaggerAnnotator - 2.5.1-SNAPSHOT + 2.6.0 julielab diff --git a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi deleted file mode 100644 index 029dc8db3..000000000 --- a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml index b26a4688d..6bfe94e8e 100644 --- a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml +++ b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml @@ -2,7 +2,7 @@ aceComplete -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-jpos-ae/component.meta b/jcore-jpos-ae/component.meta index 86f05e5d5..6cacfad71 100644 --- a/jcore-jpos-ae/component.meta +++ b/jcore-jpos-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-jpos-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe JPOS AE" } diff --git a/jcore-jpos-ae/pom.xml b/jcore-jpos-ae/pom.xml index 480afdf16..04e41a7e3 100644 --- a/jcore-jpos-ae/pom.xml +++ b/jcore-jpos-ae/pom.xml @@ -11,7 +11,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -114,8 +114,8 @@ 2.1.2 - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml 
b/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml index be5593812..e8777ae38 100644 --- a/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml +++ b/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml @@ -6,7 +6,7 @@ JCoRe JPOS AE - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java b/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java index c7a03c06d..50c639d51 100644 --- a/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java +++ b/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java @@ -17,9 +17,9 @@ import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class POSAnnotatorTest { diff --git a/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml b/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml index 384265369..3c7f1b099 100644 --- a/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml +++ b/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml @@ -6,7 +6,7 @@ JPOSAnnotator - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab diff --git a/jcore-jsbd-ae/component.meta b/jcore-jsbd-ae/component.meta index 025d9b87f..5ab9a4df2 100644 --- a/jcore-jsbd-ae/component.meta +++ b/jcore-jsbd-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-jsbd-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Sentence Annotator" } diff --git a/jcore-jsbd-ae/pom.xml b/jcore-jsbd-ae/pom.xml index d5622f97b..b0b6524c2 100644 --- a/jcore-jsbd-ae/pom.xml +++ b/jcore-jsbd-ae/pom.xml @@ 
-11,14 +11,15 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 + org.apache.maven.plugins maven-assembly-plugin - 2.4 + 3.3.0 jar-with-dependencies @@ -76,7 +77,6 @@ org.assertj assertj-core - 3.9.1 de.julielab @@ -102,6 +102,16 @@ cc.mallet mallet 2.0.8 + + + junit + junit + + + + + org.apache.commons + commons-lang3 de.julielab @@ -112,6 +122,10 @@ de.julielab jcore-descriptor-creator + + org.junit.jupiter + junit-jupiter + JULIE Lab Jena, Germany diff --git a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java index a27107477..583db41a1 100644 --- a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java +++ b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java @@ -29,6 +29,7 @@ import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; @@ -146,77 +147,89 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept * @throws AnalysisEngineProcessException */ public void process(JCas aJCas) throws AnalysisEngineProcessException { - if (StringUtils.isBlank(aJCas.getDocumentText())) { - final String docId = JCoReTools.getDocId(aJCas); - LOGGER.warn("The document text of document {} is empty.", docId); - return; - } - JCoReCondensedDocumentText documentText; try { - // If there are no cut-away types, the document text will remain unchanged. 
- documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes); - } catch (ClassNotFoundException e1) { - throw new AnalysisEngineProcessException(e1); - } - - if (sentenceDelimiterTypes != null) { + if (StringUtils.isBlank(aJCas.getDocumentText())) { + final String docId = JCoReTools.getDocId(aJCas); + LOGGER.warn("The document text of document {} is empty.", docId); + final AnnotationIndex annotationIndex = aJCas.getAnnotationIndex(); + LOGGER.warn("All annotations in CAS:"); + for (Annotation a : annotationIndex) { + System.out.println(a); + } + return; + } + JCoReCondensedDocumentText documentText; try { - // the index merger gives us access to all delimiter type - // indexes in one - JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(sentenceDelimiterTypes, false, - null, aJCas); + // If there are no cut-away types, the document text will remain unchanged. + documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes, Set.of(','), true); + } catch (ClassNotFoundException e1) { + LOGGER.error("Could not create the text without annotations to be cut away in document {}", JCoReTools.getDocId(aJCas), e1); + throw new AnalysisEngineProcessException(e1); + } - // the idea: collect all start and end offsets of sentence - // delimiter annotations (sections, titles, captions, ...) in a - // list and sort ascending; then, perform sentence segmentation - // between every two adjacent offsets. This way, no sentence can - // cross any delimiter annotation border - List borders = new ArrayList<>(); - borders.add(0); - borders.add(aJCas.getDocumentText().length()); - while (indexMerger.incrementAnnotation()) { - Annotation a = (Annotation) indexMerger.getAnnotation(); - // Here we convert the original offsets to the condensed offsets. If there are - // no cut-away types, the offsets will just remain unchanged. Otherwise we now - // have the borders of the condensed text passages associated with the sentence - // delimiter annotation. 
- borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getBegin())); - borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getEnd())); - } - borders.sort(null); + if (sentenceDelimiterTypes != null) { + try { + // the index merger gives us access to all delimiter type + // indexes in one + JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(sentenceDelimiterTypes, false, + null, aJCas); - // now do sentence segmentation between annotation borders - for (int i = 1; i < borders.size(); ++i) { - int start = borders.get(i - 1); - int end = borders.get(i); + // the idea: collect all start and end offsets of sentence + // delimiter annotations (sections, titles, captions, ...) in a + // list and sort ascending; then, perform sentence segmentation + // between every two adjacent offsets. This way, no sentence can + // cross any delimiter annotation border + List borders = new ArrayList<>(); + borders.add(0); + borders.add(documentText.getCondensedOffsetForOriginalOffset(aJCas.getDocumentText().length())); + while (indexMerger.incrementAnnotation()) { + Annotation a = (Annotation) indexMerger.getAnnotation(); + // Here we convert the original offsets to the condensed offsets. If there are + // no cut-away types, the offsets will just remain unchanged. Otherwise we now + // have the borders of the condensed text passages associated with the sentence + // delimiter annotation. 
+ borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getBegin())); + borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getEnd())); + } + borders.sort(null); - // skip leading whites spaces - while (start < end && Character.isWhitespace(aJCas.getDocumentText().charAt(start))) - ++start; + // now do sentence segmentation between annotation borders + for (int i = 1; i < borders.size(); ++i) { + int start = borders.get(i - 1); + int end = borders.get(i); - // get the string between the current annotation borders and recognize sentences - String textSpan = documentText.getCodensedText().substring(start, end); - if (!StringUtils.isBlank(textSpan)) - doSegmentation(documentText, textSpan, start); - } + // skip leading whites spaces + while (start < end && (Character.isWhitespace(documentText.getCodensedText().charAt(start)))) + ++start; - } catch (ClassNotFoundException e) { - throw new AnalysisEngineProcessException(e); - } - } else { - // if no processingScope set -> use documentText - if (aJCas.getDocumentText() != null && aJCas.getDocumentText().length() > 0) { - doSegmentation(documentText, documentText.getCodensedText(), 0); - } else { - if (numEmptyCases.get() < 10) { - LOGGER.debug("document text empty. Skipping this document."); - numEmptyCases.incrementAndGet(); - } else if (numEmptyCases.get() == 10) { - LOGGER.warn("Encountered 10 documents with an empty text body. 
This message will not appear again " + - "to avoid scrolling in cases where this is expected."); + // get the string between the current annotation borders and recognized sentences + String textSpan = documentText.getCodensedText().substring(start, end); + if (!StringUtils.isBlank(textSpan)) + doSegmentation(documentText, textSpan, start); + } + + } catch (ClassNotFoundException e) { + throw new AnalysisEngineProcessException(e); } + } else { + // sentence delimiter types are not given + // if no processingScope set -> use documentText + if (aJCas.getDocumentText() != null && aJCas.getDocumentText().length() > 0) { + doSegmentation(documentText, documentText.getCodensedText(), 0); + } else { + if (numEmptyCases.get() < 10) { + LOGGER.debug("document text empty. Skipping this document."); + numEmptyCases.incrementAndGet(); + } else if (numEmptyCases.get() == 10) { + LOGGER.warn("Encountered 10 documents with an empty text body. This message will not appear again " + + "to avoid scrolling in cases where this is expected."); + } + } } + } catch (Throwable t) { + LOGGER.error("Could not perform sentence splitting of document {}", JCoReTools.getDocId(aJCas), t); + throw t; } } @@ -359,7 +372,7 @@ private void splitAtWhitespaces(JCoReCondensedDocumentText documentText, Sentenc lastEnd = s.getEnd(); currentSentenceLength = 0; } else { - LOGGER.warn("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd); + LOGGER.debug("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd); } } currentSentenceLength += wsMatcher.end(); @@ -372,7 +385,7 @@ private void splitAtWhitespaces(JCoReCondensedDocumentText documentText, Sentenc s.setComponentId(this.getClass().getName()); subSentences.add(s); } else { - LOGGER.warn("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd); + LOGGER.debug("Not creating 
whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd); } } diff --git a/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml b/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml index 8bb60791a..b1293df62 100644 --- a/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml +++ b/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml @@ -6,7 +6,7 @@ de.julielab.jcore.ae.jsbd.main.SentenceAnnotator Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0 de.julielab.jcore.ae.jsbd.main diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java index 3d7f63cc7..91ffa9f45 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java @@ -15,8 +15,8 @@ import cc.mallet.types.Token; import cc.mallet.types.TokenSequence; import org.assertj.core.data.Offset; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.List; @@ -27,7 +27,7 @@ public class Abstract2UnitPipeTest { protected static Pipe pipe; - @Before + @BeforeEach public void init() { pipe = new Abstract2UnitPipe(false); } diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java index 8715c714b..a3ce21a17 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java @@ -18,7 +18,7 @@ import cc.mallet.pipe.Pipe; import cc.mallet.types.Instance; import 
cc.mallet.types.InstanceList; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,8 +26,8 @@ import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Test for the class {@link SentenceSplitter} diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java index 5506d38b8..1e820d945 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java @@ -1,17 +1,17 @@ -/** +/** * SentenceAnnotatorTest.java - * + *

* Copyright (c) 2015, JULIE Lab. - * All rights reserved. This program and the accompanying materials + * All rights reserved. This program and the accompanying materials * are made available under the terms of the BSD-2-Clause License - * + *

* Author: tomanek - * + *

* Current version: 2.2 * Since version: 1.0 - * - * Creation date: Nov 29, 2006 - * + *

+ * Creation date: Nov 29, 2006 + *

* This is a JUnit test for the SentenceAnnotator. **/ @@ -25,6 +25,7 @@ import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.impl.XmiCasDeserializer; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; @@ -34,276 +35,330 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.nio.file.Path; import java.util.*; import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.*; +import static org.assertj.core.api.Assertions.assertThatCode; +import static org.junit.jupiter.api.Assertions.*; + public class SentenceAnnotatorTest { - /** - * Logger for this class - */ - private static final Logger LOGGER = LoggerFactory.getLogger(SentenceAnnotatorTest.class); - - private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; - - // uncomment to test with/without scope - // private static final String DESCRIPTOR = - // "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml"; - private static final String DESCRIPTOR = "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml"; - - // last sentence has no EOS symbol to test that also this is handled - // correctly - private static final String[] TEST_TEXT = { "First sentence. Second \t sentence! \n Last sentence?", - "Hallo, jemand da? Nein, niemand.", "A test. It can't be just one sentence. 
Testing the test.", "" }; - - private static final String[] TEST_TEXT_OFFSETS = { "0-15;16-34;40-54", "0-17;18-32", "0-7;8-38;39-56", "" }; - - private static final int[] endOffsets = { 54, 32, 27, 0 }; - - /** - * Use the model in resources, split the text in TEST_TEXT and compare the - * split result against TEST_TEXT_OFFSETS - */ - @Test - public void testProcess() { - - boolean annotationsOK = true; - - XMLInputSource sentenceXML = null; - ResourceSpecifier sentenceSpec = null; - AnalysisEngine sentenceAnnotator = null; - - try { - sentenceXML = new XMLInputSource(DESCRIPTOR); - sentenceSpec = UIMAFramework.getXMLParser().parseResourceSpecifier(sentenceXML); - sentenceAnnotator = UIMAFramework.produceAnalysisEngine(sentenceSpec); - } catch (Exception e) { - LOGGER.error("testProcess()", e); - } - - for (int i = 0; i < TEST_TEXT.length; i++) { - - JCas jcas = null; - try { - jcas = sentenceAnnotator.newJCas(); - } catch (ResourceInitializationException e) { - LOGGER.error("testProcess()", e); - } - - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("testProcess() - testing text: " + TEST_TEXT[i]); - } - jcas.setDocumentText(TEST_TEXT[i]); - - // make one test scope ranging over complete document text - // annotations for the processing scope - TestScope scope1 = new TestScope(jcas, 0, endOffsets[i]); - scope1.addToIndexes(); - // TestScope scope2 = new TestScope(jcas,37,54); - - - try { - sentenceAnnotator.process(jcas, null); - } catch (Exception e) { - LOGGER.error("testProcess()", e); - } - - // get the offsets of the sentences - JFSIndexRepository indexes = jcas.getJFSIndexRepository(); - Iterator sentIter = indexes.getAnnotationIndex(Sentence.type).iterator(); - - String predictedOffsets = getPredictedOffsets(i, sentIter); - - // compare offsets - if (!predictedOffsets.equals(TEST_TEXT_OFFSETS[i])) { - annotationsOK = false; - continue; - } - } - assertTrue(annotationsOK); - } - - - private String getPredictedOffsets(int i, Iterator sentIter) { - String 
predictedOffsets = ""; - while (sentIter.hasNext()) { - Sentence s = (Sentence) sentIter.next(); - LOGGER.debug("sentence: " + s.getCoveredText() + ": " + s.getBegin() + " - " + s.getEnd()); - predictedOffsets += (predictedOffsets.length() > 0) ? ";" : ""; - predictedOffsets += s.getBegin() + "-" + s.getEnd(); - } - - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("testProcess() - predicted: " + predictedOffsets); - } - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("testProcess() - wanted: " + TEST_TEXT_OFFSETS[i]); - } - return predictedOffsets; - } - - @Test - public void testUimaFitIntegration() throws UIMAException, IOException { - AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, - SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz", - SentenceAnnotator.PARAM_POSTPROCESSING, "biomed"); - JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); - String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8"); - cas.setDocumentText(abstractText); - sentenceAE.process(cas); - Collection sentences = JCasUtil.select(cas, Sentence.class); - for (Sentence sentence : sentences) { - System.out.println(sentence.getCoveredText()); - } - assertEquals(14, sentences.size()); - } - - @Test - public void testModelClassPathResource() throws Exception { - AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, - SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz", - SentenceAnnotator.PARAM_POSTPROCESSING, "biomed"); - JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); - String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8"); - cas.setDocumentText(abstractText); - sentenceAE.process(cas); - Collection sentences = JCasUtil.select(cas, Sentence.class); - 
System.out.println(sentences.size()); - for (Sentence sentence : sentences) { - System.out.println(sentence.getCoveredText()); - } - assertEquals(14, sentences.size()); - } - - @Test - public void testSentenceDelimiterTypes() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-types"); - - jCas.setDocumentText("Introduction " + "We here show good results. This is a figure caption " - + "And this is a paragraph without a fullstop for some reason " + "Conclusion " - + "We are the greatest."); - Title t1 = new Title(jCas, 0, 12); - Caption c = new Caption(jCas, 40, 64); - Paragraph p = new Paragraph(jCas, 65, 123); - Title t2 = new Title(jCas, 124, 134); - t1.addToIndexes(); - c.addToIndexes(); - p.addToIndexes(); - t2.addToIndexes(); - assertEquals("Introduction", t1.getCoveredText()); - assertEquals("This is a figure caption", c.getCoveredText()); - assertEquals("And this is a paragraph without a fullstop for some reason", p.getCoveredText()); - assertEquals("Conclusion", t2.getCoveredText()); - - AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, - new LinkedHashSet( - Arrays.asList(Title.class.getName(), Caption.class.getName(), Paragraph.class.getName()))); - - jsbd.process(jCas.getCas()); - - Set> expectedSpans = new HashSet<>(); - expectedSpans.add(Range.between(0, 12)); - expectedSpans.add(Range.between(13, 39)); - expectedSpans.add(Range.between(40, 64)); - expectedSpans.add(Range.between(65, 123)); - expectedSpans.add(Range.between(124, 134)); - expectedSpans.add(Range.between(135, 155)); - - FSIterator it = jCas.getAnnotationIndex(Sentence.type).iterator(); - assertTrue(it.hasNext()); - while (it.hasNext()) { - Annotation sentence = it.next(); - Range sentenceRange = 
Range.between(sentence.getBegin(), sentence.getEnd()); - assertTrue("Range " + sentenceRange + " was not expected", expectedSpans.remove(sentenceRange)); - } - assertTrue(expectedSpans.isEmpty()); - } - - @Test - public void testSentenceWhitespaces() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-types"); - - // This text is taken from pmid 23092121 - jCas.setDocumentText(" : We present a theoretical study of the electronic subband structure and collective electronic excitation associated with plasmon and surface plasmon modes in metal-based hollow nanosphere. The dependence of the electronic subband energy on the sample parameters of the hollow nanosphere is examined."); - - AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "de/julielab/jcore/ae/jsbd/model/test-model.gz"); - - jsbd.process(jCas.getCas()); + /** + * Logger for this class + */ + private static final Logger LOGGER = LoggerFactory.getLogger(SentenceAnnotatorTest.class); + + private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; + + // uncomment to test with/without scope + // private static final String DESCRIPTOR = + // "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml"; + private static final String DESCRIPTOR = "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml"; + + // last sentence has no EOS symbol to test that also this is handled + // correctly + private static final String[] TEST_TEXT = {"First sentence. Second \t sentence! \n Last sentence?", + "Hallo, jemand da? Nein, niemand.", "A test. It can't be just one sentence. 
Testing the test.", ""}; + + private static final String[] TEST_TEXT_OFFSETS = {"0-15;16-34;40-54", "0-17;18-32", "0-7;8-38;39-56", ""}; + + private static final int[] endOffsets = {54, 32, 27, 0}; + + /** + * Use the model in resources, split the text in TEST_TEXT and compare the + * split result against TEST_TEXT_OFFSETS + */ + @Test + public void testProcess() { + + boolean annotationsOK = true; + + XMLInputSource sentenceXML = null; + ResourceSpecifier sentenceSpec = null; + AnalysisEngine sentenceAnnotator = null; + + try { + sentenceXML = new XMLInputSource(DESCRIPTOR); + sentenceSpec = UIMAFramework.getXMLParser().parseResourceSpecifier(sentenceXML); + sentenceAnnotator = UIMAFramework.produceAnalysisEngine(sentenceSpec); + } catch (Exception e) { + LOGGER.error("testProcess()", e); + } + + for (int i = 0; i < TEST_TEXT.length; i++) { + + JCas jcas = null; + try { + jcas = sentenceAnnotator.newJCas(); + } catch (ResourceInitializationException e) { + LOGGER.error("testProcess()", e); + } + + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("testProcess() - testing text: " + TEST_TEXT[i]); + } + jcas.setDocumentText(TEST_TEXT[i]); + + // make one test scope ranging over complete document text + // annotations for the processing scope + TestScope scope1 = new TestScope(jcas, 0, endOffsets[i]); + scope1.addToIndexes(); + // TestScope scope2 = new TestScope(jcas,37,54); + + + try { + sentenceAnnotator.process(jcas, null); + } catch (Exception e) { + LOGGER.error("testProcess()", e); + } + + // get the offsets of the sentences + JFSIndexRepository indexes = jcas.getJFSIndexRepository(); + Iterator sentIter = indexes.getAnnotationIndex(Sentence.type).iterator(); + + String predictedOffsets = getPredictedOffsets(i, sentIter); + + // compare offsets + if (!predictedOffsets.equals(TEST_TEXT_OFFSETS[i])) { + annotationsOK = false; + continue; + } + } + assertTrue(annotationsOK); + } + + + private String getPredictedOffsets(int i, Iterator sentIter) { + String 
predictedOffsets = ""; + while (sentIter.hasNext()) { + Sentence s = (Sentence) sentIter.next(); + LOGGER.debug("sentence: " + s.getCoveredText() + ": " + s.getBegin() + " - " + s.getEnd()); + predictedOffsets += (predictedOffsets.length() > 0) ? ";" : ""; + predictedOffsets += s.getBegin() + "-" + s.getEnd(); + } + + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("testProcess() - predicted: " + predictedOffsets); + } + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("testProcess() - wanted: " + TEST_TEXT_OFFSETS[i]); + } + return predictedOffsets; + } + + @Test + public void testUimaFitIntegration() throws UIMAException, IOException { + AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, + SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz", + SentenceAnnotator.PARAM_POSTPROCESSING, "biomed"); + JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); + String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8"); + cas.setDocumentText(abstractText); + sentenceAE.process(cas); + Collection sentences = JCasUtil.select(cas, Sentence.class); + for (Sentence sentence : sentences) { + System.out.println(sentence.getCoveredText()); + } + assertEquals(14, sentences.size()); + } + + @Test + public void testModelClassPathResource() throws Exception { + AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, + SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz", + SentenceAnnotator.PARAM_POSTPROCESSING, "biomed"); + JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); + String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8"); + cas.setDocumentText(abstractText); + sentenceAE.process(cas); + Collection sentences = JCasUtil.select(cas, Sentence.class); + 
System.out.println(sentences.size()); + for (Sentence sentence : sentences) { + System.out.println(sentence.getCoveredText()); + } + assertEquals(14, sentences.size()); + } + + @Test + public void testSentenceDelimiterTypes() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + + jCas.setDocumentText("Introduction " + "We here show good results. This is a figure caption " + + "And this is a paragraph without a fullstop for some reason " + "Conclusion " + + "We are the greatest."); + Title t1 = new Title(jCas, 0, 12); + Caption c = new Caption(jCas, 40, 64); + Paragraph p = new Paragraph(jCas, 65, 123); + Title t2 = new Title(jCas, 124, 134); + t1.addToIndexes(); + c.addToIndexes(); + p.addToIndexes(); + t2.addToIndexes(); + assertEquals("Introduction", t1.getCoveredText()); + assertEquals("This is a figure caption", c.getCoveredText()); + assertEquals("And this is a paragraph without a fullstop for some reason", p.getCoveredText()); + assertEquals("Conclusion", t2.getCoveredText()); + + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, + new LinkedHashSet( + Arrays.asList(Title.class.getName(), Caption.class.getName(), Paragraph.class.getName()))); + + jsbd.process(jCas.getCas()); + + Set> expectedSpans = new HashSet<>(); + expectedSpans.add(Range.between(0, 12)); + expectedSpans.add(Range.between(13, 39)); + expectedSpans.add(Range.between(40, 64)); + expectedSpans.add(Range.between(65, 123)); + expectedSpans.add(Range.between(124, 134)); + expectedSpans.add(Range.between(135, 155)); + + FSIterator it = jCas.getAnnotationIndex(Sentence.type).iterator(); + assertTrue(it.hasNext()); + while (it.hasNext()) { + Annotation sentence = it.next(); + Range sentenceRange = 
Range.between(sentence.getBegin(), sentence.getEnd()); + assertTrue(expectedSpans.remove(sentenceRange), "Range " + sentenceRange + " was not expected"); + } + assertTrue(expectedSpans.isEmpty()); + } + + @Test + public void testSentenceWhitespaces() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + + // This text is taken from pmid 23092121 + jCas.setDocumentText(" : We present a theoretical study of the electronic subband structure and collective electronic excitation associated with plasmon and surface plasmon modes in metal-based hollow nanosphere. The dependence of the electronic subband energy on the sample parameters of the hollow nanosphere is examined."); + + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz"); + + jsbd.process(jCas.getCas()); Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next(); assertFalse(sentence.getCoveredText().startsWith(" ")); } - @Test - public void testTrailingNewline() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-types"); + @Test + public void testTrailingNewline() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); - // This text is taken from PMC3408706. Note the "paragraph separator" at the end - jCas.setDocumentText("In1 the next step, we plan to use higher level QM/MM methods to calculate the energy barrier of the reaction catalyzed by endonuclease APE1, in compliance with the mechanism proposed, and to screen for effective inhibitors with the use of the constructed mechanistic full-atomic model of the enzyme. 
\u2029"); + // This text is taken from PMC3408706. Note the "paragraph separator" at the end + jCas.setDocumentText("In1 the next step, we plan to use higher level QM/MM methods to calculate the energy barrier of the reaction catalyzed by endonuclease APE1, in compliance with the mechanism proposed, and to screen for effective inhibitors with the use of the constructed mechanistic full-atomic model of the enzyme. \u2029"); new InternalReference(jCas, 2, 3).addToIndexes(); - AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{InternalReference.class.getCanonicalName()}); - - jsbd.process(jCas.getCas()); - - - Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next(); - assertFalse(sentence.getCoveredText().endsWith("\u2029")); - } - - @Test - public void testSplitAtNewlines() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-types"); - - String ls = System.getProperty("line.separator"); - jCas.setDocumentText("line1"+ls+"line2"+ls+"line3"); - - AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_ALWAYS_SPLIT_NEWLINE, true); - - jsbd.process(jCas.getCas()); - - - Collection sentences = JCasUtil.select(jCas, Sentence.class).stream().map(Annotation::getCoveredText).collect(Collectors.toList()); - assertThat(sentences).containsExactly("line1", "line2", "line3"); - } - -// -// @Test -// public void testmuh() throws Exception { -// JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", -// "de.julielab.jcore.types.jcore-document-structure-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", 
-// "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); -// -// XmiCasDeserializer.deserialize(new FileInputStream("/Users/faessler/uima-pipelines/jedis-doc-to-xmi/data/output-xmi/4768370.xmi"), jCas.getCas()); -// JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); -// AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, -// "/Users/faessler/Coding/git/jcore-projects/jcore-jsbd-ae-biomedical-english/src/main/resources/de/julielab/jcore/ae/jsbd/model/jsbd-biomed-oversampled-abstracts-split-at-punctuation.mod.gz", SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000); -// -// jsbd.process(jCas.getCas()); -// -// Set set = new TreeSet<>(); -// for (Sentence s : JCasUtil.select(jCas, Sentence.class)) { -// set.add(s.getEnd() - s.getBegin()); -// } -// XmiCasSerializer.serialize(jCas.getCas(), new FileOutputStream("smallSentences.xmi")); -// } + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{InternalReference.class.getCanonicalName()}); + + jsbd.process(jCas.getCas()); + + + Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next(); + assertFalse(sentence.getCoveredText().endsWith("\u2029")); + } + + @Test + public void testSplitAtNewlines() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + + String ls = System.getProperty("line.separator"); + jCas.setDocumentText("line1" + ls + "line2" + ls + "line3"); + + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_ALWAYS_SPLIT_NEWLINE, true); + + 
jsbd.process(jCas.getCas()); + + + Collection sentences = JCasUtil.select(jCas, Sentence.class).stream().map(Annotation::getCoveredText).collect(Collectors.toList()); + assertThat(sentences).containsExactly("line1", "line2", "line3"); + } + + + @Test + public void testErrordoc() throws Exception { + // The XMI document uses here is from PMC and is an example of a source of error the previously occurred. + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); + + XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC5478802.xmi").toFile()), jCas.getCas()); + JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", + SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, + SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ + "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, + SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()} + ); + + assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException(); + } + + @Test + public void testErrordoc2() throws Exception { + // This XMI file has larger cut away types where an original offset request actually lies inside of a + // cut away 
annotation. This case led to errors prior to a respective bug fix in the + // JCoReCondensedDocumentText + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); + + XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC8205280.xmi").toFile()), jCas.getCas()); + JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", + SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, + SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ + "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, + SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()} + ); + + assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException(); + } + + @Test + public void testErrordoc3() throws Exception { + // This document has multiple sentences that begin with a Figure reference mention ("Figure 2 shows..."). + // By cutting away all the internal reference annotation spans for sentence tagging, the "Figure 2" was + // ultimately appended to the previous sentence, causing errors. Thus, the option to omit internal references + // with letters was added to the condensed document text. 
This is a test that everything is working as intended. + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types"); + + XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC5070457.xmi").toFile()), jCas.getCas()); + JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", + SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, + SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ + "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, + SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()} + ); + assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException(); + Collection sentences = JCasUtil.select(jCas, Sentence.class); + for (var s : sentences) { + String coveredText = s.getCoveredText(); + if (coveredText.contains("They concluded")) + assertThat(coveredText).endsWith("filament19."); + } + } } diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml 
b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml index 66314d4bf..0bcda6a91 100644 --- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml +++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml @@ -6,7 +6,7 @@ JCoRe Sentence Annotator This is the UIMA Wrapper for the JULIE Sentence Boundary Detector. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml index 63b003324..41089e381 100644 --- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml +++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml @@ -6,7 +6,7 @@ JCoRe Sentence Annotator This is the UIMA Wrapper for the JULIE Sentence Boundary Detector. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany @@ -54,6 +54,7 @@ + diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml index 282896d88..8b95a7994 100644 --- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml +++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml @@ -2,7 +2,7 @@ test-entity-type.xml A mini type system with one type only, used for testing consistency preservation - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi new file mode 100644 index 000000000..dd0c227ca --- /dev/null +++ b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi @@ -0,0 +1,5 @@ + +PMC5070457 \ No newline at end of file diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi new file mode 100644 index 000000000..c4d8ca95a --- /dev/null +++ b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi @@ -0,0 +1,5 @@ + +PMC5478802 \ No newline at end of file diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi b/jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi new file mode 100644 index 000000000..b2063eca5 --- /dev/null +++ b/jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/README.md b/jcore-jsbd-ae/src/test/resources/errordocs/README.md new file mode 100644 index 000000000..d2278611f --- /dev/null +++ b/jcore-jsbd-ae/src/test/resources/errordocs/README.md @@ -0,0 +1,4 @@ +# Errored Documents for Tests + +Documents in this directory were subject of sentence splitting errors. The errors are fixed +using the documents in a test. 
\ No newline at end of file diff --git a/jcore-jtbd-ae/component.meta b/jcore-jtbd-ae/component.meta index 377c042d7..0cd1c8929 100644 --- a/jcore-jtbd-ae/component.meta +++ b/jcore-jtbd-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-jtbd-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Token Annotator" } diff --git a/jcore-jtbd-ae/pom.xml b/jcore-jtbd-ae/pom.xml index 03523ba12..e811fa22f 100644 --- a/jcore-jtbd-ae/pom.xml +++ b/jcore-jtbd-ae/pom.xml @@ -10,14 +10,15 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 + org.apache.maven.plugins maven-assembly-plugin - 2.4 + 3.3.0 jar-with-dependencies @@ -85,14 +86,25 @@ jcore-types ${jcore-types-version} + + de.julielab + jcore-utilities + ${jcore-utilities-version} + cc.mallet mallet 2.0.8 + + + junit + junit + + - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Token Annotator diff --git a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java index c52e1ad12..833f97e8f 100755 --- a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java +++ b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java @@ -524,7 +524,7 @@ else if (superUnitRep.length() <= 8) // check whether superunit might be a chemical // therefor we check the number typical special characters contained - if ((superUnitRep.length() > 6) + if ((superUnitRep.length() > 6 && superUnitRep.length() < 200) && superUnitRep.matches("(.*[\\W].*){5,}") && !superUnitRep.contains("-->")) token.setFeatureValue("SU_isChemical", 1); diff --git a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java index 1ddd664f7..c073983a2 100644 --- a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java +++ 
b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java @@ -26,6 +26,7 @@ import de.julielab.jcore.ae.jtbd.Unit; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; +import de.julielab.jcore.utility.JCoReTools; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -153,8 +154,12 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { int length = sentence.getEnd() - sentence .getBegin(); LOGGER.debug("going to next sentence having length: " + length); - if (length > 1000) - LOGGER.warn("Current sentence has length {}.", length); + if (length > 1000) { + if (LOGGER.isWarnEnabled()) { + String docId = JCoReTools.getDocId(aJCas); + LOGGER.warn("Current sentence has length {} (document ID {}).", length, docId); + } + } final String text = sentence.getCoveredText(); writeTokensToCAS(text, sentence.getBegin(), aJCas); } diff --git a/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml b/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml index 337463371..3e8e5a5e0 100644 --- a/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml +++ b/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml @@ -6,7 +6,7 @@ JCoRe Token Annotator - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java index 46d4826c1..140945584 100644 --- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java +++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java @@ -17,19 +17,22 @@ package de.julielab.jcore.ae.jtbd; -import junit.framework.TestCase; +import 
org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; -public class Sentence2TokenPipeTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class Sentence2TokenPipeTest { private static final Logger LOGGER = LoggerFactory .getLogger(Sentence2TokenPipeTest.class); private static final String TEST_SENTENCE = "this is a \t junit -test"; + @Test public void testMakeLabel() { final ArrayList expectedLabels = new ArrayList(); expectedLabels.add("P"); @@ -55,6 +58,7 @@ public void testMakeLabel() { assertTrue(allOK); } + @Test public void testMakeUnits() { final ArrayList expectedUnits = new ArrayList(); expectedUnits.add("this"); diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java index c953307c1..e99c1f2f2 100644 --- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java +++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java @@ -24,7 +24,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,7 +39,7 @@ import java.util.List; import java.util.stream.Collectors; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; /** * Test for the class {@link Tokenizer} diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java index 4e3dfe9b3..543abf443 100644 --- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java +++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java @@ -18,7 +18,6 @@ import 
de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -26,13 +25,15 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Iterator; -public class TokenAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TokenAnnotatorTest { /** * Logger for this class diff --git a/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml b/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml index 6a670af49..415da5d4c 100644 --- a/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml +++ b/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml @@ -6,7 +6,7 @@ JCoRe Token Annotator - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-julielab-entity-evaluator-consumer/component.meta b/jcore-julielab-entity-evaluator-consumer/component.meta index 9ffe2edc3..78d9a4f68 100644 --- a/jcore-julielab-entity-evaluator-consumer/component.meta +++ b/jcore-julielab-entity-evaluator-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-julielab-entity-evaluator-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe JULIE Lab Entity Evaluator Converter" } diff --git a/jcore-julielab-entity-evaluator-consumer/pom.xml b/jcore-julielab-entity-evaluator-consumer/pom.xml index 7ad4d9597..4b5547be5 100644 --- a/jcore-julielab-entity-evaluator-consumer/pom.xml +++ 
b/jcore-julielab-entity-evaluator-consumer/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-julielab-entity-evaluator-consumer JCoRe JULIE Lab Entity Evaluator Converter @@ -18,7 +18,7 @@ de.julielab julielab-entity-evaluator - 1.2.0 + 1.3.0 de.julielab @@ -45,8 +45,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine org.apache.commons diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java index b92b32ad1..5dadad803 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java +++ b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java @@ -56,6 +56,7 @@ public class EntityEvaluatorConsumer extends JCasAnnotator_ImplBase { public static final String PARAM_TYPE_PREFIX = "TypePrefix"; public final static String PARAM_ENTITY_TYPES = "EntityTypes"; public static final String PARAM_FEATURE_FILTERS = "FeatureFilters"; + public static final String PARAM_ALLOW_REGEX_FOR_FILTERS = "AllowRegexForFilters"; public final static String PARAM_OFFSET_MODE = "OffsetMode"; public final static String PARAM_OFFSET_SCOPE = "OffsetScope"; public final static String PARAM_OUTPUT_FILE = "OutputFile"; @@ -77,6 +78,8 @@ public class EntityEvaluatorConsumer extends JCasAnnotator_ImplBase { private String typePrefix; @ConfigurationParameter(name = PARAM_FEATURE_FILTERS, mandatory = false, description = "Optional. Only lets those entities contribute to the output file that fulfill the given feature value(s). The syntax is :=. The ':' prefix is optional. If omitted, the filters will be applied to all entities given in the " + PARAM_ENTITY_TYPES + " parameter. 
An arbitrary number of filter expressions may be specified. In such cases, it is important to understand the boolean structure after which the expressions are evaluated in order to omit an annotation or take it into account for the output. The filter expressions are first grouped by feature path. Within such a group, the filter values form a disjunction. Thus, if any filter in a group is satisfied, the whole group is satisfied. The different groups form a conjunction. Thus, if any group is not satisfied, the whole conjunction is unsatisfied and the respective annotation will be omitted from output.") private String[] featureFilterDefinitions; + @ConfigurationParameter(name = PARAM_ALLOW_REGEX_FOR_FILTERS, mandatory = false, description = "Optional. If set to true, the filter values specified with the " + PARAM_FEATURE_FILTERS + " parameter are interpreted as regular expressions. The actual feature values are than matched by regular expression resolution instead of testing string equality.") + boolean allowRegexForFilters; @ConfigurationParameter(name = PARAM_OUTPUT_FILE, description = "Output file to which all entity information is written in the format\n" + "docId EGID begin end confidence\n" + "Where the fields are separated by tab stops. 
If the file name ends with .gz, the output file will automatically be gzipped.") @@ -157,13 +160,11 @@ private void addOffsetsColumn(JCas aJCas) { } private void addDocumentIdColumn(JCas aJCas) throws CASException { - if (outputColumnNames.contains(DOCUMENT_ID_COLUMN)) { - Column c = columns.get(DOCUMENT_ID_COLUMN); - if (c == null) - c = new Column(DOCUMENT_ID_COLUMN + ":" + Header.class.getCanonicalName() + "=/docId", null, aJCas.getTypeSystem()); - c = new DocumentIdColumn(c); - columns.put(DOCUMENT_ID_COLUMN, c); - } + Column c = columns.get(DOCUMENT_ID_COLUMN); + if (c == null) + c = new Column(DOCUMENT_ID_COLUMN + ":" + Header.class.getCanonicalName() + "=/docId", null, aJCas.getTypeSystem()); + c = new DocumentIdColumn(c); + columns.put(DOCUMENT_ID_COLUMN, c); } private void addDocumentTextSha256Column() { @@ -183,7 +184,7 @@ private void addSentenceIdColumn(JCas aJCas) throws CASException { Column docIdColumn = columns.get(DOCUMENT_ID_COLUMN); String documentId = null; if (docIdColumn != null) - documentId = docIdColumn.getValue(aJCas.getDocumentAnnotationFs(), aJCas).getFirst(); + documentId = docIdColumn.getValue(null, aJCas).getFirst(); Type sentenceType = c.getSingleType(); // put all sentences into an index with an // overlap-comparator - this way the index can be @@ -249,10 +250,11 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept super.initialize(aContext); outputColumnNamesArray = (String[]) aContext.getConfigParameterValue(PARAM_OUTPUT_COLUMNS); - columnDefinitionDescriptions = (String[]) aContext.getConfigParameterValue(PARAM_COLUMN_DEFINITIONS); + columnDefinitionDescriptions = Optional.ofNullable((String[]) aContext.getConfigParameterValue(PARAM_COLUMN_DEFINITIONS)).orElse(new String[0]); typePrefix = (String) aContext.getConfigParameterValue(PARAM_TYPE_PREFIX); featureFilterDefinitions = (String[]) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_FEATURE_FILTERS)).orElse(new String[0]); + 
allowRegexForFilters = (Boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ALLOW_REGEX_FOR_FILTERS)).orElse(false); outputFilePath = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_FILE); appendThreadNameToOutputFile = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_APPEND_THREAD_NAME_TO_OUTPUT_FILE)).orElse(false); entityTypeStrings = (String[]) aContext.getConfigParameterValue(PARAM_ENTITY_TYPES); @@ -265,7 +267,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept offsetMode = null == offsetModeStr ? OffsetMode.CharacterSpan : OffsetMode.valueOf(offsetModeStr); if (null == offsetScopeStr) { - offsetScope = outputColumnNames.contains(SENTENCE_ID_COLUMN) ? OffsetScope.Sentence : OffsetScope.Document; + offsetScope = OffsetScope.Document; } else { offsetScope = OffsetScope.valueOf(offsetScopeStr); } @@ -281,6 +283,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept log.info("{}: {}", PARAM_OUTPUT_COLUMNS, outputColumnNames); log.info("{}: {}", PARAM_COLUMN_DEFINITIONS, columnDefinitionDescriptions); log.info("{}: {}", PARAM_FEATURE_FILTERS, featureFilterDefinitions); + log.info("{}: {}", PARAM_ALLOW_REGEX_FOR_FILTERS, allowRegexForFilters); log.info("{}: {}", PARAM_ENTITY_TYPES, entityTypeStrings); log.info("{}: {}", PARAM_TYPE_PREFIX, typePrefix); log.info("{}: {}", PARAM_OUTPUT_FILE, outputFilePath); @@ -329,7 +332,7 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { throw new IllegalArgumentException("No entity names are given, neither by the " + PARAM_ENTITY_TYPES + " parameter nor in the " + PARAM_COLUMN_DEFINITIONS + " parameter."); removeSubsumedTypes(entityTypes, ts); - featureFilters = Stream.of(featureFilterDefinitions).map(d -> new FeatureValueFilter(d, typePrefix, ts)).collect(Collectors.groupingBy(filter -> filter.getPathValuePair().fp.getFeaturePath())); + featureFilters = Stream.of(featureFilterDefinitions).map(d -> 
new FeatureValueFilter(d, typePrefix, ts, allowRegexForFilters)).collect(Collectors.groupingBy(filter -> filter.getPathValuePair().fp.getFeaturePath())); addDocumentIdColumn(aJCas); addDocumentTextSha256Column(); diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/FeatureValueFilter.java b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/FeatureValueFilter.java index c84ba2ade..25a1a25d2 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/FeatureValueFilter.java +++ b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/FeatureValueFilter.java @@ -17,6 +17,7 @@ import java.util.Collections; import java.util.Set; +import java.util.function.BiFunction; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -27,10 +28,12 @@ public class FeatureValueFilter { protected Set types; protected PathValuePair pathValuePair; private Matcher mfull; + private BiFunction featureValueMatchTest; - public FeatureValueFilter(String columnDefinition, String typePrefix, TypeSystem ts) { + public FeatureValueFilter(String columnDefinition, String typePrefix, TypeSystem ts, boolean allowRegexForFilters) { this(); parseAndAddDefinition(columnDefinition, typePrefix, ts); + featureValueMatchTest = allowRegexForFilters ? 
String::matches : String::equals; } public FeatureValueFilter() { @@ -60,7 +63,7 @@ public boolean contradictsFeatureFilter(TOP a) { return false; String fpValue = pathValuePair.fp.getValueAsString(a); if (fpValue != null) - return pathValuePair.targetValue == null || !fpValue.equals(pathValuePair.targetValue); + return pathValuePair.targetValue == null || !featureValueMatchTest.apply(fpValue, pathValuePair.targetValue); return pathValuePair.targetValue != null; } diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/OffsetsColumn.java b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/OffsetsColumn.java index 44d08b055..0b5c599d5 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/OffsetsColumn.java +++ b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/OffsetsColumn.java @@ -15,11 +15,13 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.TOP; import org.apache.uima.jcas.tcas.Annotation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.*; public class OffsetsColumn extends Column { - +private final static Logger log = LoggerFactory.getLogger(OffsetsColumn.class); private OffsetMode offsetMode; private JCoReTreeMapAnnotationIndex sentenceIndex; private OffsetScope offsetScope; @@ -61,9 +63,14 @@ public Deque getValue(TOP a, JCas aJCas) { if (offsetScope == OffsetScope.Sentence) { Annotation s = sentenceIndex.get(an); - if (this.offsetMode == OffsetMode.NonWsCharacters) - numWsMap = getNumWsMapForSentence(s); - annotationOffset = s.getBegin(); + if (s != null) { + if (this.offsetMode == OffsetMode.NonWsCharacters) + numWsMap = getNumWsMapForSentence(s); + annotationOffset = s.getBegin(); + } else { + log.warn("There was no sentence for annotation {}, returning begin offset as -1.", an); + annotationOffset = -1; 
+ } } final String offsets = getOffsets(an, numWsMap, annotationOffset); diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml b/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml index 4ffda6700..f46b9c244 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml +++ b/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml @@ -6,7 +6,7 @@ JCoRe Entity Evaluator and TSV Consumer This component was originally created to output the tab separated format used the JULIE Entity Evaluator. However, this component can be used to create a TSV file from any annotation or annotation set. The component allows to define columns by specifying the annotation type to draw feature values from and a feature path that specifies the location of the desired feature. All feature paths will be applied to each configured annotation, returning null values if an annotation does not exhibit a value for a column's feature path. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany @@ -58,6 +58,13 @@ true false + + AllowRegexForFilters + Optional. If set to true, the filter values specified with the FeatureFilters parameter are interpreted as regular expressions. The actual feature values are than matched by regular expression resolution instead of testing string equality. 
+ Boolean + false + false + OutputFile Output file to which all entity information is written in the format diff --git a/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java b/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java index 69010da56..b50a25edd 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java +++ b/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java @@ -15,6 +15,7 @@ import de.julielab.jcore.types.pubmed.ManualDescriptor; import de.julielab.jcore.utility.JCoReTools; import org.apache.commons.codec.binary.Base64; +import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; @@ -22,7 +23,7 @@ import org.apache.uima.jcas.cas.DoubleArray; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.cas.StringArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.ByteArrayInputStream; import java.io.File; @@ -34,16 +35,14 @@ import java.util.zip.GZIPInputStream; import static de.julielab.jcore.consumer.entityevaluator.EntityEvaluatorConsumer.*; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EntityEvaluatorConsumerTest { @Test public void testEntityEvaluatorConsumerSingleEntity() throws Exception { - JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", - "de.julielab.jcore.types.jcore-semantics-biology-types", - 
"de.julielab.jcore.types.jcore-document-meta-types"); + JCas jcas = getjCas(); AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, PARAM_COLUMN_DEFINITIONS, new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", @@ -75,11 +74,49 @@ public void testEntityEvaluatorConsumerSingleEntity() throws Exception { assertEquals("document1 document1:0 23 gene", lines.get(0)); } + private JCas getjCas() throws UIMAException { + return JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", + "de.julielab.jcore.types.jcore-semantics-biology-types", + "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + } + + @Test + public void testEntityEvaluatorConsumerSingleEntity2() throws Exception { + // The same test as above but minus the DocumentId column + JCas jcas = getjCas(); + AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, + PARAM_COLUMN_DEFINITIONS, + new String[] { "geneid:Gene=/resourceEntryList[0]/entryId", "name:/:coveredText()" }, + // We here use the default SentenceId column, we did not provide a definition! 
+ PARAM_OUTPUT_COLUMNS, new String[] { SENTENCE_ID_COLUMN, "geneid", "name" }, + PARAM_TYPE_PREFIX, "de.julielab.jcore.types", PARAM_OUTPUT_FILE, "src/test/resources/outfile-test.tsv"); + + jcas.setDocumentText("One gene one sentence."); + Header h = new Header(jcas); + h.setDocId("document1"); + h.addToIndexes(); + Sentence s = new Sentence(jcas, 0, jcas.getDocumentText().length()); + s.setId("sentence1"); + s.addToIndexes(); + Gene g = new Gene(jcas, 4, 8); + GeneResourceEntry re = new GeneResourceEntry(jcas); + re.setEntryId("23"); + FSArray array = new FSArray(jcas, 1); + array.set(0, re); + g.setResourceEntryList(array); + g.addToIndexes(); + + consumer.process(jcas.getCas()); + consumer.collectionProcessComplete(); + + List lines = Files.readLines(new File("src/test/resources/outfile-test.tsv"), Charset.forName("UTF-8")); + assertEquals(1, lines.size()); + assertEquals("document1:0 23 gene", lines.get(0)); + } + @Test public void testEntityEvaluatorConsumerNoEntities() throws Exception { - JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", - "de.julielab.jcore.types.jcore-semantics-biology-types", - "de.julielab.jcore.types.jcore-document-meta-types"); + JCas jcas = getjCas(); AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, PARAM_COLUMN_DEFINITIONS, new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", @@ -107,9 +144,7 @@ public void testEntityEvaluatorConsumerNoEntities() throws Exception { @Test public void testEntityEvaluatorConsumerSingleEntityDocumentTextHash() throws Exception { - JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", - "de.julielab.jcore.types.jcore-semantics-biology-types", - "de.julielab.jcore.types.jcore-document-meta-types"); + JCas jcas = getjCas(); AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, PARAM_COLUMN_DEFINITIONS, new String[] { @@ -143,9 +178,7 @@ public 
void testEntityEvaluatorConsumerSingleEntityDocumentTextHash() throws Exc @Test public void testEntityEvaluatorConsumerMultipleEntities() throws Exception { - JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", - "de.julielab.jcore.types.jcore-semantics-biology-types", - "de.julielab.jcore.types.jcore-document-meta-types"); + JCas jcas = getjCas(); AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, PARAM_COLUMN_DEFINITIONS, new String[] { SENTENCE_ID_COLUMN + ": Sentence=/id", @@ -180,9 +213,7 @@ public void testEntityEvaluatorConsumerMultipleEntities() throws Exception { @Test public void testEntityEvaluatorConsumerSingleEntityNoWSOffsets() throws Exception { - JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", - "de.julielab.jcore.types.jcore-semantics-biology-types", - "de.julielab.jcore.types.jcore-document-meta-types"); + JCas jcas = getjCas(); AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, PARAM_COLUMN_DEFINITIONS, new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", SENTENCE_ID_COLUMN + ": Sentence=/id", @@ -219,9 +250,7 @@ public void testEntityEvaluatorConsumerSuperType() throws Exception { // other, e.g. EntityMention and Gene, then we don't want to traverse // the subsumed types on their own. They are contained in the annotation // index of their super type. 
- JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", - "de.julielab.jcore.types.jcore-semantics-biology-types", - "de.julielab.jcore.types.jcore-document-meta-types"); + JCas jcas = getjCas(); AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, PARAM_COLUMN_DEFINITIONS, new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", SENTENCE_ID_COLUMN + ": Sentence=/id", @@ -261,23 +290,21 @@ public void testCreateNonWsOffsetMap() throws Exception { TreeMap numWsMap = (TreeMap) method.invoke(null, "one two three"); // first check the actual map entries (after each white space position // there should be an entry) - assertEquals(new Integer(0), numWsMap.get(0)); - assertEquals(new Integer(1), numWsMap.get(4)); - assertEquals(new Integer(2), numWsMap.get(8)); + assertEquals(Integer.valueOf(0), numWsMap.get(0)); + assertEquals(Integer.valueOf(1), numWsMap.get(4)); + assertEquals(Integer.valueOf(2), numWsMap.get(8)); // now check the intended use; using the floor element, we should be // able to the correct value even for those positions we don't have an // explicit mapping for - assertEquals(new Integer(0), numWsMap.floorEntry(2).getValue()); - assertEquals(new Integer(1), numWsMap.floorEntry(5).getValue()); - assertEquals(new Integer(2), numWsMap.floorEntry(11).getValue()); + assertEquals(Integer.valueOf(0), numWsMap.floorEntry(2).getValue()); + assertEquals(Integer.valueOf(1), numWsMap.floorEntry(5).getValue()); + assertEquals(Integer.valueOf(2), numWsMap.floorEntry(11).getValue()); } @Test public void testEntityEvaluatorConsumerFeatureFilter() throws Exception { - JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", - "de.julielab.jcore.types.jcore-semantics-biology-types", - "de.julielab.jcore.types.jcore-document-meta-types"); + JCas jcas = getjCas(); AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, 
PARAM_COLUMN_DEFINITIONS, new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", SENTENCE_ID_COLUMN + ": Sentence=/id", @@ -320,6 +347,53 @@ public void testEntityEvaluatorConsumerFeatureFilter() throws Exception { assertEquals("document1 document1:0 42 One", lines.get(0)); } + @Test + public void testEntityEvaluatorConsumerFeatureFilterRegEx() throws Exception { + JCas jcas = getjCas(); + AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, + PARAM_COLUMN_DEFINITIONS, + new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", SENTENCE_ID_COLUMN + ": Sentence=/id", + "genetype:Gene=/specificType", "name:/:coveredText()" }, + PARAM_OUTPUT_COLUMNS, new String[] { DOCUMENT_ID_COLUMN, SENTENCE_ID_COLUMN, "genetype", "name" }, + PARAM_TYPE_PREFIX, "de.julielab.jcore.types", PARAM_OUTPUT_FILE, "src/test/resources/outfile-test.tsv", + PARAM_FEATURE_FILTERS, new String[] { "Gene:/specificType=Group[3-4]{2,3}s?" }, + PARAM_ALLOW_REGEX_FOR_FILTERS, true); + + jcas.setDocumentText("One gene one sentence."); + Header h = new Header(jcas); + h.setDocId("document1"); + h.addToIndexes(); + Sentence s = new Sentence(jcas, 0, jcas.getDocumentText().length()); + s.setId("sentence1"); + s.addToIndexes(); + { + Gene g = new Gene(jcas, 4, 8); + // should not pass filter + g.setSpecificType("Group123"); + g.addToIndexes(); + } + { + Gene g = new Gene(jcas, 0, 3); + // should pass filter + g.setSpecificType("Group33s"); + g.addToIndexes(); + } + { + Gene g = new Gene(jcas, 0, 3); + // should pass filter + g.setSpecificType("Group344"); + g.addToIndexes(); + } + + consumer.process(jcas.getCas()); + consumer.collectionProcessComplete(); + + List lines = Files.readLines(new File("src/test/resources/outfile-test.tsv"), Charset.forName("UTF-8")); + assertEquals(2, lines.size()); + assertEquals("document1 document1:0 Group33s One", lines.get(0)); + assertEquals("document1 document1:0 Group344 One", lines.get(1)); + } + @Test public void 
testParallelMultiValues() throws Exception { JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", @@ -367,9 +441,7 @@ public void testParallelMultiValues() throws Exception { @Test public void testCartesianMultiValues() throws Exception { - JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", - "de.julielab.jcore.types.jcore-semantics-biology-types", - "de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + JCas jcas = getjCas(); AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, PARAM_COLUMN_DEFINITIONS, new String[] { diff --git a/jcore-likelihood-assignment-ae/component.meta b/jcore-likelihood-assignment-ae/component.meta index 671dbf79e..f73f0297a 100644 --- a/jcore-likelihood-assignment-ae/component.meta +++ b/jcore-likelihood-assignment-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-likelihood-assignment-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Likelihood Assignment AE" } diff --git a/jcore-likelihood-assignment-ae/pom.xml b/jcore-likelihood-assignment-ae/pom.xml index e49c1a243..d28f1775b 100644 --- a/jcore-likelihood-assignment-ae/pom.xml +++ b/jcore-likelihood-assignment-ae/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -27,14 +27,19 @@ de.julielab jcore-descriptor-creator + + de.julielab + jcore-utilities + ${jcore-utilities-version} + de.julielab jcore-types ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Likelihood Assignment AE diff --git a/jcore-likelihood-assignment-ae/src/main/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotator.java b/jcore-likelihood-assignment-ae/src/main/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotator.java index 622c6cded..4c31a62f9 100644 --- 
a/jcore-likelihood-assignment-ae/src/main/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotator.java +++ b/jcore-likelihood-assignment-ae/src/main/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotator.java @@ -1,209 +1,298 @@ - package de.julielab.jcore.ae.likelihoodassignment; import de.julielab.jcore.types.ConceptMention; import de.julielab.jcore.types.LikelihoodIndicator; import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.utility.JCoReAnnotationIndexMerger; +import de.julielab.jcore.utility.JCoReAnnotationTools; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FSIterator; +import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.NavigableMap; -import java.util.TreeMap; +import java.util.*; -@ResourceMetaData(name="JCoRe Likelihood Assignment AE", description = "Analysis Engine to assign likelihood indicators to their corresponding entities and events.") -@TypeCapability(inputs="de.julielab.jcore.types.LikelihoodIndicator") +@ResourceMetaData(name = "JCoRe Likelihood Assignment AE", description = "Analysis Engine to assign likelihood indicators to their corresponding entities and events.") +@TypeCapability(inputs = "de.julielab.jcore.types.LikelihoodIndicator") public class LikelihoodAssignmentAnnotator extends JCasAnnotator_ImplBase { - private static final Logger LOGGER = LoggerFactory - 
.getLogger(LikelihoodAssignmentAnnotator.class); - - /** - * Maps sentence ends to sentence begins. - */ - private TreeMap sentMap; - /** - * Maps concept mentions to their begins. - */ - private TreeMap> conceptMap; - /** - * Maps likelihood indicators to their begins. - */ - private TreeMap likelihoodMap; - - /** - * Quantifies likelihood values. - */ - private HashMap likelihoodValueMap; - - public void initialize(UimaContext aContext) - throws ResourceInitializationException { - super.initialize(aContext); - - // ordinal scale for likelihood indicators; - // used when there are multiple occurrences (the lowest category is - // chosen) - likelihoodValueMap = new HashMap<>(); - likelihoodValueMap.put("negation", 1); - likelihoodValueMap.put("low", 2); - likelihoodValueMap.put("investigation", 3); - likelihoodValueMap.put("moderate", 4); - likelihoodValueMap.put("high", 5); - } - - @Override - public void process(JCas aJCas) throws AnalysisEngineProcessException { - assignLikelihood(aJCas); - } - - /** - * If a sentence contains a likelihood indicator, this indicator is assigned - * to all concept mentions occurring in the sentence. If a sentence does not - * contain a likelihood indicator, the default likelihood category (i.e. - * 'assertion') is assigned to all concept mentions occurring in the - * sentence. In case of multiple likelihood indicators the lowest likelihood - * category is chosen. 
- * - * @param aJCas - */ - private void assignLikelihood(JCas aJCas) { - buildTreeMaps(aJCas); - - // create default likelihood indicator for assertions (has begin = 0 and - // end = 0) - LikelihoodIndicator assertionIndicator = new LikelihoodIndicator(aJCas); - assertionIndicator.setLikelihood("assertion"); - assertionIndicator.setComponentId(this.getClass().getName()); - assertionIndicator.addToIndexes(); - - // iterate over sentences - for (int sentBegin : sentMap.keySet()) { - int sentEnd = sentMap.get(sentBegin); - boolean sentHasLikelihood = false; - boolean multipleLikelihood = false; - Integer firstLikelihoodBegin = 0; - Integer lastLikelihoodBegin = 0; - - // determine whether the sentence contains a likelihood indicator at - // all and whether it even contains multiple likelihood indicators - firstLikelihoodBegin = likelihoodMap.ceilingKey(sentBegin); - if (firstLikelihoodBegin != null) { - if (firstLikelihoodBegin > sentEnd) { - sentHasLikelihood = false; - } else { - sentHasLikelihood = true; - } - } - if (sentHasLikelihood == true) { - lastLikelihoodBegin = likelihoodMap.floorKey(sentEnd); - if (firstLikelihoodBegin == lastLikelihoodBegin) { - multipleLikelihood = false; - } else { - multipleLikelihood = true; - } - } - - // determine which likelihood category to assign to concept mentions - // in the sentence and create the corresponding likelihood indicator - LikelihoodIndicator assignedLikelihood = null; - if (sentHasLikelihood == true) { - if (multipleLikelihood = true) { - // determine the lowest likelihood category in the sentence - NavigableMap likelihoodSubMap = likelihoodMap - .subMap(firstLikelihoodBegin, true, - lastLikelihoodBegin, true); - int currentLikelihoodValue = 100; - for (int i : likelihoodSubMap.keySet()) { - LikelihoodIndicator likelihood = likelihoodSubMap - .get(i); - String likelihoodCat = likelihood.getLikelihood(); - int likelihoodValue = likelihoodValueMap - .get(likelihoodCat); - if (likelihoodValue < 
currentLikelihoodValue) { - assignedLikelihood = likelihood; - currentLikelihoodValue = likelihoodValue; - } - } - } else { - LikelihoodIndicator likelihood = likelihoodMap - .get(firstLikelihoodBegin); - assignedLikelihood = likelihood; - } - } else { - assignedLikelihood = assertionIndicator; - } - - // get all events in the sentence and assign the corresponding - // likelihood indicator - if (conceptMap.ceilingKey(sentBegin) != null) { - int firstConceptBegin = conceptMap.ceilingKey(sentBegin); - if (firstConceptBegin > sentEnd) { - continue; - } else { - int lastConceptBegin = conceptMap.floorKey(sentEnd); - NavigableMap> conceptSubMap = conceptMap - .subMap(firstConceptBegin, true, lastConceptBegin, - true); - for (int i : conceptSubMap.keySet()) { - ArrayList conceptList = conceptSubMap - .get(i); - for (ConceptMention concept : conceptList) { - concept.setLikelihood(assignedLikelihood); - } - } - } - } - } - } - - @SuppressWarnings("rawtypes") - public void buildTreeMaps(JCas aJCas) { - FSIterator sentIt = aJCas.getAnnotationIndex(Sentence.type).iterator(); - FSIterator conceptIt = aJCas.getAnnotationIndex(ConceptMention.type) - .iterator(); - FSIterator likelihoodIt = aJCas.getAnnotationIndex( - LikelihoodIndicator.type).iterator(); - - sentMap = new TreeMap(); - while (sentIt.hasNext()) { - Sentence sent = (Sentence) sentIt.next(); - int sentBegin = sent.getBegin(); - int sentEnd = sent.getEnd(); - sentMap.put(sentBegin, sentEnd); - } - - conceptMap = new TreeMap>(); - while (conceptIt.hasNext()) { - ConceptMention concept = (ConceptMention) conceptIt.next(); - int conceptBegin = concept.getBegin(); - if (conceptMap.containsKey(conceptBegin)) { - ArrayList conceptList = conceptMap - .get(conceptBegin); - conceptList.add(concept); - conceptMap.put(conceptBegin, conceptList); - } else { - ArrayList conceptList = new ArrayList(); - conceptList.add(concept); - conceptMap.put(conceptBegin, conceptList); - } - } - - likelihoodMap = new TreeMap(); - while 
(likelihoodIt.hasNext()) { - LikelihoodIndicator likelihood = (LikelihoodIndicator) likelihoodIt - .next(); - int likelihoodBegin = likelihood.getBegin(); - likelihoodMap.put(likelihoodBegin, likelihood); - } - } + public static final String PARAM_ASSIGNMENT_STRATEGY = "AssignmentStrategy"; + public static final String PARAM_CONCEPT_TYPE_NAME = "ConceptTypeName"; + public static final String STRATEGY_ALL = "all"; + public static final String STRATEGY_NEXT_CONCEPT = "next-concept"; + private static final Logger LOGGER = LoggerFactory + .getLogger(LikelihoodAssignmentAnnotator.class); + @ConfigurationParameter(name = PARAM_ASSIGNMENT_STRATEGY, mandatory = false, defaultValue = STRATEGY_NEXT_CONCEPT, description = "There are two available assignment strategies for likelihood indicators to ConceptMentions, '" + STRATEGY_ALL + "' and '" + STRATEGY_NEXT_CONCEPT + "'. The first, 'all', assigns the lowest likelihood indicator in a sentence to all ConceptMention in this sentence. The second assigns a likelihood indicator only to the directly following ConceptMention in the same sentence. The latter strategy fares a bit better in evaluations carried out for the publication of this approach. Defaults to '" + STRATEGY_NEXT_CONCEPT + "'.") + private String assignmentStrategy; + @ConfigurationParameter(name = PARAM_CONCEPT_TYPE_NAME, mandatory = false, defaultValue = "de.julielab.jcore.types.ConceptMention", description = "The qualified UIMA type name for the concept annotation for which likelihood assignment should be performed. Must be a subclass of de.julielab.jcore.types.ConceptMention. Defaults to de.julielab.jcore.types.ConceptMention.") + private String conceptTypeName; + /** + * Maps sentence ends to sentence begins. + */ + private TreeMap sentMap; + /** + * Maps concept mentions to their begins. + */ + private TreeMap> conceptMap; + /** + * Maps likelihood indicators to their begins. + */ + private TreeMap likelihoodMap; + + /** + * Quantifies likelihood values. 
+ */ + private HashMap likelihoodValueMap; + private ConceptMention conceptTypeTemplate; + + public void initialize(UimaContext aContext) + throws ResourceInitializationException { + super.initialize(aContext); + + assignmentStrategy = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ASSIGNMENT_STRATEGY)).orElse("next-concept"); + conceptTypeName = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_CONCEPT_TYPE_NAME)).orElse(ConceptMention.class.getCanonicalName()); + + // ordinal scale for likelihood indicators; + // used when there are multiple occurrences (the lowest category is + // chosen) + likelihoodValueMap = new HashMap<>(); + likelihoodValueMap.put("negation", 1); + likelihoodValueMap.put("low", 2); + likelihoodValueMap.put("investigation", 3); + likelihoodValueMap.put("moderate", 4); + likelihoodValueMap.put("high", 5); + } + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException { + if (conceptTypeTemplate == null) { + try { + conceptTypeTemplate = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, conceptTypeName); + } catch (Exception e) { + LOGGER.error("Could not obtain the specified concept UIMA type with name " + conceptTypeName + ".", e); + throw new AnalysisEngineProcessException(e); + } + } + // We have two strategies available for the assignment of likelihood indicators to ConceptMentions. + // Either the original one, implemented in 'assignLikelihood', where likelihood indicators in a sentence are + // assigned to all ConceptMentions in the same sentence or a simplified one that, according to + // Christine Engelmann, actually fared a bit better in evaluations, where a likelihood indicator is only + // assigned to the next following ConceptMention, implemented in 'assignLikelihoodToNextConceptMention'. 
+ if (assignmentStrategy.equalsIgnoreCase(STRATEGY_NEXT_CONCEPT)) + assignLikelihoodToNextConceptMention(aJCas); + else if (assignmentStrategy.equalsIgnoreCase(STRATEGY_ALL)) + assignLikelihood(aJCas); + else + throw new AnalysisEngineProcessException(new IllegalArgumentException("The " + PARAM_ASSIGNMENT_STRATEGY + " parameter requires one of two values, " + STRATEGY_ALL + " or " + STRATEGY_NEXT_CONCEPT + " but was set to " + assignmentStrategy + ".")); + } + + /** + *

Simple assignment strategy that sets the direct nearest previous likelihood indicator to each ConceptMention.

+ *

No other ConceptMention must stand in between because then, a previous ConceptMention would be assigned the + * likelihood indicator.

+ *

This strategy was proposed by Christine Engelmann because it fared a bit better in her evaluations than + * the alternative strategy implemented in {@link #assignLikelihood(JCas)}.

+ * + * @param aJCas The CAS to do likelihood assignment in. + * @throws AnalysisEngineProcessException If the creation of the {@link JCoReAnnotationIndexMerger}, that is used internally, fails. + */ + private void assignLikelihoodToNextConceptMention(JCas aJCas) throws AnalysisEngineProcessException { + // create default likelihood indicator for assertions (has begin = 0 and + // end = 0) + LikelihoodIndicator assertionIndicator = new LikelihoodIndicator(aJCas); + assertionIndicator.setLikelihood("assertion"); + assertionIndicator.setComponentId(this.getClass().getName()); + assertionIndicator.addToIndexes(); + + for (Sentence sentence : aJCas.getAnnotationIndex(Sentence.type)) { + // We use the annotation merger that gives us a sorted sequence of annotations of specified types. + // Then, we must only assign for each concept the directly preceding likelihood annotation, if there is one. + JCoReAnnotationIndexMerger merger; + try { + merger = new JCoReAnnotationIndexMerger(Set.of(JCasUtil.getAnnotationType(aJCas, conceptTypeTemplate.getClass()), JCasUtil.getAnnotationType(aJCas, LikelihoodIndicator.class)), true, sentence, aJCas); + } catch (ClassNotFoundException e) { + LOGGER.error("Could not create JCoReAnnotationIndexMerger", e); + throw new AnalysisEngineProcessException(e); + } + LikelihoodIndicator previousLikelihood = null; + boolean previousLikelihoodConsumed = false; + int lastAssignedCmBegin = 0; + int lastAssignedCmEnd = 0; + while (merger.incrementAnnotation()) { + final Annotation annotation = (Annotation) merger.getAnnotation(); + ConceptMention cm = null; + if (conceptTypeTemplate.getClass().isAssignableFrom(annotation.getClass())) { + cm = (ConceptMention) annotation; + // default likelihood is assertion + cm.setLikelihood(assertionIndicator); + } + // check if there is a likelihood annotation preceding the ConceptMention in this sentence without + // another ConceptMention in between - except when multiple ConceptMentions exist in the same 
offsets + // which is possible for EventMentions that exist on the EventTrigger annotation. The trigger may + // refer to multiple events. + if (cm != null && (previousLikelihood != null && (!previousLikelihoodConsumed || (lastAssignedCmBegin == cm.getBegin() && lastAssignedCmEnd == cm.getEnd())))) { + cm.setLikelihood(previousLikelihood); + // this likelihood indicator has been "consumed" + previousLikelihoodConsumed = true; + lastAssignedCmBegin = cm.getBegin(); + lastAssignedCmEnd = cm.getEnd(); + } + if (annotation instanceof LikelihoodIndicator) { + previousLikelihood = (LikelihoodIndicator) annotation; + previousLikelihoodConsumed = false; + } + } + } + } + + /** + * If a sentence contains a likelihood indicator, this indicator is assigned + * to all concept mentions occurring in the sentence. If a sentence does not + * contain a likelihood indicator, the default likelihood category (i.e. + * 'assertion') is assigned to all concept mentions occurring in the + * sentence. In case of multiple likelihood indicators the lowest likelihood + * category is chosen. 
+ * + * @param aJCas + */ + private void assignLikelihood(JCas aJCas) { + buildTreeMaps(aJCas); + + // create default likelihood indicator for assertions (has begin = 0 and + // end = 0) + LikelihoodIndicator assertionIndicator = new LikelihoodIndicator(aJCas); + assertionIndicator.setLikelihood("assertion"); + assertionIndicator.setComponentId(this.getClass().getName()); + assertionIndicator.addToIndexes(); + + // iterate over sentences + for (int sentBegin : sentMap.keySet()) { + int sentEnd = sentMap.get(sentBegin); + boolean sentHasLikelihood = false; + boolean multipleLikelihood = false; + Integer firstLikelihoodBegin = 0; + Integer lastLikelihoodBegin = 0; + + // determine whether the sentence contains a likelihood indicator at + // all and whether it even contains multiple likelihood indicators + firstLikelihoodBegin = likelihoodMap.ceilingKey(sentBegin); + if (firstLikelihoodBegin != null) { + if (firstLikelihoodBegin > sentEnd) { + sentHasLikelihood = false; + } else { + sentHasLikelihood = true; + } + } + if (sentHasLikelihood == true) { + lastLikelihoodBegin = likelihoodMap.floorKey(sentEnd); + if (firstLikelihoodBegin == lastLikelihoodBegin) { + multipleLikelihood = false; + } else { + multipleLikelihood = true; + } + } + + // determine which likelihood category to assign to concept mentions + // in the sentence and create the corresponding likelihood indicator + LikelihoodIndicator assignedLikelihood = null; + if (sentHasLikelihood == true) { + if (multipleLikelihood == true) { + // determine the lowest likelihood category in the sentence + NavigableMap likelihoodSubMap = likelihoodMap + .subMap(firstLikelihoodBegin, true, + lastLikelihoodBegin, true); + int currentLikelihoodValue = 100; + for (int i : likelihoodSubMap.keySet()) { + LikelihoodIndicator likelihood = likelihoodSubMap + .get(i); + String likelihoodCat = likelihood.getLikelihood(); + int likelihoodValue = likelihoodValueMap + .get(likelihoodCat); + if (likelihoodValue < 
currentLikelihoodValue) { + assignedLikelihood = likelihood; + currentLikelihoodValue = likelihoodValue; + } + } + } else { + LikelihoodIndicator likelihood = likelihoodMap + .get(firstLikelihoodBegin); + assignedLikelihood = likelihood; + } + } else { + assignedLikelihood = assertionIndicator; + } + + // get all events in the sentence and assign the corresponding + // likelihood indicator + if (conceptMap.ceilingKey(sentBegin) != null) { + int firstConceptBegin = conceptMap.ceilingKey(sentBegin); + if (firstConceptBegin > sentEnd) { + continue; + } else { + int lastConceptBegin = conceptMap.floorKey(sentEnd); + NavigableMap> conceptSubMap = conceptMap + .subMap(firstConceptBegin, true, lastConceptBegin, + true); + for (int i : conceptSubMap.keySet()) { + ArrayList conceptList = conceptSubMap + .get(i); + for (ConceptMention concept : conceptList) { + concept.setLikelihood(assignedLikelihood); + } + } + } + } + } + } + + @SuppressWarnings("rawtypes") + public void buildTreeMaps(JCas aJCas) { + FSIterator sentIt = aJCas.getAnnotationIndex(Sentence.type).iterator(); + FSIterator conceptIt = aJCas.getAnnotationIndex(conceptTypeTemplate.type) + .iterator(); + FSIterator likelihoodIt = aJCas.getAnnotationIndex( + LikelihoodIndicator.type).iterator(); + + sentMap = new TreeMap<>(); + while (sentIt.hasNext()) { + Sentence sent = (Sentence) sentIt.next(); + int sentBegin = sent.getBegin(); + int sentEnd = sent.getEnd(); + sentMap.put(sentBegin, sentEnd); + } + + conceptMap = new TreeMap<>(); + while (conceptIt.hasNext()) { + ConceptMention concept = (ConceptMention) conceptIt.next(); + int conceptBegin = concept.getBegin(); + if (conceptMap.containsKey(conceptBegin)) { + ArrayList conceptList = conceptMap + .get(conceptBegin); + conceptList.add(concept); + conceptMap.put(conceptBegin, conceptList); + } else { + ArrayList conceptList = new ArrayList<>(); + conceptList.add(concept); + conceptMap.put(conceptBegin, conceptList); + } + } + + likelihoodMap = new TreeMap<>(); + 
while (likelihoodIt.hasNext()) { + LikelihoodIndicator likelihood = (LikelihoodIndicator) likelihoodIt + .next(); + int likelihoodBegin = likelihood.getBegin(); + likelihoodMap.put(likelihoodBegin, likelihood); + } + } } diff --git a/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml b/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml index 14bc6f60a..2db5339a6 100644 --- a/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml +++ b/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml @@ -6,8 +6,23 @@ JCoRe Likelihood Assignment AE Analysis Engine to assign likelihood indicators to their corresponding entities and events. - 2.5.1-SNAPSHOT - + 2.6.0 + + + AssignmentStrategy + There are two available assignment strategies for likelihood indicators to ConceptMentions, 'all' and 'next-concept'. The first, 'all', assigns the lowest likelihood indicator in a sentence to all ConceptMention in this sentence. The second assigns a likelihood indicator only to the directly following ConceptMention in the same sentence. The latter strategy fares a bit better in evaluations carried out for the publication of this approach. Defaults to 'next-concept'." + String + false + false + + + ConceptTypeName + The qualified UIMA type name for the concept annotation for which likelihood assignment should be performed. Must be a subclass of de.julielab.jcore.types.ConceptMention. Defaults to de.julielab.jcore.types.ConceptMention. 
+ String + false + false + + diff --git a/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java b/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java index 5caf84f55..34861be6c 100644 --- a/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java +++ b/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java @@ -1,25 +1,21 @@ package de.julielab.jcore.ae.likelihoodassignment; -import de.julielab.jcore.types.ConceptMention; -import de.julielab.jcore.types.LikelihoodIndicator; -import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.*; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JFSIndexRepository; import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.InvalidXMLException; -import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.Iterator; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; /** @@ -71,10 +67,7 @@ public void initCas(JCas aJCas) { @Test @SuppressWarnings({ "rawtypes"}) public void testProcess() throws ResourceInitializationException, IOException, InvalidXMLException { - - XMLInputSource assignmentXML = null; - ResourceSpecifier assignmentSpec = null; - AnalysisEngine assignmentAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR); + AnalysisEngine assignmentAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR, 
LikelihoodAssignmentAnnotator.PARAM_ASSIGNMENT_STRATEGY, LikelihoodAssignmentAnnotator.STRATEGY_ALL); JCas aJCas = null; try { @@ -119,4 +112,76 @@ public String getPredictedAssignments(Iterator conceptIter) { return conceptLikelihood; } + + @Test + public void testAssignNextStrategy() throws Exception { + AnalysisEngine assignmentAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR, LikelihoodAssignmentAnnotator.PARAM_ASSIGNMENT_STRATEGY, LikelihoodAssignmentAnnotator.STRATEGY_NEXT_CONCEPT); + final JCas jCas = assignmentAnnotator.newJCas(); + jCas.setDocumentText("Our data suggest that it is highly probable that the interaction occurred, however not the other one."); + new Sentence(jCas, 0, jCas.getDocumentText().length()).addToIndexes(); + + LikelihoodIndicator suggest = new LikelihoodIndicator(jCas, 9, 16); + suggest.setLikelihood("moderate"); + suggest.addToIndexes(); + + LikelihoodIndicator highly = new LikelihoodIndicator(jCas, 28, 43); + highly.setLikelihood("high"); + highly.addToIndexes(); + + ConceptMention interaction = new ConceptMention(jCas, 53, 64); + interaction.addToIndexes(); + + LikelihoodIndicator not = new LikelihoodIndicator(jCas, 83, 86); + not.setLikelihood("negation"); + not.addToIndexes(); + + ConceptMention theOtherOne = new ConceptMention(jCas, 87, 100); + theOtherOne.addToIndexes(); + + assignmentAnnotator.process(jCas); + + assertEquals(highly, interaction.getLikelihood()); + assertEquals( not, theOtherOne.getLikelihood()); + } + + @Test + public void testAssignNextStrategySpecificConceptType() throws Exception { + // Here we test that the interaction type EventMention gets the likelihood assignment and not + // the entity argument because that is also a ConceptMention which gets assigned by default. 
+ AnalysisEngine assignmentAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR, + LikelihoodAssignmentAnnotator.PARAM_ASSIGNMENT_STRATEGY, LikelihoodAssignmentAnnotator.STRATEGY_NEXT_CONCEPT, + LikelihoodAssignmentAnnotator.PARAM_CONCEPT_TYPE_NAME, EventMention.class.getCanonicalName()); + final JCas jCas = assignmentAnnotator.newJCas(); + jCas.setDocumentText("Our data suggest one entity interacts with another but there is phosphorylation."); + new Sentence(jCas, 0, jCas.getDocumentText().length()).addToIndexes(); + + LikelihoodIndicator suggest = new LikelihoodIndicator(jCas, 9, 16); + suggest.setLikelihood("moderate"); + suggest.addToIndexes(); + + EntityMention oneEntity = new EntityMention(jCas, 17, 27); + oneEntity.addToIndexes(); + + EventMention interacts = new EventMention(jCas, 28, 37); + interacts.addToIndexes(); + // this is here to test that the assignment to same-offset annotations works + EventMention interacts2 = new EventMention(jCas, 28, 37); + interacts2.addToIndexes(); + + EntityMention another = new EntityMention(jCas, 43, 50); + another.addToIndexes(); + + EventMention phosphorylation = new EventMention(jCas, 64, 79); + phosphorylation.addToIndexes(); + + assignmentAnnotator.process(jCas); + + // only the EventMentions should be assigned likelihoods. 
+ assertEquals(null, oneEntity.getLikelihood()); + assertEquals( suggest, interacts.getLikelihood()); + assertEquals( suggest, interacts2.getLikelihood()); + assertEquals(null, another.getLikelihood()); + // due to the next-concept strategy, this mention should receive the default assertion likelihood + assertEquals("assertion", phosphorylation.getLikelihood().getLikelihood()); + } } diff --git a/jcore-likelihood-detection-ae/component.meta b/jcore-likelihood-detection-ae/component.meta index e58826719..068a3ab10 100644 --- a/jcore-likelihood-detection-ae/component.meta +++ b/jcore-likelihood-detection-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-likelihood-detection-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Likelihood Detection AE" } diff --git a/jcore-likelihood-detection-ae/pom.xml b/jcore-likelihood-detection-ae/pom.xml index c68a79a73..0fc7e7fff 100644 --- a/jcore-likelihood-detection-ae/pom.xml +++ b/jcore-likelihood-detection-ae/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -42,8 +42,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Likelihood Detection AE diff --git a/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml b/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml index 81e9c76f1..9e3a492f4 100644 --- a/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml +++ b/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml @@ -6,7 +6,7 @@ JCoRe Likelihood Detection AE Analysis Engine to detect epistemic modal expressions and assign the appropriate likelihood category. 
- 2.5.1-SNAPSHOT + 2.6.0 LikelihoodDict diff --git a/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java b/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java index 864b0c431..eee8b0d8e 100644 --- a/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java +++ b/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java @@ -5,21 +5,21 @@ import de.julielab.jcore.types.Token; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JFSIndexRepository; import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.InvalidXMLException; -import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.Iterator; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; /** @@ -68,9 +68,6 @@ public void initCas(JCas aJCas) { @Test @SuppressWarnings("rawtypes") public void testProcess() throws ResourceInitializationException, IOException, InvalidXMLException { - - XMLInputSource likelihoodXML = null; - ResourceSpecifier likelihoodSpec = null; AnalysisEngine likelihoodAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR); JCas aJCas = null; try { @@ -127,4 +124,22 @@ private ArrayList getPredictedIndicators(Iterator likelihoodIter) { prediction.add(predictedCategories); return prediction; } + + @Test + public void 
test() throws Exception { + String text = "Genome-wide expression analyses indicate that TAZ/YAP, TEADs, and TGFβ-induced signals coordinate a specific pro-tumorigenic transcriptional program"; + AnalysisEngine likelihoodAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR); + JCas aJCas = null; + try { + aJCas = likelihoodAnnotator.newJCas(); + } catch (ResourceInitializationException e) { + LOGGER.error("testProcess()", e); + } + likelihoodAnnotator.process(aJCas); + + final Collection select = JCasUtil.select(aJCas, LikelihoodIndicator.class); + for (var s : select) { + System.out.println(s.getCoveredText()); + } + } } diff --git a/jcore-line-multiplier/component.meta b/jcore-line-multiplier/component.meta index 432aa6b6a..38394f9cd 100644 --- a/jcore-line-multiplier/component.meta +++ b/jcore-line-multiplier/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-line-multiplier", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Line Multiplier" } diff --git a/jcore-line-multiplier/pom.xml b/jcore-line-multiplier/pom.xml index 12aa067d8..650c68038 100644 --- a/jcore-line-multiplier/pom.xml +++ b/jcore-line-multiplier/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -29,8 +29,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine org.assertj diff --git a/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml b/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml index 69ff063cd..f58d9d2ed 100644 --- a/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml +++ b/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml @@ -6,7 +6,7 @@ JCoRe Line Multiplier Splits incoming CAS document texts on line breaks and returns one CAS 
for each non-blank line. - 2.5.1-SNAPSHOT + 2.6.0 NumberLinesPerCAS diff --git a/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java b/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java index 23b7e9ea3..5ecd2c19a 100644 --- a/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java +++ b/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java @@ -5,13 +5,13 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Unit tests for jcore-line-multiplier. */ diff --git a/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml b/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml index 69ff063cd..f58d9d2ed 100644 --- a/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml +++ b/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml @@ -6,7 +6,7 @@ JCoRe Line Multiplier Splits incoming CAS document texts on line breaks and returns one CAS for each non-blank line. 
- 2.5.1-SNAPSHOT + 2.6.0 NumberLinesPerCAS diff --git a/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class b/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class index e654ed056..f32ad510b 100644 Binary files a/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class and b/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class differ diff --git a/jcore-lingpipe-porterstemmer-ae/component.meta b/jcore-lingpipe-porterstemmer-ae/component.meta index f0adaa9a1..843a38e95 100644 --- a/jcore-lingpipe-porterstemmer-ae/component.meta +++ b/jcore-lingpipe-porterstemmer-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-lingpipe-porterstemmer-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Lingpipe Porter Stemmer AE" } diff --git a/jcore-lingpipe-porterstemmer-ae/pom.xml b/jcore-lingpipe-porterstemmer-ae/pom.xml index 6a10f10c5..6cd1f56ca 100644 --- a/jcore-lingpipe-porterstemmer-ae/pom.xml +++ b/jcore-lingpipe-porterstemmer-ae/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-lingpipe-porterstemmer-ae JCoRe Lingpipe Porter Stemmer AE @@ -22,8 +22,8 @@ 4.1.2-JL1.0 - junit - junit + org.junit.jupiter + junit-jupiter-engine https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipe-porterstemmer-ae diff --git a/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml b/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml index b959cf460..c432b936e 100644 --- a/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml +++ 
b/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml @@ -5,7 +5,7 @@ JCoRe Lingpipe Porterstemmer AE Adds a StemmedForm to each token in the CAS. The offsets and the value feature of each StemmedForm are set to the stem as returned by the Porter stemmer algorithm as implemented by Lingpipe. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab, Germany diff --git a/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java b/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java index 58eb08a15..5bc2d85dd 100644 --- a/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java +++ b/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java @@ -16,10 +16,10 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class LingpipePorterstemmerAnnotatorTest { @Test diff --git a/jcore-lingpipegazetteer-ae/LICENSE b/jcore-lingpipegazetteer-ae/LICENSE index be3f7b28e..f57182ac3 100644 --- a/jcore-lingpipegazetteer-ae/LICENSE +++ b/jcore-lingpipegazetteer-ae/LICENSE @@ -1,661 +1,73 @@ - GNU AFFERO GENERAL PUBLIC LICENSE - Version 3, 19 November 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. 
- - Preamble - - The GNU Affero General Public License is a free, copyleft license for -software and other kinds of works, specifically designed to ensure -cooperation with the community in the case of network server software. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -our General Public Licenses are intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - Developers that use our General Public Licenses protect your rights -with two steps: (1) assert copyright on the software, and (2) offer -you this License which gives you legal permission to copy, distribute -and/or modify the software. - - A secondary benefit of defending all users' freedom is that -improvements made in alternate versions of the program, if they -receive widespread use, become available for other developers to -incorporate. Many developers of free software are heartened and -encouraged by the resulting cooperation. However, in the case of -software used on network servers, this result may fail to come about. -The GNU General Public License permits making a modified version and -letting the public access it on a server without ever releasing its -source code to the public. - - The GNU Affero General Public License is designed specifically to -ensure that, in such cases, the modified source code becomes available -to the community. 
It requires the operator of a network server to -provide the source code of the modified version running there to the -users of that server. Therefore, public use of a modified version, on -a publicly accessible server, gives the public access to the source -code of the modified version. - - An older license, called the Affero General Public License and -published by Affero, was designed to accomplish similar goals. This is -a different license, not a version of the Affero GPL, but Affero has -released a new version of the Affero GPL which permits relicensing under -this license. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU Affero General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. 
Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. 
- - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. 
Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. 
- - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. 
- - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. 
If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. 
- - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. 
- - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the 
material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. 
- - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. 
If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. 
- - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Remote Network Interaction; Use with the GNU General Public License. 
- - Notwithstanding any other provision of this License, if you modify the -Program, your modified version must prominently offer all users -interacting with it remotely through a computer network (if your version -supports such interaction) an opportunity to receive the Corresponding -Source of your version by providing access to the Corresponding Source -from a network server at no charge, through some standard or customary -means of facilitating copying of software. This Corresponding Source -shall include the Corresponding Source for any work covered by version 3 -of the GNU General Public License that is incorporated pursuant to the -following paragraph. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the work with which it is combined will remain governed by version -3 of the GNU General Public License. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU Affero General Public License from time to time. Such new versions -will be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU Affero General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU Affero General Public License, you may choose any version ever published -by the Free Software Foundation. 
- - If the Program specifies that a proxy can decide which future -versions of the GNU Affero General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. 
- - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If your software can interact with users remotely through a computer -network, you should also make sure that it provides a way for users to -get its source. For example, if your program is a web application, its -interface could display a "Source" link that leads users to an archive -of the code. 
There are many ways you could offer source, and different -solutions will be better for different programs; see section 13 for the -specific requirements. - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU AGPL, see -. +Alias-i ROYALTY FREE LICENSE VERSION 1 + +Copyright c 2003-2007 Alias-i, Inc +All Rights Reserved + +1. This Alias-i Royalty Free License Version 1 ("License") governs + the copying, modifying, and distributing of the computer program or + work containing a notice stating that it is subject to the terms of + this License and any derivative works of that computer program or + work. The computer program or work and any derivative works thereof + are the "Software." Your copying, modifying, or distributing of the + Software constitutes acceptance of this License. Although you are not + required to accept this License, since you have not signed it, nothing + else grants you permission to copy, modify, or distribute the + Software. If you wish to receive a license from Alias-i under + different terms than those contained in this License, please contact + Alias-i. Otherwise, if you do not accept this License, any copying, + modifying, or distributing of the Software is strictly prohibited by + law. + +2. You may copy or modify the Software or use any output of the + Software (i) for internal non-production trial, testing and evaluation + of the Software, or (ii) in connection with any product or service you + provide to third parties for free. Copying or modifying the Software + includes the acts of "installing", "running", "using", "accessing" or + "deploying" the Software as those terms are understood in the software + industry. Therefore, those activities are only permitted under this + License in the ways that copying or modifying are permitted. + +3. 
You may distribute the Software, provided that you: (i) distribute + the Software only under the terms of this License, no more, no less; + (ii) include a copy of this License along with any such distribution; + (iii) include the complete corresponding machine-readable source code + of the Software you are distributing; (iv) do not remove any copyright + or other notices from the Software; and, (v) cause any files of the + Software that you modified to carry prominent notices stating that you + changed the Software and the date of any change so that recipients + know that they are not receiving the original Software. + +4. Whether you distribute the Software or not, if you distribute any + computer program that is not the Software, but that (a) is distributed + in connection with the Software or contains any part of the Software, + (b) causes the Software to be copied or modified (i.e., ran, used, or + executed), such as through an API call, or (c) uses any output of the + Software, then you must distribute that other computer program under a + license defined as a Free Software License by the Free Software + Foundation or an Approved Open Source License by the Open Source + Initiative. + +5. You may not copy, modify, or distribute the Software except as + expressly provided under this License, unless you receive a different + written license from Alias-i to do so. Any attempt otherwise to copy, + modify, or distribute the Software is without Alias-i's permission, is + void, and will automatically terminate your rights under this License. + Your rights under this License may only be reinstated by a signed + writing from Alias-i. + +THE SOFTWARE IS PROVIDED "AS IS." TO THE MAXIMUM EXTENT PERMITTED BY +APPLICABLE LAW, ALIAS-i DOES NOT MAKE, AND HEREBY EXPRESSLY DISCLAIMS, +ANY WARRANTIES, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, CONCERNING +THE SOFTWARE OR ANY SUBJECT MATTER OF THIS LICENSE. 
SPECIFICALLY, BUT +WITHOUT LIMITING THE FOREGOING, LICENSOR MAKES NO EXPRESS OR IMPLIED +WARRANTY OF MERCHANTABILITY, FITNESS (FOR A PARTICULAR PURPOSE OR +OTHERWISE), QUALITY, USEFULNESS, TITLE, OR NON-INFRINGEMENT. TO THE +MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL LICENSOR +BE LIABLE TO YOU OR ANY THIRD PARTY FOR ANY DAMAGES OR IN RESPECT OF +ANY CLAIM UNDER ANY TORT, CONTRACT, STRICT LIABILITY, NEGLIGENCE OR +OTHER THEORY FOR ANY DIRECT, INDIRECT, INCIDENTAL, CONSEQUENTIAL, +PUNITIVE, SPECIAL OR EXEMPLARY DAMAGES, EVEN IF IT HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES, OR FOR ANY AMOUNTS IN EXCESS OF THE +AMOUNT YOU PAID ALIAS-i FOR THIS LICENSE. YOU MUST PASS THIS ENTIRE +LICENSE, INCLUDING SPECIFICALLY THIS DISCLAIMER AND LIMITATION OF +LIABILITY, ON WHENEVER YOU DISTRIBUTE THE SOFTWARE. diff --git a/jcore-lingpipegazetteer-ae/component.meta b/jcore-lingpipegazetteer-ae/component.meta index 0a77648a3..6b1d1c0bf 100644 --- a/jcore-lingpipegazetteer-ae/component.meta +++ b/jcore-lingpipegazetteer-ae/component.meta @@ -18,7 +18,7 @@ "maven-artifact": { "artifactId": "jcore-lingpipe-gazetteer-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Lingpipe Gazetteer AE" } diff --git a/jcore-lingpipegazetteer-ae/pom.xml b/jcore-lingpipegazetteer-ae/pom.xml index 1d39efcf8..3046249fb 100644 --- a/jcore-lingpipegazetteer-ae/pom.xml +++ b/jcore-lingpipegazetteer-ae/pom.xml @@ -1,68 +1,76 @@ - + - 4.0.0 - jcore-lingpipe-gazetteer-ae - jar - JCoRe Lingpipe Gazetteer AE - Basically used as NE tagger based on Lingpipe's dictionary-lookup tagger. + 4.0.0 + jcore-lingpipe-gazetteer-ae + jar + JCoRe Lingpipe Gazetteer AE + Basically used as NE tagger based on Lingpipe's dictionary-lookup tagger. 
- - de.julielab - jcore-base - 2.5.1-SNAPSHOT - + + de.julielab + jcore-base + 2.6.0 + - - - de.julielab - jcore-descriptor-creator - + + + de.julielab + jcore-descriptor-creator + de.julielab jcore-types ${jcore-types-version} - - org.slf4j - slf4j-api - - - de.julielab - jcore-utilities - ${jcore-utilities-version} - - - ch.qos.logback - logback-classic - provided - - - com.ibm.icu - icu4j - 4.8.1.1 - - - de.julielab - aliasi-lingpipe - 4.1.2-JL1.0 - - - org.apache.commons - commons-lang3 - 3.4 - - junitjunit - - JULIE Lab, Germany - http://www.julielab.de - - + + org.slf4j + slf4j-api + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + ch.qos.logback + logback-classic + provided + + + com.ibm.icu + icu4j + 4.8.1.1 + + + de.julielab + aliasi-lingpipe + 4.1.2-JL1.0 + + + org.apache.commons + commons-lang3 + + + org.assertj + assertj-core + + + org.junit.jupiter + junit-jupiter-engine + + + + JULIE Lab, Germany + http://www.julielab.de + + GNU Affero General Public License, Version 3.0 http://www.gnu.org/licenses/agpl-3.0.en.html - https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipe-gazetteer-ae - + https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipe-gazetteer-ae + diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java index 0395da7c8..0e43d4cd4 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java @@ -13,6 +13,8 @@ public interface ChunkerProvider { public boolean getUseApproximateMatching(); public boolean getNormalize(); + + public boolean getNormalizePlural(); public boolean getTransliterate(); diff --git 
a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java index dc5613755..06171ed03 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java @@ -428,6 +428,11 @@ public boolean getNormalize() { return false; } + @Override + public boolean getNormalizePlural() { + return false; + } + @Override public boolean getTransliterate() { return false; diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java index 7e3daa924..175653bf5 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java @@ -42,6 +42,12 @@ public class ChunkerProviderImplAlt implements ChunkerProvider, SharedResourceOb * switched on in the descriptor for the annotator itself! */ public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; + /** + * Only in effect when {@link #PARAM_NORMALIZE_TEXT} is set to true. If so, will normalize plurals + * found in the text by removing the trailing 's'. Requires annotations of the type {@link de.julielab.jcore.types.PennBioIEPOSTag} + * to be present in the CAS. + */ + public static final String PARAM_NORMALIZE_PLURAL = "NormalizePlural"; /** * Parameter to indicate whether text - dictionary entries for this class - should be transliterated, i.e. whether * accents and other character variations should be stripped. 
If this is switched on here, it must also be switched @@ -54,6 +60,7 @@ public class ChunkerProviderImplAlt implements ChunkerProvider, SharedResourceOb private boolean useApproximateMatching; private boolean transliterate; private boolean normalize; + private boolean normalizePlural; private InputStream dictFile; private InputStream stopFile; @@ -71,6 +78,10 @@ public Chunker getChunker() { return dictChunker; } + public boolean getNormalizePlural() { + return normalizePlural; + } + public void load(DataResource resource) throws ResourceInitializationException { LOGGER.info("Loading configuration file from URI \"{}\" (URL: \"{}\").", resource.getUri(), resource.getUrl()); Properties properties = new Properties(); @@ -118,7 +129,11 @@ public void load(DataResource resource) throws ResourceInitializationException { normalize = false; if (normalizeString != null) normalize = new Boolean(normalizeString); - LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", normalize); + LOGGER.info("Normalize dictionary entries and text (i.e. 
completely strip dashes, parenthesis etc): {}", normalize); + + normalizePlural = Boolean.parseBoolean(properties.getProperty(PARAM_NORMALIZE_PLURAL, "false")) && normalize; + if (normalize) + LOGGER.info("Also normalize plural forms to singular: {}", normalizePlural); String transliterateString = properties.getProperty(PARAM_TRANSLITERATE_TEXT); transliterate = false; @@ -256,14 +271,14 @@ private void readDictionary(InputStream dictFileStream) throws IOException, Anal bf = new BufferedReader(new InputStreamReader(dictFileStream)); String line = ""; - Transliterator transliterator = null; - if (transliterate) - transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower"); + Transliterator transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC"); TokenizerFactory tokenizerFactory = null; if (normalize) tokenizerFactory = new IndoEuropeanTokenizerFactory(); while ((line = bf.readLine()) != null) { + if (line.startsWith("#")) + continue; String[] values = line.split("\t"); if (values.length != 2) { LOGGER.error("readDictionary() - wrong format of line: " + line); @@ -276,11 +291,11 @@ private void readDictionary(InputStream dictFileStream) throws IOException, Anal continue; if (normalize) { - term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory).string; + term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory, transliterator).string; } if (transliterate) term = transliterator.transform(term); - if (useApproximateMatching && !caseSensitive && !transliterate) + if (useApproximateMatching && !caseSensitive) term = term.toLowerCase(); String label = values[1].trim(); diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java index f0ae88711..f319562bd 100644 --- 
a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java @@ -1,4 +1,3 @@ - package de.julielab.jcore.ae.lingpipegazetteer.chunking; import com.aliasi.chunk.Chunker; @@ -21,6 +20,7 @@ import java.io.*; import java.net.URI; import java.util.HashSet; +import java.util.Optional; import java.util.Set; import java.util.zip.GZIPInputStream; @@ -29,317 +29,333 @@ * Also, this implementation expects a configurableDataResourceSpecifier for the external resource, * specifying the dictionary directly and providing the parameters via the normal UIMA resource meta data * mechanism. - * + * * @author faessler - * */ public class ConfigurableChunkerProviderImplAlt implements ChunkerProvider, SharedResourceObject { - private static final Logger LOGGER = LoggerFactory.getLogger(ConfigurableChunkerProviderImplAlt.class); - public final static String PARAM_USE_APPROXIMATE_MATCHING = "UseApproximateMatching"; - public final static String PARAM_CASE_SENSITIVE = "CaseSensitive"; - public final static String PARAM_MAKE_VARIANTS = "MakeVariants"; - public final static String PARAM_STOPWORD_FILE = "StopWordFile"; - /** - * Parameter to indicate whether text - dictionary entries for this class - should be normalized by completely - * removing dashes, parenthesis, genitive 's and perhaps more. This is meant to replace the generation of term - * variants and cannot be used together with variation generation. If this is switched on here, it must also be - * switched on in the descriptor for the annotator itself! - */ - public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; - /** - * Parameter to indicate whether text - dictionary entries for this class - should be transliterated, i.e. whether - * accents and other character variations should be stripped. 
If this is switched on here, it must also be switched - * on in the descriptor of the annotator itself! - */ - public final static String PARAM_TRANSLITERATE_TEXT = "TransliterateText"; - - private Boolean generateVariants; - private Boolean caseSensitive; - private Boolean useApproximateMatching; - private Boolean transliterate; - private Boolean normalize; - private InputStream dictFile; - private InputStream stopFile; - - private AbstractDictionary dict; - private Chunker dictChunker = null; - private final double CHUNK_SCORE = 1.0; - - private final int MIN_TERM_LENGTH = 3; - private final double APPROX_MATCH_THRESHOLD_SCORE = 100; - private Set stopWords = new HashSet(); - private String stopwordFilePath; + public final static String PARAM_USE_APPROXIMATE_MATCHING = "UseApproximateMatching"; + public final static String PARAM_CASE_SENSITIVE = "CaseSensitive"; + public final static String PARAM_MAKE_VARIANTS = "MakeVariants"; + public final static String PARAM_STOPWORD_FILE = "StopWordFile"; + /** + * Parameter to indicate whether text - dictionary entries for this class - should be normalized by completely + * removing dashes, parenthesis, genitive 's and perhaps more. This is meant to replace the generation of term + * variants and cannot be used together with variation generation. If this is switched on here, it must also be + * switched on in the descriptor for the annotator itself! + */ + public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; + /** + * Only in effect when {@link #PARAM_NORMALIZE_TEXT} is set to true. If so, will normalize plurals + * found in the text by removing the training 's'. Requires annotations of the type {@link de.julielab.jcore.types.PennBioIEPOSTag} + * to be present in the CAS. + */ + public static final String PARAM_NORMALIZE_PLURAL = "NormalizePlural"; + /** + * Parameter to indicate whether text - dictionary entries for this class - should be transliterated, i.e. 
whether + * accents and other character variations should be stripped. If this is switched on here, it must also be switched + * on in the descriptor of the annotator itself! + */ + public final static String PARAM_TRANSLITERATE_TEXT = "TransliterateText"; + private static final Logger LOGGER = LoggerFactory.getLogger(ConfigurableChunkerProviderImplAlt.class); + private final double CHUNK_SCORE = 1.0; + private final int MIN_TERM_LENGTH = 3; + private final double APPROX_MATCH_THRESHOLD_SCORE = 100; + private Boolean generateVariants; + private Boolean caseSensitive; + private Boolean useApproximateMatching; + private Boolean transliterate; + private Boolean normalize; + private Boolean normalizePlural; + private InputStream dictFile; + private InputStream stopFile; + private AbstractDictionary dict; + private Chunker dictChunker = null; + private Set stopWords = new HashSet(); + private String stopwordFilePath; private URI resourceUri; public Chunker getChunker() { - return dictChunker; - } + return dictChunker; + } - public void load(DataResource resource) throws ResourceInitializationException { + public void load(DataResource resource) throws ResourceInitializationException { resourceUri = resource.getUri(); LOGGER.info("Creating dictionary chunker with dictionary loaded from " + resourceUri); - ConfigurationParameterSettings settings = resource.getMetaData().getConfigurationParameterSettings(); - stopwordFilePath = (String) settings.getParameterValue(PARAM_STOPWORD_FILE); - if (stopwordFilePath == null) - throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, - new Object[] { PARAM_STOPWORD_FILE }); + ConfigurationParameterSettings settings = resource.getMetaData().getConfigurationParameterSettings(); + stopwordFilePath = (String) settings.getParameterValue(PARAM_STOPWORD_FILE); + if (stopwordFilePath == null) + throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, + new 
Object[]{PARAM_STOPWORD_FILE}); - generateVariants = (Boolean) settings.getParameterValue(PARAM_MAKE_VARIANTS); - LOGGER.info("Generate variants: {}", generateVariants); + generateVariants = (Boolean) settings.getParameterValue(PARAM_MAKE_VARIANTS); + LOGGER.info("Generate variants: {}", generateVariants); - normalize = (Boolean) settings.getParameterValue(PARAM_NORMALIZE_TEXT); - LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", normalize); + normalize = (Boolean) settings.getParameterValue(PARAM_NORMALIZE_TEXT); + LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", normalize); + normalizePlural = Optional.ofNullable((Boolean) settings.getParameterValue(PARAM_NORMALIZE_PLURAL)).orElse(false) && normalize; + if (normalize) + LOGGER.info("Also normalize plural forms to singular: {}", normalizePlural); - transliterate = (Boolean) settings.getParameterValue(PARAM_TRANSLITERATE_TEXT); - LOGGER.info("Transliterate dictionary entries (i.e. transform accented characters to their base forms): {}", - transliterate); + transliterate = (Boolean) settings.getParameterValue(PARAM_TRANSLITERATE_TEXT); + LOGGER.info("Transliterate dictionary entries (i.e. 
transform accented characters to their base forms): {}", + transliterate); - caseSensitive = (Boolean) settings.getParameterValue(PARAM_CASE_SENSITIVE); - LOGGER.info("Case sensitive: {}", caseSensitive); + caseSensitive = (Boolean) settings.getParameterValue(PARAM_CASE_SENSITIVE); + LOGGER.info("Case sensitive: {}", caseSensitive); - useApproximateMatching = (Boolean) settings.getParameterValue(PARAM_USE_APPROXIMATE_MATCHING); - LOGGER.info("Use approximate matching: {}", useApproximateMatching); + useApproximateMatching = (Boolean) settings.getParameterValue(PARAM_USE_APPROXIMATE_MATCHING); + LOGGER.info("Use approximate matching: {}", useApproximateMatching); - if (normalize && generateVariants) - throw new ResourceInitializationException( - new IllegalStateException( - "MakeVariants and NormalizeText are both activated which is invalid. The two options work towards the same goal in two different ways, i.e. to recognize dictionary entry variants not given explicitly. However, the approaches are not compatible and you have to choose a single one.")); + if (normalize && generateVariants) + throw new ResourceInitializationException( + new IllegalStateException( + "MakeVariants and NormalizeText are both activated which is invalid. The two options work towards the same goal in two different ways, i.e. to recognize dictionary entry variants not given explicitly. 
However, the approaches are not compatible and you have to choose a single one.")); - try { + try { try { dictFile = UriUtilities.getInputStreamFromUri(resource.getUri()); } catch (Exception e) { LOGGER.error("Could not load the dictionary from {}, see the following exception for details.", resource.getUri()); throw e; } - stopFile = readStreamFromFileSystemOrClassPath(stopwordFilePath); - initStopWords(stopFile); - readDictionary(dictFile); - - LOGGER.info("Now creating chunker."); - long time = System.currentTimeMillis(); - if (useApproximateMatching) { - final Set charsToDelete = new HashSet<>(); - charsToDelete.add('-'); - // charsToDelete.add('+'); - // charsToDelete.add(','); - // charsToDelete.add('.'); - // charsToDelete.add(':'); - // charsToDelete.add(';'); - // charsToDelete.add('?'); - // charsToDelete.add('!'); - // charsToDelete.add('*'); - // charsToDelete.add('§'); - // charsToDelete.add('$'); - // charsToDelete.add('%'); - // charsToDelete.add('&'); - // charsToDelete.add('/'); - // charsToDelete.add('\\'); - // charsToDelete.add('('); - // charsToDelete.add(')'); - // charsToDelete.add('<'); - // charsToDelete.add('>'); - // charsToDelete.add('['); - // charsToDelete.add(']'); - // charsToDelete.add('='); - // charsToDelete.add('\''); - // charsToDelete.add('`'); - // charsToDelete.add('´'); - // charsToDelete.add('"'); - // charsToDelete.add('#'); - - WeightedEditDistance editDistance = ApproxDictionaryChunker.TT_DISTANCE; - editDistance = new WeightedEditDistance() { - - @Override - public double deleteWeight(char cDeleted) { - double ret; - if (cDeleted == '-') - ret = -5.0; - else if (cDeleted == ' ' || charsToDelete.contains(cDeleted)) - ret = -10.0; - else - ret = -110.0; - return ret; - } - - @Override - public double insertWeight(char cInserted) { - return deleteWeight(cInserted); - } - - @Override - public double matchWeight(char cMatched) { - return 0.0; - } - - @Override - public double substituteWeight(char cDeleted, char cInserted) { 
- if (cDeleted == ' ' && cInserted == '-') - return -2.0; - if (cDeleted == '-' && cInserted == ' ') - return -2.0; - if (cDeleted == ' ' && charsToDelete.contains(cInserted)) - return -10.0; - if (charsToDelete.contains(cDeleted) && cInserted == ' ') - return -10.0; - return -110.0; - } - - @Override - public double transposeWeight(char c1, char c2) { - return Double.NEGATIVE_INFINITY; - } - }; - - dictChunker = - new ApproxDictionaryChunker((TrieDictionary) dict, - IndoEuropeanTokenizerFactory.INSTANCE, editDistance, APPROX_MATCH_THRESHOLD_SCORE); - } else { - dictChunker = - new ExactDictionaryChunker(dict, IndoEuropeanTokenizerFactory.INSTANCE, false, caseSensitive); - } - time = System.currentTimeMillis() - time; - LOGGER.info("Building the actual chunker from the dictionary took {}ms ({}s).", time, time / 1000); - - } catch (Exception e) { - LOGGER.error("Exception while creating chunker instance from dictionary file {} with stopwords from {}", resource.getUri(), stopwordFilePath, e); - } - } - - private void readDictionary(InputStream dictFileStream) throws IOException, AnalysisEngineProcessException { - long time = System.currentTimeMillis(); - if (useApproximateMatching) { - dict = new TrieDictionary(); - } else { - dict = new MapDictionary(); - } - // now read from file and add entries - LOGGER.info("readDictionary() - adding entries from " + resourceUri.toString() + " to dictionary..."); - BufferedReader bf = null; - try { - bf = new BufferedReader(new InputStreamReader(dictFileStream)); - String line = ""; - - Transliterator transliterator = null; - if (transliterate) - transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower"); - - TokenizerFactory tokenizerFactory = null; - if (normalize) - tokenizerFactory = new IndoEuropeanTokenizerFactory(); - while ((line = bf.readLine()) != null) { - String[] values = line.split("\t"); - if (values.length != 2) { - LOGGER.error("readDictionary() - wrong format of line: " + line); - 
throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, null); - } - - String term = values[0].trim(); - - if (stopWords.contains(term.toLowerCase())) - continue; - - if (normalize) { - term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory).string; - } - if (transliterate) - term = transliterator.transform(term); - if (useApproximateMatching && !caseSensitive && !transliterate) - term = term.toLowerCase(); - - String label = values[1].trim(); - if (term.length() < MIN_TERM_LENGTH) - continue; - - if (generateVariants) { - if (true) - throw new NotImplementedException( - "In this alternative ChunkerProvider, generating variants will currently fail to adequately filter out stop words due to the transliteration and/or normalization algorithms. If you don't need those algorithms, just stick to the original ChunkerProviderImpl. Otherwise, this issue must be fixed (shouldnt be too difficult). Variants are also currently not treated with normalization/transliteration (but this is deemed to be two alternative ways to achieve a similar thing anyway)"); - } else { - // This is a second stop-word-check but here the term has been transliterated and/or normalized. If - // somehow the result of this was a stop word, ignore it. 
- if (!stopWords.contains(term.toLowerCase())) - dict.addEntry(new DictionaryEntry(term, label, CHUNK_SCORE)); - } - } - - time = System.currentTimeMillis() - time; - LOGGER.info("Reading dictionary took {}ms ({}s)", time, time / 1000); - } finally { - if (null != bf) - bf.close(); - } - } - - private void initStopWords(InputStream stopFileStream) throws IOException { - stopWords = new HashSet(); - - LOGGER.info("readDictionary() - adding entries from " + stopwordFilePath + " to dictionary..."); - BufferedReader bf = new BufferedReader(new InputStreamReader(stopFileStream)); - String line = ""; - - try { - while ((line = bf.readLine()) != null) { - if (line.startsWith("#")) { - continue; - } - stopWords.add(line.trim().toLowerCase()); - } - bf.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - @Override - public Set getStopWords() { - return stopWords; - } - - @Override - public boolean getUseApproximateMatching() { - return useApproximateMatching; - } - - @Override - public boolean getNormalize() { - return normalize; - } - - @Override - public boolean getTransliterate() { - return transliterate; - } - - @Override - public boolean getCaseSensitive() { - return caseSensitive; - - } - - private InputStream readStreamFromFileSystemOrClassPath(String filePath) { - InputStream is = null; - File file = new File(filePath); - if (file.exists()) { - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - } else { - is = getClass().getResourceAsStream(filePath.startsWith("/") ? 
filePath : "/" + filePath); - } - if (filePath.endsWith(".gz") || filePath.endsWith(".gzip")) - try { - is = new GZIPInputStream(is); - } catch (IOException e) { - e.printStackTrace(); - } - return is; - } + stopFile = readStreamFromFileSystemOrClassPath(stopwordFilePath); + initStopWords(stopFile); + readDictionary(dictFile); + + LOGGER.info("Now creating chunker."); + long time = System.currentTimeMillis(); + if (useApproximateMatching) { + final Set charsToDelete = new HashSet<>(); + charsToDelete.add('-'); + // charsToDelete.add('+'); + // charsToDelete.add(','); + // charsToDelete.add('.'); + // charsToDelete.add(':'); + // charsToDelete.add(';'); + // charsToDelete.add('?'); + // charsToDelete.add('!'); + // charsToDelete.add('*'); + // charsToDelete.add('§'); + // charsToDelete.add('$'); + // charsToDelete.add('%'); + // charsToDelete.add('&'); + // charsToDelete.add('/'); + // charsToDelete.add('\\'); + // charsToDelete.add('('); + // charsToDelete.add(')'); + // charsToDelete.add('<'); + // charsToDelete.add('>'); + // charsToDelete.add('['); + // charsToDelete.add(']'); + // charsToDelete.add('='); + // charsToDelete.add('\''); + // charsToDelete.add('`'); + // charsToDelete.add('´'); + // charsToDelete.add('"'); + // charsToDelete.add('#'); + + WeightedEditDistance editDistance = ApproxDictionaryChunker.TT_DISTANCE; + editDistance = new WeightedEditDistance() { + + @Override + public double deleteWeight(char cDeleted) { + double ret; + if (cDeleted == '-') + ret = -5.0; + else if (cDeleted == ' ' || charsToDelete.contains(cDeleted)) + ret = -10.0; + else + ret = -110.0; + return ret; + } + + @Override + public double insertWeight(char cInserted) { + return deleteWeight(cInserted); + } + + @Override + public double matchWeight(char cMatched) { + return 0.0; + } + + @Override + public double substituteWeight(char cDeleted, char cInserted) { + if (cDeleted == ' ' && cInserted == '-') + return -2.0; + if (cDeleted == '-' && cInserted == ' ') + return -2.0; + 
if (cDeleted == ' ' && charsToDelete.contains(cInserted)) + return -10.0; + if (charsToDelete.contains(cDeleted) && cInserted == ' ') + return -10.0; + return -110.0; + } + + @Override + public double transposeWeight(char c1, char c2) { + return Double.NEGATIVE_INFINITY; + } + }; + + dictChunker = + new ApproxDictionaryChunker((TrieDictionary) dict, + IndoEuropeanTokenizerFactory.INSTANCE, editDistance, APPROX_MATCH_THRESHOLD_SCORE); + } else { + dictChunker = + new ExactDictionaryChunker(dict, IndoEuropeanTokenizerFactory.INSTANCE, false, caseSensitive); + } + time = System.currentTimeMillis() - time; + LOGGER.info("Building the actual chunker from the dictionary took {}ms ({}s).", time, time / 1000); + + } catch (Exception e) { + LOGGER.error("Exception while creating chunker instance from dictionary file {} with stopwords from {}", resource.getUri(), stopwordFilePath, e); + } + } + + private void readDictionary(InputStream dictFileStream) throws IOException, AnalysisEngineProcessException { + long time = System.currentTimeMillis(); + if (useApproximateMatching) { + dict = new TrieDictionary(); + } else { + dict = new MapDictionary(); + } + // now read from file and add entries + LOGGER.info("readDictionary() - adding entries from " + resourceUri.toString() + " to dictionary..."); + BufferedReader bf = null; + try { + bf = new BufferedReader(new InputStreamReader(dictFileStream)); + String line = ""; + + Transliterator transliterator = null; +// transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower"); + transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC"); + + TokenizerFactory tokenizerFactory = null; + if (normalize) + tokenizerFactory = new IndoEuropeanTokenizerFactory(); + while ((line = bf.readLine()) != null) { + if (line.startsWith("#")) + continue; + String[] values = line.split("\t"); + if (values.length != 2) { + LOGGER.error("readDictionary() - wrong format of line: " + line); + throw new 
AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, null); + } + + String term = values[0].trim(); + + if (stopWords.contains(term.toLowerCase())) + continue; + + if (normalize) { + term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory, transliterator).string; + } + if (transliterate) + term = transliterator.transform(term); + // the exact matcher takes the caseSensitive switch as a parameter, we don't need to do it ourselves + if (useApproximateMatching && !caseSensitive) + term = term.toLowerCase(); + + String label = values[1].trim(); + if (term.length() < MIN_TERM_LENGTH) + continue; + + if (generateVariants) { + if (true) + throw new NotImplementedException( + "In this alternative ChunkerProvider, generating variants will currently fail to adequately filter out stop words due to the transliteration and/or normalization algorithms. If you don't need those algorithms, just stick to the original ChunkerProviderImpl. Otherwise, this issue must be fixed (shouldnt be too difficult). Variants are also currently not treated with normalization/transliteration (but this is deemed to be two alternative ways to achieve a similar thing anyway)"); + } else { + // This is a second stop-word-check but here the term has been transliterated and/or normalized. If + // somehow the result of this was a stop word, ignore it. 
+ if (!stopWords.contains(term.toLowerCase())) + dict.addEntry(new DictionaryEntry(term, label, CHUNK_SCORE)); + } + } + + time = System.currentTimeMillis() - time; + LOGGER.info("Reading dictionary took {}ms ({}s)", time, time / 1000); + } finally { + if (null != bf) + bf.close(); + } + } + + private void initStopWords(InputStream stopFileStream) throws IOException { + stopWords = new HashSet(); + + LOGGER.info("readDictionary() - adding entries from " + stopwordFilePath + " to dictionary..."); + BufferedReader bf = new BufferedReader(new InputStreamReader(stopFileStream)); + String line = ""; + + try { + while ((line = bf.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + stopWords.add(line.trim().toLowerCase()); + } + bf.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Override + public Set getStopWords() { + return stopWords; + } + + @Override + public boolean getUseApproximateMatching() { + return useApproximateMatching; + } + + @Override + public boolean getNormalize() { + return normalize; + } + + @Override + public boolean getNormalizePlural() { + return normalizePlural; + } + + @Override + public boolean getTransliterate() { + return transliterate; + } + + @Override + public boolean getCaseSensitive() { + return caseSensitive; + + } + + private InputStream readStreamFromFileSystemOrClassPath(String filePath) throws FileNotFoundException { + InputStream is = null; + File file = new File(filePath); + if (file.exists()) { + try { + is = new FileInputStream(file); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + } else { + is = getClass().getResourceAsStream(filePath.startsWith("/") ? 
filePath : "/" + filePath); + } + if (filePath.endsWith(".gz") || filePath.endsWith(".gzip")) + try { + is = new GZIPInputStream(is); + } catch (IOException e) { + e.printStackTrace(); + } + if (is == null) + throw new FileNotFoundException("Could not read contents from " + filePath); + return is; + } } diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java index 6ddd3b58a..b2a534d9f 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java @@ -1,21 +1,20 @@ -/** - * +/** * Copyright (c) 2015, JULIE Lab. - * + *

* Author: tomanek, jwermter - * - * - * Creation date: Jan 14, 2008 - * + *

+ *

+ * Creation date: Jan 14, 2008 + *

* A entity tagger based on a dictionary lookup. Lingpipe's gazetteer is used. - * - * There are two modes: exact matching (only terms which map exactly to - * those specified in dictionary are found). Approximate matching (by means of - * weighted levenstein distance, approximate matches are found.) - * - * As approximate matching results in concurring matches on overlapping spans, I + *

+ * There are two modes: exact matching (only terms which map exactly to + * those specified in dictionary are found). Approximate matching (by means of + * weighted Levenshtein distance, approximate matches are found.) + *

+ * As approximate matching results in concurring matches on overlapping spans, I * added a mechanism to resolve this according to this rules: in overlapping matches - * the one with the best (here: lowest) score is taken, if more than one chunk has the + * the one with the best (here: lowest) score is taken, if more than one chunk has the * same score, the one with the longest span is chosen. **/ package de.julielab.jcore.ae.lingpipegazetteer.uima; @@ -26,6 +25,7 @@ import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import com.ibm.icu.text.Transliterator; +import de.julielab.java.utilities.spanutils.OffsetSet; import de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider; import de.julielab.jcore.ae.lingpipegazetteer.chunking.OverlappingChunk; import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking; @@ -33,12 +33,14 @@ import de.julielab.jcore.types.Abbreviation; import de.julielab.jcore.types.AbbreviationLongform; import de.julielab.jcore.types.ConceptMention; +import de.julielab.jcore.types.PennBioIEPOSTag; import de.julielab.jcore.types.mantra.Entity; import de.julielab.jcore.utility.JCoReAnnotationTools; import de.julielab.jcore.utility.index.IndexTermGenerator; import de.julielab.jcore.utility.index.JCoReHashMapAnnotationIndex; import de.julielab.jcore.utility.index.TermGenerators; import de.julielab.jcore.utility.index.TermGenerators.LongOffsetIndexTermGenerator; +import org.apache.commons.lang3.Range; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -54,705 +56,677 @@ import org.slf4j.LoggerFactory; import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; public class GazetteerAnnotator extends JCasAnnotator_ImplBase { - private static final String COMPONENT_ID = GazetteerAnnotator.class.getCanonicalName(); - 
private static final Logger LOGGER = LoggerFactory.getLogger(GazetteerAnnotator.class); - public static final String CHUNKER_RESOURCE_NAME = "DictionaryChunkerProvider"; - // public final static String PARAM_USE_APPROXIMATE_MATCHING = - // "UseApproximateMatching"; - public final static String PARAM_CHECK_ACRONYMS = "CheckAcronyms"; - public final static String PARAM_OUTPUT_TYPE = "OutputType"; - /** - * Only required to set to false as an annotator parameter when using - * approximate matching and the ChunkerProvider is set to CaseSensitive false. - * That is because the approximate chunker is always case sensitive. - */ - // public final static String PARAM_CASE_SENSITIVE = "CaseSensitive"; - private static final String PARAM_USE_MANTRA_MODE = "MantraMode"; - /** - * Parameter to indicate whether text - CAS document text for this class - - * should be normalized by completely removing dashes, parenthesis, genitive 's - * and perhaps more. This is meant to replace the generation of term variants - * and cannot be used together with variation generation. If this is switched on - * here, it must also be switched on in the external resource configuration for - * the ChunkerProvider! Can only be used with alternative ChunkerProviderImplAlt - * implementation. - */ - // public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; - /** - * Parameter to indicate whether text - CAS document text for this class - - * should be transliterated, i.e. whether accents and other character variations - * should be stripped. If this is switched on here, it must also be switched on - * in the external resource configuration for the ChunkerProvider! Can only be - * used with alternative ChunkerProviderImplAlt implementation. 
- */ - // public final static String PARAM_TRANSLITERATE_TEXT = - // "TransliterateText"; - - @ConfigurationParameter(name = PARAM_USE_MANTRA_MODE, defaultValue = "false") - private boolean mantraMode = false; - - // needs to be true because of chunker injection: - @ConfigurationParameter(name = PARAM_CHECK_ACRONYMS, defaultValue = "true") - private boolean checkAcronyms = true; - @ConfigurationParameter(name = PARAM_OUTPUT_TYPE) - private String outputType = null; - - @ExternalResource(key = CHUNKER_RESOURCE_NAME, mandatory = true) - private ChunkerProvider provider; - /** - * Removes diacritics and does lower casing - */ - private Transliterator transliterator; - private Chunker gazetteer = null; - private TokenizerFactory normalizationTokenFactory; - private Set stopWords; - - // TODO for debug only - private static int initializeCount = 0; - - public void initialize(UimaContext aContext) throws ResourceInitializationException { - LOGGER.info("calls to initialize: " + initializeCount); - - super.initialize(aContext); - LOGGER.info("initialize() - initializing GazetteerAnnotator..."); - - try { - provider = (ChunkerProvider) getContext().getResourceObject(CHUNKER_RESOURCE_NAME); - gazetteer = provider.getChunker(); -// stopWords = provider.getStopWords(); - String[] stopwordArray = { "a", "about", "above", "across", "after", "afterwards", "again", "against", - "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", - "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", - "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", - "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", - "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", - "cant", "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", - "done", "down", "due", "during", 
"each", "eg", "eight", "either", "eleven", "else", "elsewhere", - "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", - "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", - "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", - "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", - "herself", "high", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", - "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", - "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", - "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", - "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", - "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", - "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", - "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", - "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", - "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", - "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", - "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", - "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", - "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", - "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", - "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", 
"whereas", - "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", - "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", - "you", "your", "yours", "yourself", "yourselves", }; - stopWords = new HashSet<>(); - for (String sw : stopwordArray) - stopWords.add(sw); - } catch (ResourceAccessException e) { - LOGGER.error("Exception while initializing", e); - } - - // check acronyms - checkAcronyms = (Boolean) aContext.getConfigParameterValue(PARAM_CHECK_ACRONYMS); - LOGGER.info( - "Check for acronyms (found dictionary entries that are abbreviations are only accepted if their long form is an abbreviation of the same type, too): {}", - checkAcronyms); - // filter stop words - - Boolean normalizeBoolean = provider.getNormalize();// (Boolean) - // aContext.getConfigParameterValue(PARAM_NORMALIZE_TEXT); - if (normalizeBoolean) { - normalizationTokenFactory = new IndoEuropeanTokenizerFactory(); - } - LOGGER.info("Normalize CAS document text (i.e. do stemming and remove possessive 's): {}", provider.getNormalize()); - - Boolean transliterateBoolean = provider.getTransliterate();// (Boolean) - // aContext.getConfigParameterValue(PARAM_TRANSLITERATE_TEXT); - if (transliterateBoolean) { - transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower"); - } - LOGGER.info("Transliterate CAS document text (i.e. transform accented characters to their base forms): {}", - provider.getTransliterate()); - - // define output level - outputType = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_TYPE); - if (outputType == null) { - LOGGER.error("initialize() - output type not specified."); - throw new ResourceInitializationException(); - } - - mantraMode = aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) != null - ? 
(Boolean) aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) - : false; - } - - /** - * process the CAS, there are two subroutines: one for exact and one for - * approximate matching. - */ - public void process(JCas aJCas) throws AnalysisEngineProcessException { - if (gazetteer == null) - throw new IllegalStateException("The actual gazetteer object is null. Check previous log messages pointing to the error (most probably the dictionary file could not be found)."); - String docText = aJCas.getDocumentText(); - if (docText == null || docText.length() == 0) - return; - if (provider.getUseApproximateMatching() && !provider.getTransliterate() && !provider.getCaseSensitive()) - docText = docText.toLowerCase(); - NormalizedString normalizedDocText = null; - if (provider.getNormalize()) { - normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, - transliterator); - } - - IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); - JCoReHashMapAnnotationIndex conceptMentionIndex = new JCoReHashMapAnnotationIndex<>( - longOffsetTermGenerator, longOffsetTermGenerator, aJCas, ConceptMention.type); - JCoReHashMapAnnotationIndex abbreviationIndex = new JCoReHashMapAnnotationIndex<>( - longOffsetTermGenerator, longOffsetTermGenerator, aJCas, Abbreviation.type); - - LOGGER.debug("Performing actual Gazetteer annotation..."); - Chunking chunking; - if (provider.getNormalize()) - chunking = gazetteer.chunk(normalizedDocText.string); - else - chunking = gazetteer.chunk(docText); - LOGGER.debug("Gazetteer annotation done."); - if (provider.getUseApproximateMatching()) { - /* - * handle matches found by approx matching: this means especially overlapping - * matches with different scores (doesn't happen with exact matches) - */ - List chunkList = filterChunking(chunking); - List overlappingChunks = groupOverlappingChunks(chunkList, - chunking.charSequence().toString()); - // now add the best chunk of all 
overlappingChunks to the CAS - LOGGER.debug("all overlapping chunks:\n"); - // Set bestChunksSet = new HashSet<>(); - for (OverlappingChunk overlappingChunk : overlappingChunks) { - // show chunks - LOGGER.debug(overlappingChunk.toStringAll()); - List bestChunks = overlappingChunk.getBestChunks(); - LOGGER.debug("Found {} best chunks.", bestChunks.size()); - for (int i = 0; i < bestChunks.size(); i++) { - Chunk bestChunk = bestChunks.get(i); - LOGGER.debug("Nr. " + i + " best chunk: " + bestChunk.start() + " - " + bestChunk.end() + ": " - + bestChunk.score() + " ; type: " + bestChunk.type()); - // TODO this check and the corresponding set may be removed - // when this exception hasn't been thrown - // in a - // while. Its currently just to be sure, this should not - // happen any more since the chunks are sorted - // by - // offset in the grouping method. - // if (bestChunksSet.contains(bestChunk)) { - // throw new IllegalStateException("Duplicate best chunk: " + bestChunk); - // } - // bestChunksSet.add(bestChunk); - // add 2 cas - add2Cas(aJCas, bestChunk, normalizedDocText, conceptMentionIndex, abbreviationIndex); - } - } - // for (Chunk chunk : chunking.chunkSet()) { - // add2Cas(aJCas, chunk, normalizedDocText); - // } - } else { - for (Chunk chunk : chunking.chunkSet()) { - add2Cas(aJCas, chunk, normalizedDocText, conceptMentionIndex, abbreviationIndex); - } - } - if (checkAcronyms && !mantraMode) { - LOGGER.debug("process() - checking acronyms"); - annotateAcronymsWithFullFormEntity(aJCas, conceptMentionIndex); - } - } - - private List filterChunking(Chunking chunking) { - // ChunkingImpl newChunking = new ChunkingImpl(chunking.charSequence()); - List newChunking = new ArrayList<>(chunking.chunkSet().size()); - for (Chunk chunk : chunking.chunkSet()) { - String chunkText = chunking.charSequence().subSequence(chunk.start(), chunk.end()).toString(); - if (filterParenthesis(chunkText)) - continue; - if (filterPunctuationArtifacts(chunkText)) - continue; - if 
(filterStopwords(chunkText)) - continue; - newChunking.add(chunk); - } - return newChunking; - } - - private boolean filterPunctuationArtifacts(String chunkText) { - if (chunkText.startsWith("-")) - return true; - if (chunkText.endsWith("-")) - return true; - return false; - } - - private boolean filterStopwords(String chunkText) { - if (stopWords.contains(chunkText.toLowerCase())) - return true; - if (chunkText.contains(" ")) { - String[] words = chunkText.split(" "); - int stopWordCounter = 0; - for (String word : words) { - if (stopWords.contains(word.toLowerCase())) - stopWordCounter++; - } - if (Math.ceil(words.length / 2.0) <= stopWordCounter) { - LOGGER.debug("Filtering due to high stop word occurrences: {}", chunkText); - return true; - } - } - return false; - } - - static boolean filterParenthesis(String chunkText) { - Stack parenthesisStack = new Stack<>(); - // Map pMap = new HashMap<>(); - for (int i = 0; i < chunkText.length(); i++) { - char current = chunkText.charAt(i); - if (isParentheses(current)) { - if (isOpenedParentheses(current)) { - parenthesisStack.add(current); - } else { - if (parenthesisStack.isEmpty()) - return true; - if (!isParenthesisCounterpart(parenthesisStack.pop(), current)) - return true; - } - } - } - if (!parenthesisStack.isEmpty()) - return true; - return false; - } - - private static boolean isParenthesisCounterpart(Character char1, Character char2) { - ParenthesisType char1ParenthesisType = getParenthesisType(char2); - ParenthesisType char2ParenthesisType = getParenthesisType(char1); - if (char1ParenthesisType == ParenthesisType.NONE || char2ParenthesisType == ParenthesisType.NONE) - throw new IllegalArgumentException("The two characters '" + char1 + "' and '" + char2 - + "' were given in order to determine whether they are compatible parenthesis counterparts, but at least one of those characters is no parentheses."); - return char1ParenthesisType.equals(char2ParenthesisType); - } - - // enum ParenthesesType { - // 
ROUND_CLOSED { - // @Override - // boolean isOpen() { - // return false; - // } - // - // }, - // BRACKET_CLOSED { - // @Override - // boolean isOpen() { - // return false; - // } - // }, - // CURLY_CLOSED { - // @Override - // boolean isOpen() { - // return false; - // } - // - // }, - // ROUND_OPENED { - // @Override - // boolean isOpen() { - // return true; - // } - // }, - // BRACKET_OPENED { - // @Override - // boolean isOpen() { - // return true; - // } - // }, - // CURLY_OPENED { - // @Override - // boolean isOpen() { - // return true; - // } - // }; - // abstract boolean isOpen(); - // - // boolean isClose() { - // return !isOpen(); - // }; - // } - - enum ParenthesisType { - ROUND, BRACKET, CURLY, NONE - } - - static ParenthesisType getParenthesisType(char current) { - switch (current) { - case '(': - case ')': - return ParenthesisType.ROUND; - case '[': - case ']': - return ParenthesisType.BRACKET; - case '{': - case '}': - return ParenthesisType.CURLY; - default: - return ParenthesisType.NONE; - } - } - - static boolean isParentheses(char current) { - return isOpenedParentheses(current) || isClosedParentheses(current); - } - - static boolean isOpenedParentheses(char current) { - switch (current) { - case '(': - case '[': - case '{': - return true; - default: - return false; - } - } - - static boolean isClosedParentheses(char current) { - switch (current) { - case ')': - case ']': - case '}': - return true; - default: - return false; - } - } - - static List groupOverlappingChunks(List chunkList, String chunkedText) { - // sort chunkList so the grouping works as intended - Collections.sort(chunkList, new Comparator() { - - @Override - public int compare(Chunk o1, Chunk o2) { - return o1.start() - o2.start(); - } - - }); - // group overlapping chunks - List overlappingChunks = new ArrayList(); - for (Chunk chunk : chunkList) { - // for debugging - // System.out.println("chunking.add(ChunkFactory.createChunk(" + - // chunk.start() + ", " + chunk.end() + - // 
", 0d));"); - boolean added = false; - for (OverlappingChunk over : overlappingChunks) { - if (over.isOverlappingSpan(chunk.start(), chunk.end())) { - over.addChunk(chunk.start(), chunk.end(), chunk); - added = true; - } - } - if (!added) { - overlappingChunks.add(new OverlappingChunk(chunk.start(), chunk.end(), chunk, chunkedText)); - added = true; - } - } - return overlappingChunks; - } - - // ------------ INFO .......... - // String text = aJCas.getDocumentText(); - // int start = chunk.start(); - // int end = chunk.end(); - // String type = chunk.type(); - // double score = chunk.score(); - // String phrase = text.substring(start, end); - // System.out.println(" found phrase=|" + phrase + "|" - // + " start=" + start + " end=" + end + " type=" + type - // + " score=" + score); - // ------------ INFO .......... - /** - * checks whether a chunk (= dictionary match) is an acronym. If yes, checks - * whether respective full form (obtained via abbr textReference) is - * ConceptMention and has same specificType as chunk If these conditions are not - * fulfilled, no entity annotation will be made. 
- * - * @param abbreviationIndex - * @param conceptMentionIndex - */ - private boolean isAcronymWithSameFullFormSpecificType(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText, - JCoReHashMapAnnotationIndex conceptMentionIndex, - JCoReHashMapAnnotationIndex abbreviationIndex) { - // Annotation anno; - int start; - int end; - if (provider.getNormalize()) { - try { - start = normalizedDocText.getOriginalOffset(chunk.start()); - end = normalizedDocText.getOriginalOffset(chunk.end()); - } catch (Exception e) { - System.out.println("Text: " + normalizedDocText); - System.out.println("Chunk: " + chunk); - System.out.println("Chunk end: " + chunk.end()); - System.out - .println("Normalized Text: " + normalizedDocText.string.substring(chunk.start(), chunk.end())); - throw e; - } - // anno = new Annotation(aJCas, start, end); - } else { - start = chunk.start(); - end = chunk.end(); - } - - LongOffsetIndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); - // Retrieves potential abbr annotation - Abbreviation abbr = abbreviationIndex.getFirst(longOffsetTermGenerator.forOffsets(start, end)); - // check whether it's an abbr - String chunktext = null; - if (LOGGER.isDebugEnabled()) - chunktext = aJCas.getDocumentText().substring(start, end); - if (abbr == null) { - LOGGER.debug("{} chunk \"{}\" is not an abbreviation\n", chunk, chunktext); - return true; - } - // checks whether respective full form is ConceptMention - AbbreviationLongform textRef = abbr.getTextReference(); - ConceptMention em = conceptMentionIndex.getFirst(textRef); - if (em == null) { - LOGGER.debug( - chunk + " chunk \"{}\" is an abbreviation but respective full \"{}\" form is no ConceptMention\n", - chunktext, textRef.getCoveredText()); - return false; - } - - // checks whether full form annotation matches the type to be annotated - // here - String emType = em.getClass().getCanonicalName(); - if (emType.equals(outputType)) { - LOGGER.debug(chunk - + " chunk \"{}\" is 
an abbreviation and respective full form \"{}\" is ConceptMention with same type as OutputType\n", - chunktext, em.getCoveredText()); - return true; - } - - LOGGER.debug(chunk - + " chunk \"{}\" is an abbreviation but respective full form \"{}\" is ConceptMention without the correct OutputType (is: {}; OutputType: {})\n", - new Object[] { chunktext, em.getCoveredText(), emType, outputType }); - return false; - } - - /** - * adds a chunk as an annotation to the CAS - * - * @param normalizedDocText - * @param abbreviationIndex - * @param conceptMentionIndex - */ - private void add2Cas(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText, - JCoReHashMapAnnotationIndex conceptMentionIndex, - JCoReHashMapAnnotationIndex abbreviationIndex) throws AnalysisEngineProcessException { - // System.out.println("CHUNK: start=" + chunk.start() + " end=" + - // chunk.end()); - // if checkAcronyms, then check acronyms for compliant full forms (= - // with same specificType) - if (checkAcronyms && !isAcronymWithSameFullFormSpecificType(aJCas, chunk, normalizedDocText, - conceptMentionIndex, abbreviationIndex)) { - return; - } - - int start = provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.start()) : chunk.start(); - int end = provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.end()) : chunk.end(); - - try { - if (mantraMode) { - // the "type" string is used to transport all data needed for - // the MAN-XML format - for (String term : chunk.type().split("@@TERM@@")) { - // @@ is used to separate source, cui, type(s) and group (in - // this order!) 
- String[] info = term.split("@@"); - Entity newEntity = (Entity) JCoReAnnotationTools.getAnnotationByClassName(aJCas, - "de.julielab.jcore.types.mantra.Entity"); - newEntity.setBegin(start); - newEntity.setEnd(end); - newEntity.setComponentId(COMPONENT_ID); - newEntity.setConfidence(chunk.score() + ""); - - // mantra specific - newEntity.setSource(info[0]); - newEntity.setCui(info[1]); - newEntity.setSemanticType(info[2]); - newEntity.setSemanticGroup(info[3]); - - newEntity.addToIndexes(); - } - } else { - ConceptMention newEntity = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, - outputType); - newEntity.setBegin(start); - newEntity.setEnd(end); - - // String entityText = newEntity.getCoveredText(); - // if (stopWords.contains(entityText.toLowerCase())) - // return; - // if (entityText.contains(" ")) { - // String[] words = entityText.split(" "); - // int stopWordCounter = 0; - // for (String word : words) { - // if (stopWords.contains(word.toLowerCase())) - // stopWordCounter++; - // } - // if (words.length == stopWordCounter) - // return; - // } - - newEntity.setSpecificType(chunk.type()); - newEntity.setComponentId(COMPONENT_ID); - newEntity.setConfidence(chunk.score() + ""); - newEntity.addToIndexes(); - - conceptMentionIndex.index(newEntity); - } - } catch (Exception e) { - LOGGER.error("process() - could not generate output type: " + e.getMessage()); - e.printStackTrace(); - throw new AnalysisEngineProcessException(e); - } - } - - private void annotateAcronymsWithFullFormEntity(JCas aJCas, - JCoReHashMapAnnotationIndex conceptMentionIndex) - throws AnalysisEngineProcessException { - - JFSIndexRepository indexes = aJCas.getJFSIndexRepository(); - FSIterator abbrevIter = indexes.getAnnotationIndex(Abbreviation.type).iterator(); - IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); - - // loop over all abbreviations - while (abbrevIter.hasNext()) { - Abbreviation abbrev = (Abbreviation) 
abbrevIter.next(); - AbbreviationLongform fullFormAnnotation = abbrev.getTextReference(); - LOGGER.debug("annotateAcronymsWithFullFormEntity() - checking abbreviation: " + abbrev.getCoveredText()); - ConceptMention emFullform = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, fullFormAnnotation, - // ConceptMention.class); - emFullform = conceptMentionIndex.getFirst(fullFormAnnotation); - - // The following code was once introduced for gene tagging. There, - // the acronym fullforms sometimes miss minor parts of an annotated - // gene, leading to non-annotated acronyms that would have been - // correct. - // However, for general-purpose concept recognition this approach - // can be quite harmful. Example: "Anaphase-promoting complex (APC)" - // where only "anaphase" is recognized as concept. Now, "APC" would - // be annotated as an acronym for "anaphase". Here, a better - // recognition of the abbreviation span is required. - // ConceptMention emFullform = null; - // List conceptsInFullform = - // JCoReAnnotationTools.getIncludedAnnotations(aJCas, - // fullFormAnnotation, - // ConceptMention.class); - // if (conceptsInFullform.size() == 1) { - // emFullform = conceptsInFullform.get(0); - // LOGGER.debug("Found a single ConceptMention included in the full - // form: {}", emFullform.getCoveredText()); - // } else if (conceptsInFullform.size() > 1) { - // // If there are multiple ConceptMentions found in the full form, - // take that largest right-most candidate. 
- // int maxSize = -1; - // for (ConceptMention em : conceptsInFullform) { - // int emSize = em.getEnd() - em.getBegin(); - // if (emSize > maxSize) { - // emFullform = em; - // maxSize = emSize; - // } - // } - // LOGGER.debug("Found multiple ConceptMentions included in the full - // form \"{}\", returning the longest.", - // fullFormAnnotation.getCoveredText()); - // if (LOGGER.isTraceEnabled()) { - // LOGGER.trace("All found ConceptMentions:"); - // for (ConceptMention cm : conceptsInFullform) { - // LOGGER.trace("Text: {}; offsets: {}-{}", - // new Object[] { cm.getCoveredText(), cm.getBegin(), cm.getEnd() - // }); - // } - // } - // } else { - // LOGGER.debug("No ConceptMention in the span of acronym fullform - // \"{}\" found.", - // fullFormAnnotation.getCoveredText()); - // } - - String type = null; - if (emFullform != null) - type = emFullform.getClass().getCanonicalName(); - - ConceptMention emAcronym = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, abbrev, - // ConceptMention.class); - emAcronym = conceptMentionIndex.getFirst(abbrev); - // This is really slow, really a pain with full texts. - // It was originally introduced to push recall for gene recognition. - // So now we will lose (a bit) of recognition performance there. 
- // ConceptMention emAcronym = - // JCoReAnnotationTools.getPartiallyOverlappingAnnotation(aJCas, - // abbrev, - // ConceptMention.class); - - // if type of the entity is equal to the output type for this - // annotator - if (type != null && type.equals(outputType)) { - if (emFullform == null) { - LOGGER.debug( - "annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no ConceptMention\n"); - continue; - } - if (emFullform.getComponentId() != null && emFullform.getComponentId().equals(COMPONENT_ID) - && (emAcronym == null - || !emAcronym.getClass().getName().equals(emFullform.getClass().getName()))) { - - try { - LOGGER.debug("annotateAcronymsWithFullFormEntity() - fullform of abbreviation (" - + abbrev.getCoveredText() + " [begin=" + abbrev.getBegin() + "; end=" + abbrev.getEnd() - + "]) has ConceptMention: " + emFullform.toString()); - ConceptMention newEntityOnAcronym = (ConceptMention) JCoReAnnotationTools - .getAnnotationByClassName(aJCas, outputType); - newEntityOnAcronym.setBegin(abbrev.getBegin()); - newEntityOnAcronym.setEnd(abbrev.getEnd()); - newEntityOnAcronym.setTextualRepresentation(newEntityOnAcronym.getCoveredText()); - newEntityOnAcronym.setSpecificType(emFullform.getSpecificType()); - newEntityOnAcronym.setComponentId(COMPONENT_ID + "+acronym"); - newEntityOnAcronym.setConfidence(emFullform.getConfidence() + ""); - newEntityOnAcronym.addToIndexes(); - - } catch (Exception e) { - LOGGER.error("process() - could not generate output type: " + e.getMessage()); - e.printStackTrace(); - throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, - null); - } - - } else { - if (emAcronym == null) - LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcronym != null"); - else if (emAcronym.getClass().getName().equals(emFullform.getClass().getName())) - LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcroType=" - + emAcronym.getClass().getCanonicalName() + " == emFullformType=" - + 
emFullform.getClass().getCanonicalName()); - } - - } - } - } + public static final String CHUNKER_RESOURCE_NAME = "DictionaryChunkerProvider"; + // public final static String PARAM_USE_APPROXIMATE_MATCHING = + // "UseApproximateMatching"; + public final static String PARAM_CHECK_ACRONYMS = "CheckAcronyms"; + public final static String PARAM_OUTPUT_TYPE = "OutputType"; + private static final String COMPONENT_ID = GazetteerAnnotator.class.getCanonicalName(); + private static final Logger LOGGER = LoggerFactory.getLogger(GazetteerAnnotator.class); + /** + * Only required to set to false as an annotator parameter when using + * approximate matching and the ChunkerProvider is set to CaseSensitive false. + * That is because the approximate chunker is always case sensitive. + */ + // public final static String PARAM_CASE_SENSITIVE = "CaseSensitive"; + private static final String PARAM_USE_MANTRA_MODE = "MantraMode"; + /** + * Parameter to indicate whether text - CAS document text for this class - + * should be normalized by completely removing dashes, parenthesis, genitive 's + * and perhaps more. This is meant to replace the generation of term variants + * and cannot be used together with variation generation. If this is switched on + * here, it must also be switched on in the external resource configuration for + * the ChunkerProvider! Can only be used with alternative ChunkerProviderImplAlt + * implementation. + */ + // public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; + // TODO for debug only + private static int initializeCount = 0; + /** + * Parameter to indicate whether text - CAS document text for this class - + * should be transliterated, i.e. whether accents and other character variations + * should be stripped. If this is switched on here, it must also be switched on + * in the external resource configuration for the ChunkerProvider! Can only be + * used with alternative ChunkerProviderImplAlt implementation. 
+ */ + // public final static String PARAM_TRANSLITERATE_TEXT = + // "TransliterateText"; + + @ConfigurationParameter(name = PARAM_USE_MANTRA_MODE, defaultValue = "false") + private boolean mantraMode = false; + // needs to be true because of chunker injection: + @ConfigurationParameter(name = PARAM_CHECK_ACRONYMS, defaultValue = "true") + private boolean checkAcronyms = true; + @ConfigurationParameter(name = PARAM_OUTPUT_TYPE) + private String outputType = null; + @ExternalResource(key = CHUNKER_RESOURCE_NAME, mandatory = true) + private ChunkerProvider provider; + /** + * Removes diacritics and does lower casing + */ + private Transliterator transliterator; + private Chunker gazetteer = null; + private TokenizerFactory normalizationTokenFactory; + private Set stopWords; + + static boolean filterParenthesis(String chunkText) { + Stack parenthesisStack = new Stack<>(); + // Map pMap = new HashMap<>(); + for (int i = 0; i < chunkText.length(); i++) { + char current = chunkText.charAt(i); + if (isParentheses(current)) { + if (isOpenedParentheses(current)) { + parenthesisStack.add(current); + } else { + if (parenthesisStack.isEmpty()) + return true; + if (!isParenthesisCounterpart(parenthesisStack.pop(), current)) + return true; + } + } + } + if (!parenthesisStack.isEmpty()) + return true; + return false; + } + + private static boolean isParenthesisCounterpart(Character char1, Character char2) { + ParenthesisType char1ParenthesisType = getParenthesisType(char2); + ParenthesisType char2ParenthesisType = getParenthesisType(char1); + if (char1ParenthesisType == ParenthesisType.NONE || char2ParenthesisType == ParenthesisType.NONE) + throw new IllegalArgumentException("The two characters '" + char1 + "' and '" + char2 + + "' were given in order to determine whether they are compatible parenthesis counterparts, but at least one of those characters is no parentheses."); + return char1ParenthesisType.equals(char2ParenthesisType); + } + + static ParenthesisType 
getParenthesisType(char current) { + switch (current) { + case '(': + case ')': + return ParenthesisType.ROUND; + case '[': + case ']': + return ParenthesisType.BRACKET; + case '{': + case '}': + return ParenthesisType.CURLY; + default: + return ParenthesisType.NONE; + } + } + + static boolean isParentheses(char current) { + return isOpenedParentheses(current) || isClosedParentheses(current); + } + + static boolean isOpenedParentheses(char current) { + switch (current) { + case '(': + case '[': + case '{': + return true; + default: + return false; + } + } + + static boolean isClosedParentheses(char current) { + switch (current) { + case ')': + case ']': + case '}': + return true; + default: + return false; + } + } + + static List groupOverlappingChunks(List chunkList, String chunkedText) { + // sort chunkList so the grouping works as intended + Collections.sort(chunkList, new Comparator() { + + @Override + public int compare(Chunk o1, Chunk o2) { + return o1.start() - o2.start(); + } + + }); + // group overlapping chunks + List overlappingChunks = new ArrayList(); + for (Chunk chunk : chunkList) { + // for debugging + // System.out.println("chunking.add(ChunkFactory.createChunk(" + + // chunk.start() + ", " + chunk.end() + + // ", 0d));"); + boolean added = false; + for (OverlappingChunk over : overlappingChunks) { + if (over.isOverlappingSpan(chunk.start(), chunk.end())) { + over.addChunk(chunk.start(), chunk.end(), chunk); + added = true; + } + } + if (!added) { + overlappingChunks.add(new OverlappingChunk(chunk.start(), chunk.end(), chunk, chunkedText)); + added = true; + } + } + return overlappingChunks; + } + + public void initialize(UimaContext aContext) throws ResourceInitializationException { + LOGGER.info("calls to initialize: " + initializeCount); + + super.initialize(aContext); + LOGGER.info("initialize() - initializing GazetteerAnnotator..."); + + try { + provider = (ChunkerProvider) getContext().getResourceObject(CHUNKER_RESOURCE_NAME); + gazetteer = 
provider.getChunker(); + stopWords = provider.getStopWords(); +// String[] stopwordArray = {"a", "about", "above", "across", "after", "afterwards", "again", "against", +// "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", +// "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", +// "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", +// "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", +// "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", +// "cant", "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", +// "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", +// "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", +// "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", +// "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", +// "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", +// "herself", "high", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", +// "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", +// "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", +// "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", +// "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", +// "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", +// "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", +// "part", "per", "perhaps", "please", 
"put", "rather", "re", "same", "see", "seem", "seemed", +// "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", +// "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", +// "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", +// "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", +// "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", +// "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", +// "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", +// "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", +// "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", +// "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", +// "you", "your", "yours", "yourself", "yourselves",}; +// stopWords = new HashSet<>(); +// for (String sw : stopwordArray) +// stopWords.add(sw); + } catch (ResourceAccessException e) { + LOGGER.error("Exception while initializing", e); + } + + // check acronyms + checkAcronyms = (Boolean) aContext.getConfigParameterValue(PARAM_CHECK_ACRONYMS); + LOGGER.info( + "Check for acronyms (found dictionary entries that are abbreviations are only accepted if their long form is an abbreviation of the same type, too): {}", + checkAcronyms); + // filter stop words + + Boolean normalizeBoolean = provider.getNormalize(); + if (normalizeBoolean) { + normalizationTokenFactory = new IndoEuropeanTokenizerFactory(); + } + LOGGER.info("Normalize CAS document text (i.e. 
do stemming and remove possessive 's): {}", provider.getNormalize()); + + Boolean transliterateBoolean = provider.getTransliterate();// (Boolean) + // aContext.getConfigParameterValue(PARAM_TRANSLITERATE_TEXT); +// if (transliterateBoolean) { +// transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower"); + transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC"); +// } + LOGGER.info("Transliterate CAS document text (i.e. transform accented characters to their base forms): {}", + provider.getTransliterate()); + + // define output level + outputType = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_TYPE); + if (outputType == null) { + LOGGER.error("initialize() - output type not specified."); + throw new ResourceInitializationException(); + } + + mantraMode = aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) != null + ? (Boolean) aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) + : false; + } + + /** + * process the CAS, there are two subroutines: one for exact and one for + * approximate matching. + */ + public void process(JCas aJCas) throws AnalysisEngineProcessException { + if (gazetteer == null) + throw new IllegalStateException("The actual gazetteer object is null. 
Check previous log messages pointing to the error (most probably the dictionary file could not be found)."); + String docText = aJCas.getDocumentText(); + if (docText == null || docText.length() == 0) + return; + // normalization includes transliteration + if (provider.getTransliterate() && !provider.getNormalize()) + docText = transliterator.transform(docText); + NormalizedString normalizedDocText = null; + if (provider.getNormalize()) { + if (provider.getNormalizePlural()) { + OffsetSet pluralOffsets = StreamSupport.stream(Spliterators.spliterator(aJCas.getAnnotationIndex(PennBioIEPOSTag.type).iterator(), 0, 0), false).filter(tag -> tag.getValue().equals("NNS")).map(tag -> Range.between(tag.getBegin(), tag.getEnd())).collect(Collectors.toCollection(OffsetSet::new)); + normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, true, pluralOffsets, transliterator); + } else { + normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, transliterator); + } + } + // exact matching has a switch for case sensitivity, so we can save the work here + if (!provider.getCaseSensitive() && provider.getUseApproximateMatching()) { + if (provider.getNormalize()) + normalizedDocText.string = normalizedDocText.string.toLowerCase(); + else + docText = docText.toLowerCase(); + } + + IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); + JCoReHashMapAnnotationIndex conceptMentionIndex = new JCoReHashMapAnnotationIndex<>( + longOffsetTermGenerator, longOffsetTermGenerator, aJCas, ConceptMention.type); + JCoReHashMapAnnotationIndex abbreviationIndex = new JCoReHashMapAnnotationIndex<>( + longOffsetTermGenerator, longOffsetTermGenerator, aJCas, Abbreviation.type); + + LOGGER.debug("Performing actual Gazetteer annotation..."); + Chunking chunking; + if (provider.getNormalize()) + chunking = gazetteer.chunk(normalizedDocText.string); + else + chunking = 
gazetteer.chunk(docText); + LOGGER.debug("Gazetteer annotation done."); + if (provider.getUseApproximateMatching()) { + /* + * handle matches found by approx matching: this means especially overlapping + * matches with different scores (doesn't happen with exact matches) + */ + List chunkList = filterChunking(chunking); + List overlappingChunks = groupOverlappingChunks(chunkList, + chunking.charSequence().toString()); + // now add the best chunk of all overlappingChunks to the CAS + LOGGER.debug("all overlapping chunks:\n"); + // Set bestChunksSet = new HashSet<>(); + for (OverlappingChunk overlappingChunk : overlappingChunks) { + // show chunks + LOGGER.debug(overlappingChunk.toStringAll()); + List bestChunks = overlappingChunk.getBestChunks(); + LOGGER.debug("Found {} best chunks.", bestChunks.size()); + for (int i = 0; i < bestChunks.size(); i++) { + Chunk bestChunk = bestChunks.get(i); + if (LOGGER.isDebugEnabled()) { + String chunkText = provider.getNormalize() ? normalizedDocText.string.substring(bestChunk.start(), bestChunk.end()) : aJCas.getDocumentText().substring(bestChunk.start(), bestChunk.end()); + LOGGER.debug("Nr. " + i + " best chunk: " + bestChunk.start() + " - " + bestChunk.end() + ": " + + bestChunk.score() + " ; type: " + bestChunk.type() + " ; text: " + chunkText); + } + // TODO this check and the corresponding set may be removed + // when this exception hasn't been thrown + // in a + // while. Its currently just to be sure, this should not + // happen any more since the chunks are sorted + // by + // offset in the grouping method. 
+ // if (bestChunksSet.contains(bestChunk)) { + // throw new IllegalStateException("Duplicate best chunk: " + bestChunk); + // } + // bestChunksSet.add(bestChunk); + // add 2 cas + add2Cas(aJCas, bestChunk, normalizedDocText, conceptMentionIndex, abbreviationIndex); + } + } + // for (Chunk chunk : chunking.chunkSet()) { + // add2Cas(aJCas, chunk, normalizedDocText); + // } + } else { + for (Chunk chunk : chunking.chunkSet()) { + add2Cas(aJCas, chunk, normalizedDocText, conceptMentionIndex, abbreviationIndex); + } + } + if (checkAcronyms && !mantraMode) { + LOGGER.debug("process() - checking acronyms"); + annotateAcronymsWithFullFormEntity(aJCas, conceptMentionIndex); + } + } + + private List filterChunking(Chunking chunking) { + // ChunkingImpl newChunking = new ChunkingImpl(chunking.charSequence()); + List newChunking = new ArrayList<>(chunking.chunkSet().size()); + for (Chunk chunk : chunking.chunkSet()) { + String chunkText = chunking.charSequence().subSequence(chunk.start(), chunk.end()).toString(); + if (filterParenthesis(chunkText)) + continue; + if (filterPunctuationArtifacts(chunkText)) + continue; + if (filterStopwords(chunkText)) + continue; + newChunking.add(chunk); + } + return newChunking; + } + + private boolean filterPunctuationArtifacts(String chunkText) { + if (chunkText.startsWith("-")) + return true; + if (chunkText.endsWith("-")) + return true; + return false; + } + + private boolean filterStopwords(String chunkText) { + if (stopWords.contains(chunkText.toLowerCase())) + return true; + if (chunkText.contains(" ")) { + String[] words = chunkText.split(" "); + int stopWordCounter = 0; + for (String word : words) { + if (stopWords.contains(word.toLowerCase())) + stopWordCounter++; + } + if (Math.ceil(words.length / 2.0) <= stopWordCounter) { + LOGGER.debug("Filtering due to high stop word occurrences: {}", chunkText); + return true; + } + } + return false; + } + + /** + * checks whether a chunk (= dictionary match) is an acronym. 
If yes, checks + * whether respective full form (obtained via abbr textReference) is + * ConceptMention and has same specificType as chunk If these conditions are not + * fulfilled, no entity annotation will be made. + * + * @param abbreviationIndex + * @param conceptMentionIndex + */ + private boolean isAcronymWithSameFullFormSpecificType(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText, + JCoReHashMapAnnotationIndex conceptMentionIndex, + JCoReHashMapAnnotationIndex abbreviationIndex) { + // Annotation anno; + int start; + int end; + if (provider.getNormalize()) { + try { + start = normalizedDocText.getOriginalOffset(chunk.start()); + end = normalizedDocText.getOriginalOffset(chunk.end()); + } catch (Exception e) { + System.out.println("Text: " + normalizedDocText); + System.out.println("Chunk: " + chunk); + System.out.println("Chunk end: " + chunk.end()); + System.out + .println("Normalized Text: " + normalizedDocText.string.substring(chunk.start(), chunk.end())); + throw e; + } + // anno = new Annotation(aJCas, start, end); + } else { + start = chunk.start(); + end = chunk.end(); + } + + LongOffsetIndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); + // Retrieves potential abbr annotation + Abbreviation abbr = abbreviationIndex.getFirst(longOffsetTermGenerator.forOffsets(start, end)); + // check whether it's an abbr + String chunktext = null; + if (LOGGER.isDebugEnabled()) + chunktext = aJCas.getDocumentText().substring(start, end); + if (abbr == null) { + LOGGER.debug("{} chunk \"{}\" is not an abbreviation\n", chunk, chunktext); + return true; + } + // checks whether respective full form is ConceptMention + AbbreviationLongform textRef = abbr.getTextReference(); + ConceptMention em = conceptMentionIndex.getFirst(textRef); + if (em == null) { + LOGGER.debug( + chunk + " chunk \"{}\" is an abbreviation but respective full \"{}\" form is no ConceptMention\n", + chunktext, textRef.getCoveredText()); + return false; + 
} + + // checks whether full form annotation matches the type to be annotated + // here + String emType = em.getClass().getCanonicalName(); + if (emType.equals(outputType)) { + LOGGER.debug(chunk + + " chunk \"{}\" is an abbreviation and respective full form \"{}\" is ConceptMention with same type as OutputType\n", + chunktext, em.getCoveredText()); + return true; + } + + LOGGER.debug(chunk + + " chunk \"{}\" is an abbreviation but respective full form \"{}\" is ConceptMention without the correct OutputType (is: {}; OutputType: {})\n", + new Object[]{chunktext, em.getCoveredText(), emType, outputType}); + return false; + } + + // ------------ INFO .......... + // String text = aJCas.getDocumentText(); + // int start = chunk.start(); + // int end = chunk.end(); + // String type = chunk.type(); + // double score = chunk.score(); + // String phrase = text.substring(start, end); + // System.out.println(" found phrase=|" + phrase + "|" + // + " start=" + start + " end=" + end + " type=" + type + // + " score=" + score); + // ------------ INFO .......... + + /** + * adds a chunk as an annotation to the CAS + * + * @param normalizedDocText + * @param abbreviationIndex + * @param conceptMentionIndex + */ + private void add2Cas(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText, + JCoReHashMapAnnotationIndex conceptMentionIndex, + JCoReHashMapAnnotationIndex abbreviationIndex) throws AnalysisEngineProcessException { + // System.out.println("CHUNK: start=" + chunk.start() + " end=" + + // chunk.end()); + // if checkAcronyms, then check acronyms for compliant full forms (= + // with same specificType) + if (checkAcronyms && !isAcronymWithSameFullFormSpecificType(aJCas, chunk, normalizedDocText, + conceptMentionIndex, abbreviationIndex)) { + return; + } + + // The Math.min(, Math.max(0, )) application is a security measure. I rare cases they are issues with multi + // byte character encodings. 
This security measure won't correct the underlying error but avoid errors + // due to invalid offsets. + int start = Math.min(aJCas.getDocumentText().length(), Math.max(0, provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.start()) : chunk.start())); + int end = Math.min(aJCas.getDocumentText().length(), Math.max(0, provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.end()) : chunk.end())); + + try { + if (mantraMode) { + // the "type" string is used to transport all data needed for + // the MAN-XML format + for (String term : chunk.type().split("@@TERM@@")) { + // @@ is used to separate source, cui, type(s) and group (in + // this order!) + String[] info = term.split("@@"); + Entity newEntity = (Entity) JCoReAnnotationTools.getAnnotationByClassName(aJCas, + "de.julielab.jcore.types.mantra.Entity"); + newEntity.setBegin(start); + newEntity.setEnd(end); + newEntity.setComponentId(COMPONENT_ID); + newEntity.setConfidence(chunk.score() + ""); + + // mantra specific + newEntity.setSource(info[0]); + newEntity.setCui(info[1]); + newEntity.setSemanticType(info[2]); + newEntity.setSemanticGroup(info[3]); + + newEntity.addToIndexes(); + } + } else { + ConceptMention newEntity = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, + outputType); + newEntity.setBegin(start); + newEntity.setEnd(end); + + // String entityText = newEntity.getCoveredText(); + // if (stopWords.contains(entityText.toLowerCase())) + // return; + // if (entityText.contains(" ")) { + // String[] words = entityText.split(" "); + // int stopWordCounter = 0; + // for (String word : words) { + // if (stopWords.contains(word.toLowerCase())) + // stopWordCounter++; + // } + // if (words.length == stopWordCounter) + // return; + // } + + newEntity.setSpecificType(chunk.type()); + newEntity.setComponentId(COMPONENT_ID); + newEntity.setConfidence(chunk.score() + ""); + newEntity.addToIndexes(); + + conceptMentionIndex.index(newEntity); + } + } catch (Exception 
e) { + LOGGER.error("process() - could not generate output type: " + e.getMessage()); + e.printStackTrace(); + throw new AnalysisEngineProcessException(e); + } + } + + private void annotateAcronymsWithFullFormEntity(JCas aJCas, + JCoReHashMapAnnotationIndex conceptMentionIndex) + throws AnalysisEngineProcessException { + + JFSIndexRepository indexes = aJCas.getJFSIndexRepository(); + FSIterator abbrevIter = indexes.getAnnotationIndex(Abbreviation.type).iterator(); + IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); + + // loop over all abbreviations + while (abbrevIter.hasNext()) { + Abbreviation abbrev = (Abbreviation) abbrevIter.next(); + AbbreviationLongform fullFormAnnotation = abbrev.getTextReference(); + LOGGER.debug("annotateAcronymsWithFullFormEntity() - checking abbreviation: " + abbrev.getCoveredText()); + ConceptMention emFullform = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, fullFormAnnotation, + // ConceptMention.class); + emFullform = conceptMentionIndex.getFirst(fullFormAnnotation); + + // The following code was once introduced for gene tagging. There, + // the acronym fullforms sometimes miss minor parts of an annotated + // gene, leading to non-annotated acronyms that would have been + // correct. + // However, for general-purpose concept recognition this approach + // can be quite harmful. Example: "Anaphase-promoting complex (APC)" + // where only "anaphase" is recognized as concept. Now, "APC" would + // be annotated as an acronym for "anaphase". Here, a better + // recognition of the abbreviation span is required. 
+ // ConceptMention emFullform = null; + // List conceptsInFullform = + // JCoReAnnotationTools.getIncludedAnnotations(aJCas, + // fullFormAnnotation, + // ConceptMention.class); + // if (conceptsInFullform.size() == 1) { + // emFullform = conceptsInFullform.get(0); + // LOGGER.debug("Found a single ConceptMention included in the full + // form: {}", emFullform.getCoveredText()); + // } else if (conceptsInFullform.size() > 1) { + // // If there are multiple ConceptMentions found in the full form, + // take that largest right-most candidate. + // int maxSize = -1; + // for (ConceptMention em : conceptsInFullform) { + // int emSize = em.getEnd() - em.getBegin(); + // if (emSize > maxSize) { + // emFullform = em; + // maxSize = emSize; + // } + // } + // LOGGER.debug("Found multiple ConceptMentions included in the full + // form \"{}\", returning the longest.", + // fullFormAnnotation.getCoveredText()); + // if (LOGGER.isTraceEnabled()) { + // LOGGER.trace("All found ConceptMentions:"); + // for (ConceptMention cm : conceptsInFullform) { + // LOGGER.trace("Text: {}; offsets: {}-{}", + // new Object[] { cm.getCoveredText(), cm.getBegin(), cm.getEnd() + // }); + // } + // } + // } else { + // LOGGER.debug("No ConceptMention in the span of acronym fullform + // \"{}\" found.", + // fullFormAnnotation.getCoveredText()); + // } + + String type = null; + if (emFullform != null) + type = emFullform.getClass().getCanonicalName(); + + ConceptMention emAcronym = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, abbrev, + // ConceptMention.class); + emAcronym = conceptMentionIndex.getFirst(abbrev); + // This is really slow, really a pain with full texts. + // It was originally introduced to push recall for gene recognition. + // So now we will lose (a bit) of recognition performance there. 
+ // ConceptMention emAcronym = + // JCoReAnnotationTools.getPartiallyOverlappingAnnotation(aJCas, + // abbrev, + // ConceptMention.class); + + // if type of the entity is equal to the output type for this + // annotator + if (type != null && type.equals(outputType)) { + if (emFullform == null) { + LOGGER.debug( + "annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no ConceptMention\n"); + continue; + } + if (emFullform.getComponentId() != null && emFullform.getComponentId().equals(COMPONENT_ID) + && (emAcronym == null + || !emAcronym.getClass().getName().equals(emFullform.getClass().getName()))) { + + try { + LOGGER.debug("annotateAcronymsWithFullFormEntity() - fullform of abbreviation (" + + abbrev.getCoveredText() + " [begin=" + abbrev.getBegin() + "; end=" + abbrev.getEnd() + + "]) has ConceptMention: " + emFullform.toString()); + ConceptMention newEntityOnAcronym = (ConceptMention) JCoReAnnotationTools + .getAnnotationByClassName(aJCas, outputType); + newEntityOnAcronym.setBegin(abbrev.getBegin()); + newEntityOnAcronym.setEnd(abbrev.getEnd()); + newEntityOnAcronym.setTextualRepresentation(newEntityOnAcronym.getCoveredText()); + newEntityOnAcronym.setSpecificType(emFullform.getSpecificType()); + newEntityOnAcronym.setComponentId(COMPONENT_ID + "+acronym"); + newEntityOnAcronym.setConfidence(emFullform.getConfidence() + ""); + newEntityOnAcronym.addToIndexes(); + + } catch (Exception e) { + LOGGER.error("process() - could not generate output type: " + e.getMessage()); + e.printStackTrace(); + throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, + null); + } + + } else { + if (emAcronym == null) + LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcronym != null"); + else if (emAcronym.getClass().getName().equals(emFullform.getClass().getName())) + LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcroType=" + + emAcronym.getClass().getCanonicalName() + " == emFullformType=" + + 
emFullform.getClass().getCanonicalName()); + } + + } + } + } + + enum ParenthesisType { + ROUND, BRACKET, CURLY, NONE + } } diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java index 2cffe9bde..3172f5601 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java @@ -1,213 +1,252 @@ - package de.julielab.jcore.ae.lingpipegazetteer.utils; +import com.aliasi.tokenizer.PorterStemmerTokenizerFactory; import com.aliasi.tokenizer.Tokenizer; import com.aliasi.tokenizer.TokenizerFactory; import com.ibm.icu.text.Transliterator; +import de.julielab.java.utilities.spanutils.OffsetSet; +import org.apache.commons.lang3.Range; import java.util.*; public class StringNormalizerForChunking { - public enum Mode { - /** - * Punctuation characters are deleted completely, shrinking the string. - */ - DELETE, - /** Punctuation characters are replaced by white spaces. 
*/ - REPLACE - } - - private static Set charsToDelete = new HashSet<>(); - static { - charsToDelete.add('-'); - charsToDelete.add('+'); - charsToDelete.add(','); - charsToDelete.add('.'); - charsToDelete.add(':'); - charsToDelete.add(';'); - charsToDelete.add('?'); - charsToDelete.add('!'); - charsToDelete.add('*'); - charsToDelete.add('§'); - charsToDelete.add('$'); - charsToDelete.add('%'); - charsToDelete.add('&'); - charsToDelete.add('/'); - charsToDelete.add('\\'); - charsToDelete.add('('); - charsToDelete.add(')'); - charsToDelete.add('<'); - charsToDelete.add('>'); - charsToDelete.add('['); - charsToDelete.add(']'); - charsToDelete.add('='); - charsToDelete.add('\''); - charsToDelete.add('`'); - charsToDelete.add('´'); - charsToDelete.add('"'); - charsToDelete.add('#'); - } - - public static class NormalizedString { - public String string; - private Map offsetMap = new HashMap<>(); - - public Map getOffsetMap() { - return offsetMap; - } - - private TreeSet normalizedOffsetSet; - - public Integer getOriginalOffset(int normalizedOffset) { - Integer originalOffset = offsetMap.get(normalizedOffset); - if (originalOffset == null) { - originalOffset = deriveOriginalOffset(normalizedOffset); - offsetMap.put(normalizedOffset, originalOffset); - } - return originalOffset; - } - - private Integer deriveOriginalOffset(int normalizedOffset) { - if (normalizedOffsetSet == null) - normalizedOffsetSet = new TreeSet<>(offsetMap.keySet()); - Integer previousNormalizedOffset = normalizedOffsetSet.floor(normalizedOffset); - Integer originalPreviousOffset = offsetMap.get(previousNormalizedOffset); - int offsetShift = Math.abs(originalPreviousOffset - previousNormalizedOffset); - // Typically, the normalized string will be shorter than the - // original, thus the original offset would be larger. 
- if (originalPreviousOffset > previousNormalizedOffset) - return normalizedOffset + offsetShift; - // But if, for some reason, the normalized string is longer than the - // original, we would have to subtract the difference from the - // normalized offset. - return normalizedOffset - offsetShift; - } - } - - /** - * This method was meant for text normalization by just deleting punctuation - * characters. However, the approach turned out to be suboptimal in cases - * where a dictionary entry would be "SHP-1" and the text form would be "SHP - * 1". That is, when in the text there is just a whitespace where there is a - * punctuation character in the dictionary, we won't recognize the - * dictionary entry. Thus, a different normalization was developed, namely - * in the other normalization method. It is supposed to be used together - * with an approximate chunker. - * - * @param str - * @return - */ - public static NormalizedString normalizeString(String str) { - NormalizedString ns = new NormalizedString(); - StringBuilder sb = new StringBuilder(); - int deletedChars = 0; - - for (int i = 0; i < str.length(); i++) { - char c = str.charAt(i); - if (charsToDelete.contains(c)) { - deletedChars++; - // switch (mode) { - // case REPLACE: sb.append(" "); break; - // case DELETE: deletedChars++; break; - // } - } else { - sb.append(c); - } - int newOffset = Math.max(0, i - deletedChars); - if (null == ns.offsetMap.get(newOffset)) - ns.offsetMap.put(newOffset, i); - } - ns.string = sb.toString(); - return ns; - } - - /** - * This normalization method uses a given TokenizerFactory (could also be a - * PorterStemmerTokenizerFactory for stemming) and additionally removes - * possessive 's constructions. Dashes and other punctuation is left - * untouched. By using an approximate chunker, one can also handle - * punctuation. 
- * - * @param str - * @param tokenizerFactory - * @return - */ - public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory, - Transliterator transliterator) { - // boolean stemming = tokenizerFactory instanceof - // PorterStemmerTokenizerFactory; - - NormalizedString ns = new NormalizedString(); - - char[] strChars = str.toCharArray(); - Tokenizer tokenizer = tokenizerFactory.tokenizer(strChars, 0, strChars.length); - StringBuilder sb = new StringBuilder(); - ArrayDeque tokenS = new ArrayDeque<>(); - Map deleteCandidateOffsetMap = new HashMap<>(); - // According to the lingpipe API documentation, one starts with the next - // whitespace. - sb.append(tokenizer.nextWhitespace()); - ns.offsetMap.put(0, 0); - String token; - while ((token = tokenizer.nextToken()) != null) { - // Handle possessive 's (like Parkinson's). It will be deleted. In - // case we have accidentally deleted some - // tokens, those are stored in the stack and their offsets are - // stored, too. In case it was an error, the - // tokens are later added again in the "else" path. 
- if (token.equals("'")) { - int newStartOffset = sb.length() + sumOfStack(tokenS); - int newEndOffset = sb.length() + sumOfStack(tokenS) + token.length(); - deleteCandidateOffsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); - deleteCandidateOffsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); - tokenS.push(token + tokenizer.nextWhitespace()); - } else if (token.equals("s") && tokenS.size() == 1) { - int newStartOffset = sb.length() + sumOfStack(tokenS); - int newEndOffset = sb.length() + sumOfStack(tokenS) + token.length(); - deleteCandidateOffsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); - deleteCandidateOffsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); - tokenS.push(token); - String ws = tokenizer.nextWhitespace(); - if (ws.length() > 0) { - sb.append(ws); - tokenS.clear(); - deleteCandidateOffsetMap.clear(); - } - } else { - if (!tokenS.isEmpty()) { - for (String s : tokenS) { - sb.append(s); - } - tokenS.clear(); - ns.offsetMap.putAll(deleteCandidateOffsetMap); - deleteCandidateOffsetMap.clear(); - } - if (transliterator != null) - token = transliterator.transform(token); - // plural s, only when no stemming is done - // if (!stemming && token.endsWith("s")) - // token = token.substring(0, token.length() - 1); - sb.append(token); - int newStartOffset = sb.length() - token.length(); - int newEndOffset = sb.length(); - ns.offsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); - ns.offsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); - sb.append(tokenizer.nextWhitespace()); - } - } - ns.string = sb.toString(); - return ns; - } - - private static int sumOfStack(Deque stack) { - int sum = 0; - for (String i : stack) - sum += i.length(); - return sum; - } - - public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory) { - return normalizeString(str, tokenizerFactory, null); - } + + private static Set charsToDelete = new HashSet<>(); + + static { + 
charsToDelete.add('-'); + charsToDelete.add('+'); + charsToDelete.add(','); + charsToDelete.add('.'); + charsToDelete.add(':'); + charsToDelete.add(';'); + charsToDelete.add('?'); + charsToDelete.add('!'); + charsToDelete.add('*'); + charsToDelete.add('§'); + charsToDelete.add('$'); + charsToDelete.add('%'); + charsToDelete.add('&'); + charsToDelete.add('/'); + charsToDelete.add('\\'); + charsToDelete.add('('); + charsToDelete.add(')'); + charsToDelete.add('<'); + charsToDelete.add('>'); + charsToDelete.add('['); + charsToDelete.add(']'); + charsToDelete.add('='); + charsToDelete.add('\''); + charsToDelete.add('`'); + charsToDelete.add('´'); + charsToDelete.add('"'); + charsToDelete.add('#'); + + // this would normalize German umlauts like Hörsturz -> Hoersturz + // I leave it here for the future but don't add it right now because I don't want to make this Transliterator + // a static field due to Thread safety and also don't have time now to refactor this all +// String rules = "[\\u00E4{a\\u0308}] > ae; " + +// " [\\u00F6{o\\u0308}] > oe;" + +// " [\\u00FC{u\\u0308}] > ue;" + +// " {[\\u00C4{A\\u0308}]}[:Lowercase:] > Ae;" + +// " {[\\u00D6{O\\u0308}]}[:Lowercase:] > Oe;" + +// " {[\\u00DC{U\\u0308}]}[:Lowercase:] > Ue;" + +// " [\\u00C4{A\\u0308}] > AE;" + +// " [\\u00D6{O\\u0308}] > OE;" + +// " [\\u00DC{U\\u0308}] > UE;" + +// " [\\u20AC] > EUR;"; +// +// germanUmlautTransliterator = Transliterator.createFromRules("de_EUR-ASCII", rules, Transliterator.FORWARD); + } + + /** + * This method was meant for text normalization by just deleting punctuation + * characters. However, the approach turned out to be suboptimal in cases + * where a dictionary entry would be "SHP-1" and the text form would be "SHP + * 1". That is, when in the text there is just a whitespace where there is a + * punctuation character in the dictionary, we won't recognize the + * dictionary entry. Thus, a different normalization was developed, namely + * in the other normalization method. 
It is supposed to be used together + * with an approximate chunker. + * + * @param str + * @return + */ + public static NormalizedString normalizeString(String str) { + NormalizedString ns = new NormalizedString(); + StringBuilder sb = new StringBuilder(); + int deletedChars = 0; + + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if (charsToDelete.contains(c)) { + deletedChars++; + // switch (mode) { + // case REPLACE: sb.append(" "); break; + // case DELETE: deletedChars++; break; + // } + } else { + sb.append(c); + } + int newOffset = Math.max(0, i - deletedChars); + if (null == ns.offsetMap.get(newOffset)) + ns.offsetMap.put(newOffset, i); + } + ns.string = sb.toString(); + return ns; + } + + /** + * This normalization method uses a given TokenizerFactory (could also be a + * PorterStemmerTokenizerFactory for stemming) and additionally removes + * possessive 's constructions. Dashes and other punctuation is left + * untouched. By using an approximate chunker, one can also handle + * punctuation. + * + * @param str + * @param tokenizerFactory + * @return + */ + public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory, boolean normalizePlural, OffsetSet pluralPositions, + Transliterator transliterator) { + + + boolean stemming = tokenizerFactory instanceof + PorterStemmerTokenizerFactory; + + NormalizedString ns = new NormalizedString(); + + char[] strChars = str.toCharArray(); + Tokenizer tokenizer = tokenizerFactory.tokenizer(strChars, 0, strChars.length); + StringBuilder sb = new StringBuilder(); + ArrayDeque tokenS = new ArrayDeque<>(); + Map deleteCandidateOffsetMap = new HashMap<>(); + // According to the lingpipe API documentation, one starts with the next + // whitespace. + sb.append(tokenizer.nextWhitespace()); + ns.offsetMap.put(0, 0); + String token; + while ((token = tokenizer.nextToken()) != null) { + // Handle possessive 's (like Parkinson's). It will be deleted. 
In + // case we have accidentally deleted some + // tokens, those are stored in the stack and their offsets are + // stored, too. In case it was an error, the + // tokens are later added again in the "else" path. + if (token.equals("'")) { + int newStartOffset = sb.length() + sumOfStack(tokenS); + int newEndOffset = sb.length() + sumOfStack(tokenS) + token.length(); + deleteCandidateOffsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); + deleteCandidateOffsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); + tokenS.push(token + tokenizer.nextWhitespace()); + } else if (token.equals("s") && tokenS.size() == 1) { + int newStartOffset = sb.length() + sumOfStack(tokenS); + int newEndOffset = sb.length() + sumOfStack(tokenS) + token.length(); + deleteCandidateOffsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); + deleteCandidateOffsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); + tokenS.push(token); + String ws = tokenizer.nextWhitespace(); + if (ws.length() > 0) { + sb.append(ws); + tokenS.clear(); + deleteCandidateOffsetMap.clear(); + } + } else { + if (!tokenS.isEmpty()) { + for (String s : tokenS) { + sb.append(s); + } + tokenS.clear(); + ns.offsetMap.putAll(deleteCandidateOffsetMap); + deleteCandidateOffsetMap.clear(); + } + token = transliterator.transform(token); +// token = germanUmlautTransliterator.transliterate(token); + // plural s, only when no stemming is done + // an even better normalization would be to use the lemma, of course + Range tokenOffsets = Range.between(tokenizer.lastTokenStartPosition(), tokenizer.lastTokenEndPosition()); + try { + if (normalizePlural && !stemming && token.endsWith("s") && pluralPositions != null && !pluralPositions.isEmpty() && Optional.ofNullable(pluralPositions.locate(tokenOffsets)).orElse(Range.between(0, 0)).isOverlappedBy(tokenOffsets)) + token = token.substring(0, token.length() - 1); + } catch (Exception e) { + System.out.println("normalizePlural: " + normalizePlural); + 
System.out.println("stemming: " + stemming); + System.out.println("Token: " + token); + System.out.println("PluralPositions: " + pluralPositions); + System.out.println("TokenOffsets: " + tokenOffsets); + System.out.println("pluralPositions.locate(tokenOffsets): " + pluralPositions.locate(tokenOffsets)); + e.printStackTrace(); + } + sb.append(token); + int newStartOffset = sb.length() - token.length(); + int newEndOffset = sb.length(); + ns.offsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); + ns.offsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); + sb.append(tokenizer.nextWhitespace()); + } + } + ns.string = sb.toString(); + return ns; + } + + private static int sumOfStack(Deque stack) { + int sum = 0; + for (String i : stack) + sum += i.length(); + return sum; + } + + public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory, Transliterator transliterator) { + return normalizeString(str, tokenizerFactory, false, null, transliterator); + } + + public static NormalizedString normalizeString(String str, boolean normalizePlural, OffsetSet pluralPositions, TokenizerFactory tokenizerFactory) { + return normalizeString(str, tokenizerFactory, normalizePlural, pluralPositions, null); + } + + public enum Mode { + /** + * Punctuation characters are deleted completely, shrinking the string. + */ + DELETE, + /** + * Punctuation characters are replaced by white spaces. 
+ */ + REPLACE + } + + public static class NormalizedString { + public String string; + private Map offsetMap = new HashMap<>(); + private TreeSet normalizedOffsetSet; + + public Map getOffsetMap() { + return offsetMap; + } + + public Integer getOriginalOffset(int normalizedOffset) { + Integer originalOffset = offsetMap.get(normalizedOffset); + if (originalOffset == null) { + originalOffset = deriveOriginalOffset(normalizedOffset); + offsetMap.put(normalizedOffset, originalOffset); + } + return originalOffset; + } + + private Integer deriveOriginalOffset(int normalizedOffset) { + if (normalizedOffsetSet == null) + normalizedOffsetSet = new TreeSet<>(offsetMap.keySet()); + Integer previousNormalizedOffset = normalizedOffsetSet.floor(normalizedOffset); + Integer originalPreviousOffset = offsetMap.get(previousNormalizedOffset); + int offsetShift = Math.abs(originalPreviousOffset - previousNormalizedOffset); + // Typically, the normalized string will be shorter than the + // original, thus the original offset would be larger. + if (originalPreviousOffset > previousNormalizedOffset) + return normalizedOffset + offsetShift; + // But if, for some reason, the normalized string is longer than the + // original, we would have to subtract the difference from the + // normalized offset. 
+ return normalizedOffset - offsetShift; + } + } } diff --git a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml index 1f4e5a34e..6c8aad79c 100644 --- a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml +++ b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml @@ -16,7 +16,7 @@ embedded into the descriptor. The current parameter settings will work but may be changed. Refer to https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipegazetteer-ae for more information. - 2.5.1-SNAPSHOT + 2.6.0 julielab @@ -50,6 +50,9 @@ + + + @@ -105,6 +108,12 @@ false true + + NormalizePlural + Boolean + false + true + TransliterateText Boolean @@ -149,6 +158,12 @@ true + + NormalizePlural + + false + + TransliterateText diff --git a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae.xml b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae.xml index b168cefa2..4882f43cb 100644 --- a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae.xml +++ b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae.xml @@ -14,7 +14,7 @@ and some parameter settings for dictionary processing and tagging. Refer to https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipegazetteer-ae for more information. 
- 2.5.1-SNAPSHOT + 2.6.0 julielab diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java index fe1ac16a0..b186a6c2d 100644 --- a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java +++ b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java @@ -5,14 +5,19 @@ import com.aliasi.tokenizer.PorterStemmerTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import com.ibm.icu.text.Transliterator; +import de.julielab.java.utilities.spanutils.OffsetSet; import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking; import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking.NormalizedString; -import org.junit.Ignore; -import org.junit.Test; +import org.apache.commons.lang3.Range; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.*; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; public class StringNormalizerForChunkingTest { + + private Transliterator transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC"); @Test public void testTextNormalization() { String term; @@ -20,25 +25,25 @@ public void testTextNormalization() { term = "\"Call\" - postponed"; ns = StringNormalizerForChunking.normalizeString(term); - assertEquals("Term normalization was not correct", "Call postponed", ns.string); + assertEquals( "Call postponed", ns.string, "Term normalization was not correct"); term = "\"Light-for-dates\" with signs of fetal malnutrition, 1,000-1,249 grams"; ns = StringNormalizerForChunking.normalizeString(term); - assertEquals("Term normalization was not correct", - "Lightfordates with signs of fetal 
malnutrition 10001249 grams", ns.string); + assertEquals("Lightfordates with signs of fetal malnutrition 10001249 grams", + ns.string, "Term normalization was not correct"); term = "#Tarsal &/or metatarsal bones"; ns = StringNormalizerForChunking.normalizeString(term); - assertEquals("Term normalization was not correct", "Tarsal or metatarsal bones", ns.string); + assertEquals( "Tarsal or metatarsal bones", ns.string, "Term normalization was not correct"); term = "% it = jCas.getAnnotationIndex(EntityMention.type).iterator(); - assertTrue("There are no entity annotations in the CAS.", it.hasNext()); + assertTrue(it.hasNext(), "There are no entity annotations in the CAS."); EntityMention em = (EntityMention) it.next(); - assertEquals("Start wrong: ", new Integer(0), new Integer(em.getBegin())); - assertEquals("End wrong: ", new Integer(5), new Integer(em.getEnd())); - assertEquals("Wrong type: ", "SHP-1", em.getSpecificType()); + assertEquals( new Integer(0), new Integer(em.getBegin()), "Start wrong: "); + assertEquals( new Integer(5), new Integer(em.getEnd()), "End wrong: "); + assertEquals( "SHP-1", em.getSpecificType(), "Wrong type: "); - assertTrue("The secnond entity annotations is missing.", it.hasNext()); + assertTrue(it.hasNext(), "The secnond entity annotations is missing."); em = (EntityMention) it.next(); - assertEquals("Start wrong: ", new Integer(10), new Integer(em.getBegin())); - assertEquals("End wrong: ", new Integer(45), new Integer(em.getEnd())); - assertEquals("Wrong type: ", "KLRG2", em.getSpecificType()); + assertEquals( new Integer(10), new Integer(em.getBegin()), "Start wrong: "); + assertEquals( new Integer(45), new Integer(em.getEnd()), "End wrong: "); + assertEquals( "KLRG2", em.getSpecificType(), "Wrong type: "); - assertFalse("There are too many annotations.", it.hasNext()); + assertFalse(it.hasNext(), "There are too many annotations."); jCas.reset(); jCas.setDocumentText( @@ -312,13 +315,13 @@ public void 
testAnnotatorWithTextNormalization() annotator.process(jCas); it = jCas.getAnnotationIndex(EntityMention.type).iterator(); - assertTrue("There are no entity annotations in the CAS.", it.hasNext()); + assertTrue(it.hasNext(), "There are no entity annotations in the CAS."); em = (EntityMention) it.next(); - assertEquals("Start wrong: ", new Integer(17), new Integer(em.getBegin())); - assertEquals("End wrong: ", new Integer(103), new Integer(em.getEnd())); - assertEquals("Wrong type: ", "CHEM", em.getSpecificType()); + assertEquals( new Integer(17), new Integer(em.getBegin()), "Start wrong: "); + assertEquals( new Integer(103), new Integer(em.getEnd()), "End wrong: "); + assertEquals( "CHEM", em.getSpecificType(), "Wrong type: "); - assertFalse("There are too many annotations.", it.hasNext()); + assertFalse(it.hasNext(), "There are too many annotations."); jCas.reset(); jCas.setDocumentText( @@ -326,13 +329,13 @@ public void testAnnotatorWithTextNormalization() annotator.process(jCas); it = jCas.getAnnotationIndex(EntityMention.type).iterator(); - assertTrue("There are no entity annotations in the CAS.", it.hasNext()); + assertTrue(it.hasNext(), "There are no entity annotations in the CAS."); em = (EntityMention) it.next(); - assertEquals("Start wrong: ", new Integer(17), new Integer(em.getBegin())); - assertEquals("End wrong: ", new Integer(103), new Integer(em.getEnd())); - assertEquals("Wrong type: ", "CHEM", em.getSpecificType()); + assertEquals( new Integer(17), new Integer(em.getBegin()), "Start wrong: "); + assertEquals( new Integer(103), new Integer(em.getEnd()), "End wrong: "); + assertEquals( "CHEM", em.getSpecificType(), "Wrong type: "); - assertFalse("There are too many annotations.", it.hasNext()); + assertFalse(it.hasNext(), "There are too many annotations."); jCas.reset(); jCas.setDocumentText( @@ -340,7 +343,7 @@ public void testAnnotatorWithTextNormalization() annotator.process(jCas); it = jCas.getAnnotationIndex(EntityMention.type).iterator(); - 
assertFalse("There is an annotation in CAS although there shouldnt be.", it.hasNext()); + assertFalse(it.hasNext(), "There is an annotation in CAS although there shouldnt be."); jCas.reset(); jCas.setDocumentText("Test-dosing unit KLRg1 killer cell lectin like receptor G2 Parkinson's Disease"); @@ -352,8 +355,34 @@ public void testAnnotatorWithTextNormalization() System.out.println(it.next().getCoveredText()); counter++; } - assertEquals("Wrong entity count: ", new Integer(4), counter); + assertEquals( new Integer(4), counter, "Wrong entity count: "); + + } + + @Test + public void testAnnotatorWithPluralNormalization() + throws ResourceInitializationException, AnalysisEngineProcessException { + ExternalResourceDescription extDesc = ExternalResourceFactory.createExternalResourceDescription( + ChunkerProviderImplAlt.class, new File("src/test/resources/normalizepluralgazetteer.properties")); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory + .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); + + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, + GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", + GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); + JCas jCas = annotator.newJCas(); + + jCas.setDocumentText("High-density lipoprotein (HDL) is one of the five major groups of lipoproteins."); + PennBioIEPOSTag tag = new PennBioIEPOSTag(jCas, 74, 86); + tag.setValue("NNS"); + tag.addToIndexes(); + annotator.process(jCas); + Collection entityMentions = JCasUtil.select(jCas, EntityMention.class); + assertEquals( 2, entityMentions.size(), "Expected a single entity"); + Iterator iterator = entityMentions.iterator(); + assertEquals( "lipoprotein", iterator.next().getCoveredText(), "Unexpected covered entity text"); + assertEquals( "lipoproteins", iterator.next().getCoveredText(), "Unexpected covered entity text"); } @Test @@ -363,7 +392,7 @@ public void 
testAnnotateAcronymsWithFullFormEntity() throws Exception { TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); JCas jCas = annotator.newJCas(); @@ -388,7 +417,7 @@ public void testAnnotateAcronymsWithFullFormEntity() throws Exception { it.next(); counter++; } - assertEquals("Wrong entity count: ", new Integer(1), counter); + assertEquals( new Integer(1), counter, "Wrong entity count: "); jCas.reset(); jCas.setDocumentText( @@ -426,40 +455,19 @@ public void testAnnotateAcronymsWithFullFormEntity() throws Exception { } assertEquals("GENE", next.getSpecificType()); } - assertEquals("Wrong entity count: ", new Integer(1), counter); + assertEquals( Integer.valueOf(1), counter, "Wrong entity count: "); } - @Test - public void testAnnotatorWithTextNormalizationMuh() - throws ResourceInitializationException, AnalysisEngineProcessException { - ExternalResourceDescription extDesc = ExternalResourceFactory.createExternalResourceDescription( - ChunkerProviderImplAlt.class, new File("src/test/resources/normalizegazetteer.properties")); - TypeSystemDescription tsDesc = TypeSystemDescriptionFactory - .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, - GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", - GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); - JCas jCas = annotator.newJCas(); - - jCas.setDocumentText("We shall now describe our system setup followed by our proposed solution, which is a 
fully distributed and absolute localization solution specifically designed for both one-hop and multi-hop WSNs. Our considered WSN consists of Ns number of sensors randomly placed onto a map of predefined size with Nb number of beacons. Let 𝕊 and 𝔹 be the sets describing all sensors and beacons respectively, where each sensor is noted as Sensori, i ∈ 𝕊 and each beacon is noted as Beaconj, j ∈ 𝔹. Each node either a sensor or a beacon is noted as Nodep, p ∈ 𝕊 ∪ 𝔹, and vector V⃗p is used to represent the coordinate of Nodep. Beacons are placed onto the map with fixed coordinates V⃗j, where j ∈ 𝔹. We assume that each beacon is aware of its own absolute location. Whereas each sensor is unaware of its own location, and is configured with an initial guess of location unrelated to its actual deployed location. The two-dimensional (2-D) localization problem is the estimation of Ns unknown-location coordinates V⃗i, where i ∈ 𝕊.\n"); - annotator.process(jCas); - - FSIterator it = jCas.getAnnotationIndex(EntityMention.type).iterator(); -while (it.hasNext()) { - Annotation annotation = (Annotation) it.next(); - System.out.println(annotation.getCoveredText()); -} - } @Test - public void testSontesthalt() throws Exception { + public void testGeneRecognition() throws Exception { ExternalResourceDescription extDesc = ExternalResourceFactory.createExternalResourceDescription( ChunkerProviderImplAlt.class, new File("src/test/resources/normalizegazetteer.eg.testdict.properties")); TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); @@ -506,6 +514,38 @@ public void 
testSontesthalt() throws Exception { assertEquals("Yak1", it.next().getCoveredText()); } + @Test + public void testStopwords() throws Exception { + ExternalResourceDescription extDesc = ExternalResourceFactory.createExternalResourceDescription( + ChunkerProviderImplAlt.class, new File("src/test/resources/normalizegazetteer.eg.testdict.teststopwords.properties")); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory + .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); + + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, + GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", + GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); + + JCas jCas = annotator.newJCas(); + + // Warning: This text does not make sense ;-) + jCas.setDocumentText( + "Identification of cDNAs encoding two human alpha class microsomal glutathione and the heterologous expression of glutathione S-transferase alpha-4."); + + annotator.process(jCas); + + Set extractedGenes = new HashSet<>(); + for (var e : JCasUtil.select(jCas, EntityMention.class)) { + extractedGenes.add(e.getCoveredText()); + } + // The stop word list contains the term "glutathione" + // The current algorithm in GazetteerAnnotator#filterStopwords(String) computes the fraction that the + // stop word has on the whole entity and only rejects it if it exceeds some threshold. For this reason, + // the shorter mention is excluded while the longer is retained. + assertThat(extractedGenes).doesNotContain("microsomal glutathione"); + // The whole "glutathione S-transferase alpha-4" is on the stop word list. 
+ assertThat(extractedGenes).contains("glutathione S-transferase alpha-4"); + } + @Test public void testApproximate() throws Exception { ExternalResourceDescription extDesc = ExternalResourceFactory.createExternalResourceDescription( @@ -513,7 +553,7 @@ public void testApproximate() throws Exception { TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); @@ -571,9 +611,9 @@ public void testGroupOvecrlappingChunks() { assertEquals(1, bestChunkList.size()); Chunk bestChunk = bestChunkList.get(0); assertFalse( + bestChunks.contains(bestChunk), "Duplicate best chunk: " + bestChunk + " (\"" - + chunkedText.subSequence(bestChunk.start(), bestChunk.end()) + "\")", - bestChunks.contains(bestChunk)); + + chunkedText.subSequence(bestChunk.start(), bestChunk.end()) + "\")"); bestChunks.add(bestChunk); } } @@ -635,7 +675,7 @@ public void testReadCompressedDictionary() throws Exception { TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); @@ -654,4 +694,27 @@ public void testReadCompressedDictionary() throws Exception { assertEquals(1, counter); } + @Test + public void testOffsetIssueWhenNoTransliteration() throws Exception { + 
ExternalResourceDescription extDesc = ExternalResourceFactory.createExternalResourceDescription( + ConfigurableChunkerProviderImplAlt.class, "file:src/test/resources/pehc.dict", ConfigurableChunkerProviderImplAlt.PARAM_CASE_SENSITIVE, false, ConfigurableChunkerProviderImplAlt.PARAM_NORMALIZE_TEXT, true, ConfigurableChunkerProviderImplAlt.PARAM_TRANSLITERATE_TEXT, false, ConfigurableChunkerProviderImplAlt.PARAM_STOPWORD_FILE, "de/julielab/jcore/ae/lingpipegazetteer/stopwords/general_english_words", ConfigurableChunkerProviderImplAlt.PARAM_USE_APPROXIMATE_MATCHING, true, ConfigurableChunkerProviderImplAlt.PARAM_MAKE_VARIANTS, false); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory + .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); + + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, + GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", + GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); + + JCas jCas = annotator.newJCas(); + +// jCas.setDocumentText("Clinical Features and Course of Patients with Peripheral Exudative Hemorrhagic Chorioretinopathy.\nTo evaluate the clinical characteristics of patients who were followed in our clinic with the diagnosis of peripheral exudative hemorrhagic chorioretinopathy (PEHC).\nMedical records of 12 patients who were diagnosed with PEHC in İstanbul University İstanbul Faculty of Medicine, Department of Ophthalmology between July 2006 and June 2014 were reviewed retrospectively.\nThis study included 21 eyes of 12 patients. Four (33.3%) of the patients were male and 8 (66.7%) were female and ages ranged between 73 and 89 years. Eight (66.7%) of the patients were referred to us with the diagnosis of choroidal mass. Unilateral involvement was found in 3 and bilateral involvement in 9 patients. Temporal quadrants were involved in all eyes. 
Fifteen eyes (71.4%) had subretinal hemorrhage and hemorrhagic/serous retinal pigment epithelial detachment, 11 (52.4%) had lipid exudation, 5 (23.8%) had chronic retinal pigment epithelium alterations, 2 (9.5%) had subretinal fibrosis and 1 (4.8%) had vitreous hemorrhage. PEHC lesions were accompanied by drusen in 11 eyes (52.4%), geographic atrophy in 2 eyes (9.5%), and choroidal neovascularization scar in 2 eyes (9.5%)."); + jCas.setDocumentText("[...] diagnosed with PEHC in İstanbul University İstanbul Faculty of Medicine, Department of Ophthalmology [...].\n[...] PEHC lesions were accompanied by drusen [...]."); + annotator.process(jCas); + + List entityStrings = new ArrayList<>(); + for (EntityMention g : jCas.getAnnotationIndex(EntityMention.type)) { + entityStrings.add(g.getCoveredText()); + } + assertThat(entityStrings).containsExactly("PEHC", "İstanbul", "İstanbul", "PEHC", "lesions"); + } } diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/OverlappingChunkTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/OverlappingChunkTest.java index 078f62ecb..c700ff26f 100644 --- a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/OverlappingChunkTest.java +++ b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/OverlappingChunkTest.java @@ -3,12 +3,12 @@ import com.aliasi.chunk.Chunk; import com.aliasi.chunk.ChunkFactory; import de.julielab.jcore.ae.lingpipegazetteer.chunking.OverlappingChunk; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class OverlappingChunkTest { @Test diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/ApproxGazetteerAnnotatorTest.xml 
b/jcore-lingpipegazetteer-ae/src/test/resources/ApproxGazetteerAnnotatorTest.xml index bfd3827d0..3f778c495 100644 --- a/jcore-lingpipegazetteer-ae/src/test/resources/ApproxGazetteerAnnotatorTest.xml +++ b/jcore-lingpipegazetteer-ae/src/test/resources/ApproxGazetteerAnnotatorTest.xml @@ -6,7 +6,7 @@ GazetteerAnnotator - 2.5.1-SNAPSHOT + 2.6.0 julielab diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/ExactGazetteerAnnotatorTest.xml b/jcore-lingpipegazetteer-ae/src/test/resources/ExactGazetteerAnnotatorTest.xml index eeebe281b..c1d7f4f90 100644 --- a/jcore-lingpipegazetteer-ae/src/test/resources/ExactGazetteerAnnotatorTest.xml +++ b/jcore-lingpipegazetteer-ae/src/test/resources/ExactGazetteerAnnotatorTest.xml @@ -6,7 +6,7 @@ GazetteerAnnotator - 2.5.1-SNAPSHOT + 2.6.0 julielab diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict b/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict new file mode 100644 index 000000000..a59e0435f --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict @@ -0,0 +1 @@ +lipoprotein Group diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.eg.testdict.teststopwords.properties b/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.eg.testdict.teststopwords.properties new file mode 100644 index 000000000..3eda6cdbb --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.eg.testdict.teststopwords.properties @@ -0,0 +1,8 @@ +DictionaryFile=src/test/resources/eg.testdict +StopWordFile=src/test/resources/stopwords.test +NormalizeText=true +TransliterateText=true +UseApproximateMatching=true +MakeVariants=false +CaseSensitive=false + diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.properties b/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.properties index 88c7883d4..91ac661e7 100644 --- 
a/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.properties +++ b/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.properties @@ -1,5 +1,6 @@ DictionaryFile=src/test/resources/dictionary.tst -StopWordFile=src/test/resources/general_english_words +#StopWordFile=src/test/resources/general_english_words +StopWordFile=src/test/resources/reducedStopWordList.txt NormalizeText=true UseApproximateMatching=true MakeVariants=false diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties b/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties new file mode 100644 index 000000000..025fd2fa7 --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties @@ -0,0 +1,8 @@ +DictionaryFile=src/test/resources/normalizePlural.dict +StopWordFile=src/test/resources/reducedStopWordList.txt +NormalizeText=true +NormalizePlural=true +UseApproximateMatching=true +MakeVariants=false +CaseSensitive=false + diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/pehc.dict b/jcore-lingpipegazetteer-ae/src/test/resources/pehc.dict new file mode 100644 index 000000000..4e6b0f5ec --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/pehc.dict @@ -0,0 +1,3 @@ +PEHC Gene +lesions Gene +İstanbul Gene \ No newline at end of file diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/reducedStopWordList.txt b/jcore-lingpipegazetteer-ae/src/test/resources/reducedStopWordList.txt new file mode 100644 index 000000000..b0385b7e1 --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/reducedStopWordList.txt @@ -0,0 +1,320 @@ +about +above +across +after +afterwards +again +against +almost +alone +along +already +also +although +always +am +among +amoungst +amount +an +and +another +any +anyhow +anyone +anything +anywhere +are +around +as +at +back +be +became +because +become +becoming +been +before +beforehand +behind +being +below +beside 
+between +beyond +bill +both +bottom +but +by +call +can +cannot +co +computer +con +could +couldnt +cry +de +describe +detail +do +down +due +during +each +eg +eight +either +eleven +else +elsewhere +enough +etc +even +ever +every +everyone +everything +everywhere +except +fifteen +fify +fill +find +fire +first +five +for +former +formerly +found +four +from +front +full +further +get +give +go +had +has +have +he +hence +her +here +hereafter +hereby +herein +hereupon +hers +high +him +himself +his +how +however +hundred +i +ie +if +in +indeed +interest +into +is +it +its +itself +keep +last +latter +least +less +ltd +made +many +may +me +meanwhile +might +mill +more +moreover +most +mostly +move +much +must +my +myself +name +neither +never +nevertheless +next +nine +no +nobody +none +noone +not +nothing +now +nowhere +of +off +often +on +once +one +only +or +other +others +otherwise +our +ours +ourselves +out +over +own +per +perhaps +please +put +rather +re +same +see +seem +seemed +seems +serious +several +she +should +show +side +since +sincere +sixty +so +some +somehow +someone +something +sometime +sometimes +still +such +system +take +ten +than +that +the +their +them +then +thence +there +thereafter +thereby +therefore +therein +these +they +thick +thin +third +this +those +though +three +throughout +thru +thus +to +together +too +top +toward +towards +twenty +two +un +under +until +up +upon +us +very +via +was +we +were +what +whatever +when +whence +whenever +where +whereafter +whereas +wherein +whereupon +wherever +whether +which +while +whither +who +whole +whom +whose +why +will +with +within +without +would +yet +your +yours +yourself +yourselves +a +all +amongst +anyway +becomes +besides +cant +done +empty +few +forty +hasnt +herself +inc +latterly +mine +namely +noer +onto +part +seeming +six +somewhere +themselves +thereupon +through +twelve +well +whereby +whoever +you diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/stopwords.test 
b/jcore-lingpipegazetteer-ae/src/test/resources/stopwords.test new file mode 100644 index 000000000..b2da0a5ec --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/stopwords.test @@ -0,0 +1 @@ +glutathione diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/unused/bio_text.xmi b/jcore-lingpipegazetteer-ae/src/test/resources/unused/bio_text.xmi deleted file mode 100644 index 5e3993e5f..000000000 --- a/jcore-lingpipegazetteer-ae/src/test/resources/unused/bio_text.xmi +++ /dev/null @@ -1,3 +0,0 @@ - - diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/unused/tmp.txt b/jcore-lingpipegazetteer-ae/src/test/resources/unused/tmp.txt deleted file mode 100644 index 93e1214e3..000000000 --- a/jcore-lingpipegazetteer-ae/src/test/resources/unused/tmp.txt +++ /dev/null @@ -1,4878 +0,0 @@ -0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... 
-readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... 
-14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... 
-14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking 
abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor 
G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... 
-14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 
- end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - 
id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: 
- -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - 
sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: -0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... 
-375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no 
EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - 
specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 
score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 
end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 
[begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - 
confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - 
resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: 0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... 
-419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... 
-14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... 
-14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking 
abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor 
G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... 
-14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 40 
[main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no 
EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - 
specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 
score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 
end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 
[begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - 
confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - 
resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 
[main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerA0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... 
-375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no 
EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - 
specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 
score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 
end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 
[begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - 
confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - 
resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 
[main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2"0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 
annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - 
specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] 
DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 
start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 
chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] 
DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES 
+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: 0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 
annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - 
specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] 
DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 
start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 
chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] 
DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES 
+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 
-41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 
chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 
-14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - 
resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: -0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 
-41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 
chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 
-14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - 
resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: 
- - - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory 
motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - nnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 
-EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" 
- id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor 
tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - 
specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - 
resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - 
sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - diff --git a/jcore-lingscope-ae/component.meta b/jcore-lingscope-ae/component.meta index 3a5fc4991..c8ad54900 100644 --- a/jcore-lingscope-ae/component.meta +++ b/jcore-lingscope-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-lingscope-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Lingscope AE" } diff --git a/jcore-lingscope-ae/pom.xml b/jcore-lingscope-ae/pom.xml index 28836bd2b..e73d0386b 100644 --- a/jcore-lingscope-ae/pom.xml +++ b/jcore-lingscope-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -47,8 +47,8 @@ jcore-descriptor-creator - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab diff --git a/jcore-lingscope-ae/src/main/java/de/julielab/jcore/ae/lingscope/LingscopePosAnnotator.java b/jcore-lingscope-ae/src/main/java/de/julielab/jcore/ae/lingscope/LingscopePosAnnotator.java index ff34b56ad..aaaae3656 100644 --- 
a/jcore-lingscope-ae/src/main/java/de/julielab/jcore/ae/lingscope/LingscopePosAnnotator.java +++ b/jcore-lingscope-ae/src/main/java/de/julielab/jcore/ae/lingscope/LingscopePosAnnotator.java @@ -22,8 +22,8 @@ import java.io.File; import java.io.IOException; -import java.util.*; import java.util.List; +import java.util.*; import java.util.function.Supplier; import java.util.stream.Collectors; diff --git a/jcore-lingscope-ae/src/main/resources/de/julielab/jcore/ae/lingscope/desc/jcore-lingscope-ae.xml b/jcore-lingscope-ae/src/main/resources/de/julielab/jcore/ae/lingscope/desc/jcore-lingscope-ae.xml index 164a2ed7e..8442297cb 100644 --- a/jcore-lingscope-ae/src/main/resources/de/julielab/jcore/ae/lingscope/desc/jcore-lingscope-ae.xml +++ b/jcore-lingscope-ae/src/main/resources/de/julielab/jcore/ae/lingscope/desc/jcore-lingscope-ae.xml @@ -6,7 +6,7 @@ JCoRe Lingscope AE This component uses the Lingscope negation/hedge detection algorithm and models to annotate negation/hedge cues and the scope to which the cues apply. 
- 2.5.1-SNAPSHOT + 2.6.0 CueModel diff --git a/jcore-lingscope-ae/src/test/java/de/julielab/LingscopePosAnnotatorTest.java b/jcore-lingscope-ae/src/test/java/de/julielab/LingscopePosAnnotatorTest.java index 1e5d75496..7089675df 100644 --- a/jcore-lingscope-ae/src/test/java/de/julielab/LingscopePosAnnotatorTest.java +++ b/jcore-lingscope-ae/src/test/java/de/julielab/LingscopePosAnnotatorTest.java @@ -8,7 +8,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.stream.Collectors; diff --git a/jcore-linnaeus-species-ae/component.meta b/jcore-linnaeus-species-ae/component.meta index a4789114c..5484fb351 100644 --- a/jcore-linnaeus-species-ae/component.meta +++ b/jcore-linnaeus-species-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-linnaeus-species-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Linnaeus Species Tagger" } diff --git a/jcore-linnaeus-species-ae/pom.xml b/jcore-linnaeus-species-ae/pom.xml index 9e5c99785..78e432a3b 100644 --- a/jcore-linnaeus-species-ae/pom.xml +++ b/jcore-linnaeus-species-ae/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-linnaeus-species-ae JCoRe Linnaeus Species Tagger @@ -41,8 +41,8 @@ jcore-descriptor-creator - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusMatcherProvider.java b/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusMatcherProvider.java index bdccc500e..0bf56eb18 100644 --- a/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusMatcherProvider.java +++ b/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusMatcherProvider.java @@ -1,7 +1,5 @@ package de.julielab.jcore.ae.linnaeus; 
-import org.apache.uima.resource.DataResource; -import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.SharedResourceObject; import uk.ac.man.entitytagger.matching.Matcher; diff --git a/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotator.java b/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotator.java index cbab4f7e9..1853e3f50 100644 --- a/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotator.java +++ b/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotator.java @@ -35,7 +35,7 @@ * */ @ResourceMetaData(name="JCore LINNAEUS Species AE") -@TypeCapability(inputs = {"de.julielab.jcore.types.Organism", "de.julielab.jcore.types.ResourceEntry"}) +@TypeCapability(outputs = {"de.julielab.jcore.types.Organism", "de.julielab.jcore.types.ResourceEntry"}) public class LinnaeusSpeciesAnnotator extends JCasAnnotator_ImplBase { public static final String RES_KEY_LINNAEUS_MATCHER = "LinnaeusMatcher"; public static final String PARAM_CONFIG_FILE = "ConfigFile"; diff --git a/jcore-linnaeus-species-ae/src/main/resources/de/julielab/jcore/ae/linnaeus/desc/jcore-linnaeus-ae.xml b/jcore-linnaeus-species-ae/src/main/resources/de/julielab/jcore/ae/linnaeus/desc/jcore-linnaeus-ae.xml index e89d8d5f3..4668483c6 100644 --- a/jcore-linnaeus-species-ae/src/main/resources/de/julielab/jcore/ae/linnaeus/desc/jcore-linnaeus-ae.xml +++ b/jcore-linnaeus-species-ae/src/main/resources/de/julielab/jcore/ae/linnaeus/desc/jcore-linnaeus-ae.xml @@ -5,7 +5,7 @@ de.julielab.jcore.ae.linnaeus.LinnaeusSpeciesAnnotator JCore LINNAEUS Species AE - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-linnaeus-species-ae/src/test/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotatorTest.java b/jcore-linnaeus-species-ae/src/test/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotatorTest.java index 
58a46dec9..16bcd3e2c 100644 --- a/jcore-linnaeus-species-ae/src/test/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotatorTest.java +++ b/jcore-linnaeus-species-ae/src/test/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotatorTest.java @@ -20,11 +20,10 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ExternalResourceDescription; -import org.apache.uima.resource.metadata.ExternalResourceBinding; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class LinnaeusSpeciesAnnotatorTest { @Test diff --git a/jcore-mantra-xml-types/pom.xml b/jcore-mantra-xml-types/pom.xml index 4108f1f6a..d44972ddf 100644 --- a/jcore-mantra-xml-types/pom.xml +++ b/jcore-mantra-xml-types/pom.xml @@ -6,7 +6,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 JCoRe Mantra XML Types https://github.com/JULIELab/jcore-base/tree/master/jcore-mantra-xml-types diff --git a/jcore-medxn-ae/component.meta b/jcore-medxn-ae/component.meta index d10bc8ded..1abfe0206 100644 --- a/jcore-medxn-ae/component.meta +++ b/jcore-medxn-ae/component.meta @@ -22,7 +22,7 @@ "maven-artifact": { "artifactId": "jcore-medxn-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe MedXN" } diff --git a/jcore-medxn-ae/pom.xml b/jcore-medxn-ae/pom.xml index 94a1d35ee..28ff0577c 100644 --- a/jcore-medxn-ae/pom.xml +++ b/jcore-medxn-ae/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-medxn-ae JCoRe MedXN @@ -25,8 +25,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/MedNormAE.xml b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/MedNormAE.xml 
index e92306340..c96350f68 100644 --- a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/MedNormAE.xml +++ b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/MedNormAE.xml @@ -6,7 +6,7 @@ de.julielab.jcore.medxn.ae.desc.MedNormAE make a normalized medication description based on RxNorm standard - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-attributes-german.xml b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-attributes-german.xml index 94393ddbf..54ca2cf90 100644 --- a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-attributes-german.xml +++ b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-attributes-german.xml @@ -6,7 +6,7 @@ de.julielab.jcore.medxn.ae.desc.MedAttrAE medication attribute tagger using regEx - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-extractor-german.xml b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-extractor-german.xml index afdec1ce4..282491298 100644 --- a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-extractor-german.xml +++ b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-extractor-german.xml @@ -6,7 +6,7 @@ de.julielab.jcore.medxn.ae.desc.jcore-medxn-ae-extractor-german Associate medication and the corresponding attributes - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-medxn-ae/src/test/java/de/julielab/jcore/ae/medxn/MedAttrAnnotatorTest.java b/jcore-medxn-ae/src/test/java/de/julielab/jcore/ae/medxn/MedAttrAnnotatorTest.java index 110de0875..4f4e08302 100644 --- a/jcore-medxn-ae/src/test/java/de/julielab/jcore/ae/medxn/MedAttrAnnotatorTest.java +++ b/jcore-medxn-ae/src/test/java/de/julielab/jcore/ae/medxn/MedAttrAnnotatorTest.java @@ -21,16 +21,17 @@ import 
org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JFSIndexRepository; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import static org.junit.jupiter.api.Assertions.assertTrue; + public class MedAttrAnnotatorTest { private static final String AE_DESCRIPTOR = "de.julielab.jcore.ae.medxn.desc.jcore-medxn-ae-attributes-german"; @@ -66,11 +67,11 @@ private void check(String[] goldlines, JCas tcas) { Boolean lengthEqual = (goldlines.length == menCount); - Assert.assertTrue("Expression count differs; should be '" + - Integer.toString(goldlines.length) + "' but is '" + menCount.toString() +"'.", - lengthEqual); + assertTrue(lengthEqual, + "Expression count differs; should be '" + + goldlines.length + "' but is '" + menCount.toString() +"'."); Boolean arrayEqual = (goldlines.equals(actLines.toArray(new String[actLines.size()]))); - Assert.assertTrue("Expressions differ", arrayEqual); + assertTrue(arrayEqual, "Expressions differ"); } private void reset() { @@ -78,7 +79,7 @@ private void reset() { } - @Before + @BeforeEach public void initializeComponents() throws IOException, UIMAException { if (setUpIsDone) { return; @@ -90,7 +91,7 @@ public void initializeComponents() throws IOException, UIMAException { setUpIsDone = true; } - @Ignore + @Disabled @Test public void testDuration() { String text; @@ -113,7 +114,7 @@ public void testDuration() { } } - @Ignore + @Disabled @Test public void testDose() { String text; @@ -136,7 +137,7 @@ public void testDose() { } } - @Ignore + @Disabled @Test public void testFrequency() { String text; @@ -159,7 +160,7 @@ public void testFrequency() { } } - @Ignore + @Disabled @Test public void testModus() { String text; diff 
--git a/jcore-mmax2-reader/LICENSE b/jcore-mmax2-reader/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-mmax2-reader/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-mmax2-reader/README.md b/jcore-mmax2-reader/README.md new file mode 100644 index 000000000..2cacbc00a --- /dev/null +++ b/jcore-mmax2-reader/README.md @@ -0,0 +1,34 @@ +# JCoRe Component Skeleton +`Text that describes the component in brevity...` + +**Descriptor Path**: +``` +de.julielab.jcore.{reader, ae, consumer}.NAME.desc.ARTIFACT-NAME +``` + +`More thorough description` +`Are there any requirements or dependencies for this component?` + +**1. 
Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? diff --git a/jcore-mmax2-reader/component.meta b/jcore-mmax2-reader/component.meta new file mode 100644 index 000000000..e1e8462db --- /dev/null +++ b/jcore-mmax2-reader/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "reader" + ], + "description": "Collection reader for MMAX2 annotation projects.", + "descriptors": [ + { + "category": "reader", + "location": "de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-mmax2-reader", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe MMAX2 reader." 
+} diff --git a/jcore-mmax2-reader/pom.xml b/jcore-mmax2-reader/pom.xml new file mode 100644 index 000000000..812127c84 --- /dev/null +++ b/jcore-mmax2-reader/pom.xml @@ -0,0 +1,73 @@ + + + + 4.0.0 + jcore-mmax2-reader + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0 + + + + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-descriptor-creator + + + de.julielab + jcore-types + ${jcore-types-version} + + + de.julielab + julielab-mmax-to-iob-iexml-converter + 1.0.2 + + + org.apache.commons + commons-lang3 + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + org.assertj + assertj-core + + + org.junit.jupiter + junit-jupiter-engine + + + JCoRe MMAX2 reader. + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-mmax2-reader + Collection reader for MMAX2 annotation projects. + + + BSD 2-Clause + https://opensource.org/licenses/BSD-2-Clause + + + diff --git a/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java new file mode 100644 index 000000000..400c752f5 --- /dev/null +++ b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java @@ -0,0 +1,369 @@ +package de.julielab.jcore.cr.mmax2; + +import de.julielab.jcore.types.*; +import de.julielab.jcore.utility.JCoReAnnotationTools; +import de.julielab.jules.mmax.MarkableContainer; +import de.julielab.jules.mmax.Statistics; +import de.julielab.jules.mmax.WordInformation; +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import 
org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.eml.MMAX2.annotation.markables.Markable; +import org.eml.MMAX2.discourse.MMAX2Discourse; +import org.eml.MMAX2.discourse.MMAX2DiscourseElement; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.util.List; +import java.util.*; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +@ResourceMetaData(name = "JCoRe MMAX2 reader", description = "Collection reader for MMAX2 annotation projects.", vendor = "JULIE Lab Jena, Germany") +public class MMAX2Reader extends JCasCollectionReader_ImplBase { + + public static final String PARAM_INPUT_DIR = "InputDir"; + public static final String PARAM_ANNOTATION_LEVELS = "AnnotationLevels"; + public static final String PARAM_ORIGINAL_TEXT_FILES = "OriginalTextFiles"; + public static final String PARAM_UIMA_ANNOTATION_TYPES = "UimaAnnotationTypes"; + public static final String PARAM_REMOVE_OVERLAPPING_SHORTER_ANNOTATIONS = "RemoveOverlappingShorterAnnotations"; + private final static Logger log = LoggerFactory.getLogger(MMAX2Reader.class); + @ConfigurationParameter(name = PARAM_REMOVE_OVERLAPPING_SHORTER_ANNOTATIONS, mandatory = false, defaultValue = "false", description = "If set to true, for all overlapping annotations only the longest is kept.") + boolean removeOverlappingShorterAnnotations; + @ConfigurationParameter(name = PARAM_INPUT_DIR, description = "Should point to the directory of which the MMAX2 projects are sub directories of.") + private String inputDir; + @ConfigurationParameter(name = PARAM_ANNOTATION_LEVELS, description = "The names of the MMAX2 annotation levels to create annotations for.") + private String[] annotationLevels; + @ConfigurationParameter(name = PARAM_UIMA_ANNOTATION_TYPES, description = "The fully qualified names of the UIMA annotation types to be used 
for the representation of the input annotation level. Must match the indices of " + PARAM_ANNOTATION_LEVELS + ", i.e. the ith level will be added to the CAS as the ith type.") + private String[] uimaTypeNames; + @ConfigurationParameter(name = PARAM_ORIGINAL_TEXT_FILES, mandatory = false, description = "The MMAX2 base data consists of tokenized text and does not keep track of the original text. This parameter should point to a directory containing the original text files. The file names should match the MMAX2 project IDs.") + private String originalTextFilesDir; + private LinkedList folderList; + private String actualPath; + private HashMap levels2uimaNames; + private List> uimaAnnotationClasses; + private int numDocuments; + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. + */ + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + super.initialize(context); + inputDir = (String) context.getConfigParameterValue(PARAM_INPUT_DIR); + annotationLevels = (String[]) context.getConfigParameterValue(PARAM_ANNOTATION_LEVELS); + uimaTypeNames = (String[]) getUimaContext().getConfigParameterValue(PARAM_UIMA_ANNOTATION_TYPES); + originalTextFilesDir = (String) context.getConfigParameterValue(PARAM_ORIGINAL_TEXT_FILES); + removeOverlappingShorterAnnotations = Optional.ofNullable((Boolean) context.getConfigParameterValue(PARAM_REMOVE_OVERLAPPING_SHORTER_ANNOTATIONS)).orElse(false); + actualPath = null; + if (annotationLevels.length != uimaTypeNames.length) + throw new IllegalArgumentException("The number of annotation levels and the number of UIMA type names must match. 
But the given annotation levels are '" + Arrays.toString(annotationLevels) + "' and the UIMA types names are '" + Arrays.toString(uimaTypeNames) + "'."); + try { + uimaAnnotationClasses = Arrays.stream(uimaTypeNames).map(name -> { + try { + return Class.forName(name); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toList()); + } catch (Exception e) { + log.error("Could not initialize UIMA annotation classes from parameter values {}", Arrays.toString(uimaTypeNames)); + throw new ResourceInitializationException(e); + } + levels2uimaNames = IntStream.range(0, annotationLevels.length).collect(HashMap::new, (m, i) -> m.put(annotationLevels[i], uimaTypeNames[i]), (m1, m2) -> m1.putAll(m2)); + setUpFolderList(); + } + + private void setUpFolderList() throws ResourceInitializationException { + folderList = new LinkedList<>(); + if (!inputDir.endsWith(File.separator)) + this.inputDir += File.separator; + + File rootX = new File(inputDir); + + if (!rootX.exists()) { + File dir1 = new File("."); + try { + rootX = new File(dir1.getCanonicalPath() + inputDir); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + if (!rootX.exists()) { + log.error("{} does not exist", inputDir); + throw new ResourceInitializationException(new IllegalArgumentException(inputDir + " does not exist")); + } + } + + for (String rootFolder : rootX.list()) { + if (!rootFolder.endsWith(File.separator)) + rootFolder += File.separator; + File root = new File(inputDir + rootFolder); + if (root.isDirectory()) { + this.folderList.add(root); + } + } + numDocuments = folderList.size(); + } + + private String getPMID() throws CollectionException { + try { + FileInputStream fstream = new FileInputStream(this.actualPath + "Basedata.uri"); + // Get the object of DataInputStream + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; + // Read File Line By 
Line + int count = 0; + String pmid = ""; + while ((strLine = br.readLine()) != null) { + count++; + pmid = strLine; + } + if (count > 1) { + log.error("unknown data in {}Basedata.uri", actualPath); + System.exit(1); + return null; + } + return pmid; + } catch (IOException e) { + log.error("Error while parsing {}Basedata.uri", actualPath); + throw new CollectionException(e); + } + } + + /** + * This method is called for each document going through the component. This + * is where the actual work happens. + */ + @Override + public void getNext(JCas jCas) throws CollectionException { + Statistics.projects++; + actualPath = this.folderList.poll().getAbsolutePath() + "/"; + // rename style file from default_style.xsl to generic_nongui_style.xsl + // (necessary for api use) + File style = new File(actualPath + "Styles/default_style.xsl"); + style.renameTo(new File(actualPath + "Styles/generic_nongui_style.xsl")); + + File mmaxfile = new File(actualPath + "project.mmax"); + MMAX2Discourse discourse = MMAX2Discourse.buildDiscourse(mmaxfile.getAbsolutePath()); + + // text from basedata with spaces between all words + String documentText = discourse.getNextDocumentChunk(); + + WordInformation[] words = new WordInformation[discourse.getDiscourseElementCount()]; + + int textPosition = 0; + // Words from basedata + for (MMAX2DiscourseElement elem : discourse.getDiscourseElements()) { + WordInformation word = new WordInformation(); + word.setId(elem.getID()); + int discoursePosition = elem.getDiscoursePosition(); + word.setPosition(discoursePosition); + StringBuilder textBuilder = new StringBuilder(); + int end = discourse.getDisplayEndPositionFromDiscoursePosition(discoursePosition); + for (textPosition = discourse.getDisplayStartPositionFromDiscoursePosition(discoursePosition); textPosition <= end; textPosition++) { + textBuilder.append(documentText.charAt(textPosition)); + } + word.setText(textBuilder.toString()); + words[discoursePosition] = word; + } + + 
this.produceOutput(discourse, words, jCas); + + // set stylefile back to normal + style = new File(actualPath + "Styles/generic_nongui_style.xsl"); + style.renameTo(new File(actualPath + "Styles/default_style.xsl")); + + Statistics.projects++; + } + + private void produceOutput(MMAX2Discourse discourse, WordInformation[] words, JCas jCas) throws CollectionException { + StringBuilder out = new StringBuilder(); + StringBuilder outPlain = new StringBuilder(); + String pmid = this.getPMID(); + if (originalTextFilesDir != null && this.originalTextFilesDir.length() > 0) + this.handleOriginalTextInformation(pmid, words); + + Map pos2offsets = new HashMap<>(); + + for (int i = 0; i < words.length; i++) { + WordInformation word = words[i]; + + Token token = new Token(jCas, outPlain.length(), outPlain.length() + word.getText().length()); + token.setComponentId(getClass().getCanonicalName()); + token.addToIndexes(); + pos2offsets.put(word.getPosition(), token); + + outPlain.append(word.getText()); + if (word.isFollowedBySpace()) { + out.append(" "); + outPlain.append(" "); + } + } + Set ignoredMarkables = getIgnoredMarkables(discourse); + for (int i = 0; i < annotationLevels.length; ++i) { + Iterator iterator = discourse.getMarkableLevelByName(annotationLevels[i], false).getMarkables().stream().map(Markable.class::cast).filter(Predicate.not(ignoredMarkables::contains)).filter(Predicate.not(Markable::isDiscontinuous)).iterator(); + int id = 0; + while (iterator.hasNext()) { + Markable markable = iterator.next(); + int beginPosition = markable.getLeftmostDiscoursePosition(); + int endPosition = markable.getRightmostDiscoursePosition(); + int beginOffset = pos2offsets.get(beginPosition).getBegin(); + int endOffset = pos2offsets.get(endPosition).getEnd(); + Annotation a; + try { + a = JCoReAnnotationTools.getAnnotationByClassName(jCas, uimaTypeNames[i]); + } catch (Exception e) { + throw new CollectionException(e); + } + a.setBegin(beginOffset); + a.setEnd(endOffset); + if (a 
instanceof ConceptMention) + ((ConceptMention) a).setSpecificType(markable.getAttributeValue(markable.getMarkableLevelName())); + else if (a instanceof Sentence) + ((Sentence) a).setId(String.valueOf(id)); + a.addToIndexes(); + ++id; + } + } + for (WordInformation word : words) { + for (MarkableContainer mc : word.getMarkables()) { + int beginPosition = mc.getBegin(); + if (beginPosition == word.getPosition()) { + int endPosition = mc.getEnd(); + int beginOffset = pos2offsets.get(beginPosition).getBegin(); + int endOffset = pos2offsets.get(endPosition).getEnd(); + Gene gene = new Gene(jCas, beginOffset, endOffset); + gene.addToIndexes(); + } + } + } + String textPlain = outPlain.toString(); + jCas.setDocumentText(textPlain); + + Header h = new Header(jCas); + h.setDocId(pmid); + h.addToIndexes(); + } + + private Set getIgnoredMarkables(MMAX2Discourse discourse) { + if (!removeOverlappingShorterAnnotations) + return Collections.emptySet(); + Set toIgnore = new HashSet<>(); + for (int i = 0; i < annotationLevels.length; ++i) { + Map> markablesByPos = new HashMap<>(); + Iterator iterator = discourse.getMarkableLevelByName(annotationLevels[i], false).getMarkables().stream().map(Markable.class::cast).filter(Predicate.not(Markable::isDiscontinuous)).iterator(); + while (iterator.hasNext()) { + Markable markable = iterator.next(); + // associate the markable with all the word indices it covers + IntStream.rangeClosed(markable.getLeftmostDiscoursePosition(), markable.getRightmostDiscoursePosition()).forEach(j -> markablesByPos.compute(j, (k, v) -> v != null ? 
v : new HashSet<>()).add(markable)); + } + // now, for each word index, keep only the longest markable + for (Integer pos : markablesByPos.keySet()) { + Set markables = markablesByPos.get(pos); + if (markables.size() > 1) { + int maxSize = 0; + Markable longestMarkable = null; + for (Markable markable : markables) { + // first, we just add all markables to ignore + toIgnore.add(markable); + int markableLength = markable.getRightmostDiscoursePosition() - markable.getLeftmostDiscoursePosition() + 1; + if (markableLength > maxSize) { + maxSize = markableLength; + longestMarkable = markable; + } + } + // now remove only the longest markable - that we want to keep - from the set of ignores markables + toIgnore.remove(longestMarkable); + } + } + } + return toIgnore; + } + + private void handleOriginalTextInformation(String pmid, WordInformation[] words) throws CollectionException { + if (originalTextFilesDir.length() > 0 && !originalTextFilesDir.endsWith("/")) + originalTextFilesDir += File.separator; + + File file = new File(originalTextFilesDir + pmid); + if (!file.exists()) { + log.warn("no original File found for {} using only mmax text.", pmid); + return; + } + try { + FileInputStream fis = new FileInputStream(file); + InputStreamReader isr = new InputStreamReader(fis); + int wordCounter = 0; + int i; + try { + WordInformation actualWord = words[wordCounter]; + String actualText = actualWord.getText(); + actualWord.setFollowedBySpace(false); + int wordCharCounter = 0; + while ((i = isr.read()) >= 0) { + if (wordCharCounter >= actualText.length()) { + wordCounter++; + if (wordCounter < words.length) { + actualWord = words[wordCounter]; + actualText = actualWord.getText(); + actualWord.setFollowedBySpace(false); + wordCharCounter = 0; + } else { + if (!Character.isWhitespace(i)) { + log.warn("original Text contains more words than mmax information"); + } + return; + } + } + + if (actualText.charAt(wordCharCounter) == i || 
Character.toLowerCase(actualText.charAt(wordCharCounter)) == Character.toLowerCase(i)) { + wordCharCounter++; + } else { + if (!Character.isWhitespace(i)) { + log.warn("there is a non whitespace character different in original text at document {} critical character is '{}' near word '{}' (MMAX2 word ID {})", pmid, i, actualText, actualWord.getId()); + } else { + words[wordCounter - 1].setFollowedBySpace(true); + } + } + } + isr.close(); + } catch (IOException e) { + log.error("Error attempting to read original text file ", e); + throw new CollectionException(e); + } + } catch (Exception e) { + log.error("Error attempting to read original text file", e); + if (e instanceof CollectionException) + throw (CollectionException) e; + throw new CollectionException(e); + } + } + + @Override + public void close() { + // nothing to do + } + + @Override + public Progress[] getProgress() { + return new Progress[]{new ProgressImpl(numDocuments - folderList.size(), numDocuments, "document")}; + } + + @Override + public boolean hasNext() { + return !this.folderList.isEmpty(); + } + + +} diff --git a/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml b/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml new file mode 100644 index 000000000..b25622530 --- /dev/null +++ b/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml @@ -0,0 +1,69 @@ + + + org.apache.uima.java + de.julielab.jcore.cr.mmax2.MMAX2Reader + + JCoRe MMAX2 reader + Collection reader for MMAX2 annotation projects. + 2.6.0 + JULIE Lab Jena, Germany + + + RemoveOverlappingShorterAnnotations + If set to true, for all overlapping annotations only the longest is kept. + Boolean + false + false + + + InputDir + Should point to the directory of which the MMAX2 projects are sub directories of. 
+ String + false + true + + + AnnotationLevels + The names of the MMAX2 annotation levels to create annotations for. + String + true + true + + + UimaAnnotationTypes + The fully qualified names of the UIMA annotation types to be used for the representation of the input annotation level. Must match the indices of AnnotationLevels, i.e. the ith level will be added to the CAS as the ith type. + String + true + true + + + OriginalTextFiles + The MMAX2 base data consists of tokenized text and does not keep track of the original text. This parameter should point to a directory containing the original text files. The file names should match the MMAX2 project IDs. + String + false + false + + + + + RemoveOverlappingShorterAnnotations + + false + + + + + + + + + + + + + true + false + true + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java new file mode 100644 index 000000000..79b9bfb11 --- /dev/null +++ b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java @@ -0,0 +1,115 @@ +package de.julielab.jcore.cr.mmax2; + +import de.julielab.jcore.types.Header; +import de.julielab.jcore.types.Protein; +import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.Token; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.Collection; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Unit tests for jcore-mmax2-reader. 
+ * + * @author + */ +public class MMAX2ReaderTest { + + @Test + public void testReader() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", + MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input").toString(), + MMAX2Reader.PARAM_ANNOTATION_LEVELS, new String[]{"proteins", "sentence"}, + MMAX2Reader.PARAM_UIMA_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Protein", "de.julielab.jcore.types.Sentence"}); + assertThat(reader.hasNext()).isTrue(); + reader.getNext(jCas.getCas()); + + Header h = JCasUtil.selectSingle(jCas, Header.class); + assertThat(h.getDocId()).isEqualTo("10048764"); + + // the text should be tokenized because we did not provide the original text + assertThat(jCas.getDocumentText()).startsWith("Characterization of antihuman IFNAR-1 monoclonal antibodies : epitope localization and functional analysis ."); + Collection proteins = JCasUtil.select(jCas, Protein.class); + assertThat(proteins).hasSize(16); + + assertThat(proteins).map(Protein::getCoveredText).contains("IFNAR-1", "type I interferon receptor", "HuIFNAR-1", "Stat"); + Collection sentences = JCasUtil.select(jCas, Sentence.class); + assertThat(sentences).hasSize(10); + assertThat(sentences).extracting(Sentence::getId).containsExactlyInAnyOrder("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); + + assertThat(proteins).extracting(Protein::getSpecificType).filteredOn(type -> type.equals("protein")).hasSize(13); + assertThat(proteins).extracting(Protein::getSpecificType).filteredOn(type -> type.equals("protein_complex")).hasSize(2); + assertThat(proteins).extracting(Protein::getSpecificType).filteredOn(type -> type.equals("protein_familiy_or_group")).hasSize(1); + + Collection tokens = 
JCasUtil.select(jCas, Token.class); + // check a small sample of tokens that should have been created + assertThat(tokens).map(Token::getCoveredText).contains("Characterization", "IFNAR-1", ":", "(", "subunits", "recognition", ".", "HuIFNAR-1"); + } + + @Test + public void testReaderOriginalText() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", + MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input").toString(), + MMAX2Reader.PARAM_ORIGINAL_TEXT_FILES, Path.of("src", "test", "resources", "originalText").toString(), + MMAX2Reader.PARAM_ANNOTATION_LEVELS, new String[]{"proteins"}, + MMAX2Reader.PARAM_UIMA_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Protein"}); + assertThat(reader.hasNext()).isTrue(); + reader.getNext(jCas.getCas()); + // in this test, the text should not appear tokenized but arranged according to the original text + assertThat(jCas.getDocumentText()).startsWith("Characterization of antihuman IFNAR-1 monoclonal antibodies: epitope localization and functional analysis."); + Collection proteins = JCasUtil.select(jCas, Protein.class); + assertThat(proteins).hasSize(16); + assertThat(proteins).map(Protein::getCoveredText).contains("IFNAR-1", "type I interferon receptor", "HuIFNAR-1", "Stat"); + Collection tokens = JCasUtil.select(jCas, Token.class); + // check a small sample of tokens that should have been created + assertThat(tokens).map(Token::getCoveredText).contains("Characterization", "IFNAR-1", ":", "(", "subunits", "recognition", ".", "HuIFNAR-1"); + } + + @Test + public void testReader2() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", 
"de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", + MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input2").toString(), + MMAX2Reader.PARAM_ANNOTATION_LEVELS, new String[]{"proteins", "sentence"}, + MMAX2Reader.PARAM_UIMA_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Protein", "de.julielab.jcore.types.Sentence"}); + assertThat(reader.hasNext()).isTrue(); + reader.getNext(jCas.getCas()); + + Header h = JCasUtil.selectSingle(jCas, Header.class); + assertThat(h.getDocId()).isEqualTo("14731280"); + + Collection proteins = JCasUtil.select(jCas, Protein.class); + // there is this one protein seemingly annotated double; while this is more of an error than the real case + // to handle, it was responsible for errors and works for a simple test + long overlappingProteinCount = proteins.stream().filter(p -> p.getBegin() == 95 && p.getEnd() == 99).count(); + assertThat(overlappingProteinCount).isEqualTo(2); + + // now activate the parameter to avoid overlapping annotations + jCas.reset(); + reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", + MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input2").toString(), + MMAX2Reader.PARAM_ANNOTATION_LEVELS, new String[]{"proteins", "sentence"}, + MMAX2Reader.PARAM_UIMA_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Protein", "de.julielab.jcore.types.Sentence"}, + MMAX2Reader.PARAM_REMOVE_OVERLAPPING_SHORTER_ANNOTATIONS, true); + assertThat(reader.hasNext()).isTrue(); + reader.getNext(jCas.getCas()); + + + proteins = JCasUtil.select(jCas, Protein.class); + // there shouldn't be an overlap any more + overlappingProteinCount = proteins.stream().filter(p -> p.getBegin() == 95 && p.getEnd() == 99).count(); + assertThat(overlappingProteinCount).isEqualTo(1); + } 
+ + +} diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata.uri b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata.uri new file mode 100644 index 000000000..134fd8e79 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata.uri @@ -0,0 +1 @@ +10048764 diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/Basedata.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/Basedata.xml new file mode 100644 index 000000000..cd5e3c8a3 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/Basedata.xml @@ -0,0 +1,240 @@ + + + +Characterization +of +antihuman +IFNAR-1 +monoclonal +antibodies +: +epitope +localization +and +functional +analysis +. +The +type +I +interferon +receptor +( +IFNAR +) +is +composed +of +two +subunits +, +IFNAR-1 +and +IFNAR-2 +, +encoding +transmembrane +polypeptides +. +IFNAR-2 +has +a +dominant +role +in +ligand +binding +, +but +IFNAR-1 +contributes +to +binding +affinity +and +to +differential +ligand +recognition +. +A +panel +of +five +monoclonal +antibodies +( +mAb +) +to +human +IFNAR-1 +( +HuIFNAR-1 +) +was +produced +and +characterized +. +The +reactivity +of +each +mAb +toward +HuIFNAR-1 +on +native +and +transfected +cells +and +in +Western +blot +and +ELISA +formats +was +determined +. +In +functional +assays +, +one +mAb +, +EA12 +, +blocked +IFN-a2 +binding +to +human +cells +and +interfered +with +Stat +activation +and +antiviral +activity +. +Epitopes +for +the +mAb +were +localized +to +subdomains +of +the +HuIFNAR-1 +extracellular +domain +by +differential +reactivity +of +the +mAb +to +a +series +of +human +/ +bovine +IFNAR-1 +chimeras +. +The +antibody +EA12 +seems +to +require +native +HuIFNAR-1 +for +reactivity +and +does +not +map +to +a +single +subdomain +, +perhaps +recognizing +an +epitope +containing +noncontiguous +sequences +in +at +least +two +subdomains +. 
+In +contrast +, +the +epitopes +of +the +non +- +neutralizing +mAb +FB2 +, +AA3 +, +and +GB8 +mapped +, +respectively +, +to +the +first +, +second +, +and +third +subdomains +of +HuIFNAR-1 +. +The +mAb +DB2 +primarily +maps +to +the +fourth +subdomain +, +although +its +reactivity +may +be +affected +by +other +determinants +. + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/words.dtd b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/words.dtd new file mode 100644 index 000000000..a02b470f1 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/words.dtd @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/proteins.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/proteins.xml new file mode 100644 index 000000000..0f4bd71f8 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/proteins.xml @@ -0,0 +1,72 @@ + + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/sentence.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/sentence.xml new file mode 100644 index 000000000..6fbf9d136 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/sentence.xml @@ -0,0 +1,3 @@ + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/markables.dtd b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/markables.dtd new file mode 100644 index 000000000..220e8b3c8 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/markables.dtd @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/proteins.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/proteins.xml new file mode 100644 index 
000000000..46c822f8d --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/proteins.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/sentence.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/sentence.xml new file mode 100644 index 000000000..9a91c925b --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/sentence.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/proteins.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/proteins.xml new file mode 100644 index 000000000..1045dc27e --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/proteins.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/sentence.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/sentence.xml new file mode 100644 index 000000000..f37fbc936 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/sentence.xml @@ -0,0 +1,3 @@ + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Styles/default_style.xsl b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Styles/default_style.xsl new file mode 100644 index 000000000..ab671aa34 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Styles/default_style.xsl @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/common_paths.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/common_paths.xml new file mode 100644 index 000000000..8f55971b4 --- /dev/null +++ 
b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/common_paths.xml @@ -0,0 +1,17 @@ + + + +Basedata/ +Markables/ +Schemes/ +Styles/ +Customizations/ +default_style.xsl + + +proteins.xml +sentence.xml + + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/project.mmax b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/project.mmax new file mode 100644 index 000000000..52fc0b1c1 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/project.mmax @@ -0,0 +1,7 @@ + + + +Basedata.xml + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata.uri b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata.uri new file mode 100644 index 000000000..4e6d1a1f3 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata.uri @@ -0,0 +1 @@ +14731280 diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/Basedata.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/Basedata.xml new file mode 100644 index 000000000..90e494de3 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/Basedata.xml @@ -0,0 +1,299 @@ + + + +Multiple +stress +signal +integration +in +the +regulation +of +the +complex +sigma +S +- +dependent +csiD +- +ygaF +- +gabDTP +operon +in +Escherichia +coli +. +The +csiD +- +ygaF +- +gabDTP +region +in +the +Escherichia +coli +genome +represents +a +cluster +of +sigma +S +- +controlled +genes +. +Here +, +we +investigated +promoter +structures +, +sigma +factor +dependencies +, +potential +co +- +regulation +and +environmental +regulatory +patterns +for +all +of +these +genes +. 
+We +find +that +this +region +constitutes +a +complex +operon +with +expression +being +controlled +by +three +differentially +regulated +promoters +: +(i) +csiDp +, +which +affects +the +expression +of +all +five +genes +, +is +cAMP +- +CRP +/ +sigma +S +- +dependent +and +activated +exclusively +upon +carbon +starvation +and +stationary +phase +; +(ii) +gabDp1 +, +which +is +sigma +S +- +dependent +and +exhibits +multiple +stress +induction +like +sigma +S +itself +; +and +(iii) +gabDp2 +[ +previously +suggested +by +Schneider +, +B.L. +, +Ruback +, +S. +, +Kiupakis +, +A.K. +, +Kasbarian +, +H. +, +Pybus +, +C. +, +and +Reitzer +, +L. +( +2002 +) +J. +Bacteriol. +184 +: +6976-6986 +] +, +which +appears +to +be +Nac +/ +sigma +70 +- +controlled +and +to +respond +to +poor +nitrogen +sources +. +In +addition +, +we +identify +a +novel +repressor +, +CsiR +, +which +modulates +csiDp +activity +in +a +temporal +manner +during +early +stationary +phase +. +Finally +, +we +propose +a +physiological +role +for +sigma +S +- +controlled +GabT +/ +D +- +mediated +gamma-aminobutyrate +( +GABA +) +catabolism +and +glutamate +accumulation +in +general +stress +adaptation +. +This +physiological +role +is +reflected +by +the +activation +of +the +operon +- +internal +gabDp1 +promoter +under +the +different +conditions +that +also +induce +sigma +S +, +which +include +shifts +to +acidic +pH +or +high +osmolarity +as +well +as +starvation +or +stationary +phase +. 
+ \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/words.dtd b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/words.dtd new file mode 100644 index 000000000..a02b470f1 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/words.dtd @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/proteins.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/proteins.xml new file mode 100644 index 000000000..0f4bd71f8 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/proteins.xml @@ -0,0 +1,72 @@ + + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/sentence.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/sentence.xml new file mode 100644 index 000000000..6fbf9d136 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/sentence.xml @@ -0,0 +1,3 @@ + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/markables.dtd b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/markables.dtd new file mode 100644 index 000000000..220e8b3c8 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/markables.dtd @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/proteins.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/proteins.xml new file mode 100644 index 000000000..1a5bd6616 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/proteins.xml @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/sentence.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/sentence.xml new file mode 100644 index 000000000..c35553af7 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/sentence.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/proteins.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/proteins.xml new file mode 100644 index 000000000..1045dc27e --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/proteins.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/sentence.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/sentence.xml new file mode 100644 index 000000000..f37fbc936 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/sentence.xml @@ -0,0 +1,3 @@ + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Styles/default_style.xsl b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Styles/default_style.xsl new file mode 100644 index 000000000..ab671aa34 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Styles/default_style.xsl @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/common_paths.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/common_paths.xml new file mode 100644 index 000000000..8f55971b4 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/common_paths.xml @@ -0,0 +1,17 @@ + + + +Basedata/ +Markables/ +Schemes/ +Styles/ +Customizations/ +default_style.xsl + + +proteins.xml +sentence.xml + + + + \ No newline at end of file diff --git 
a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/project.mmax b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/project.mmax new file mode 100644 index 000000000..52fc0b1c1 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/project.mmax @@ -0,0 +1,7 @@ + + + +Basedata.xml + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/originalText/10048764 b/jcore-mmax2-reader/src/test/resources/originalText/10048764 new file mode 100644 index 000000000..2db1f6185 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/originalText/10048764 @@ -0,0 +1,2 @@ +Characterization of antihuman IFNAR-1 monoclonal antibodies: epitope localization and functional analysis. +The type I interferon receptor (IFNAR) is composed of two subunits, IFNAR-1 and IFNAR-2, encoding transmembrane polypeptides. IFNAR-2 has a dominant role in ligand binding, but IFNAR-1 contributes to binding affinity and to differential ligand recognition. A panel of five monoclonal antibodies (mAb) to human IFNAR-1 (HuIFNAR-1) was produced and characterized. The reactivity of each mAb toward HuIFNAR-1 on native and transfected cells and in Western blot and ELISA formats was determined. In functional assays, one mAb, EA12, blocked IFN-a2 binding to human cells and interfered with Stat activation and antiviral activity. Epitopes for the mAb were localized to subdomains of the HuIFNAR-1 extracellular domain by differential reactivity of the mAb to a series of human/bovine IFNAR-1 chimeras. The antibody EA12 seems to require native HuIFNAR-1 for reactivity and does not map to a single subdomain, perhaps recognizing an epitope containing noncontiguous sequences in at least two subdomains. In contrast, the epitopes of the non-neutralizing mAb FB2, AA3, and GB8 mapped, respectively, to the first, second, and third subdomains of HuIFNAR-1. 
The mAb DB2 primarily maps to the fourth subdomain, although its reactivity may be affected by other determinants. diff --git a/jcore-msdoc-reader/component.meta b/jcore-msdoc-reader/component.meta index 28d3243a0..33c76c42e 100644 --- a/jcore-msdoc-reader/component.meta +++ b/jcore-msdoc-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-msdoc-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe MSdoc Reader" } diff --git a/jcore-msdoc-reader/pom.xml b/jcore-msdoc-reader/pom.xml index ed305d952..abed82145 100644 --- a/jcore-msdoc-reader/pom.xml +++ b/jcore-msdoc-reader/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-msdoc-reader JCoRe MSdoc Reader @@ -31,23 +31,23 @@ org.apache.poi poi - 3.15 + 5.2.1 org.apache.poi poi-scratchpad - 3.15 + 5.2.1 org.apache.poi poi-ooxml - 3.16 + 5.2.1 - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-msdoc-reader/src/main/resources/de/julielab/jcore/reader/msdoc/desc/jcore-msdoc-reader.xml b/jcore-msdoc-reader/src/main/resources/de/julielab/jcore/reader/msdoc/desc/jcore-msdoc-reader.xml index 18a03952b..992334132 100644 --- a/jcore-msdoc-reader/src/main/resources/de/julielab/jcore/reader/msdoc/desc/jcore-msdoc-reader.xml +++ b/jcore-msdoc-reader/src/main/resources/de/julielab/jcore/reader/msdoc/desc/jcore-msdoc-reader.xml @@ -5,7 +5,7 @@ JCoRe MSdoc Reader - 2.5.1-SNAPSHOT + 2.6.0 JULIELab Jena, Germany diff --git a/jcore-msdoc-reader/src/test/java/de/julielab/jcore/reader/msdoc/main/MSdocReaderTest.java b/jcore-msdoc-reader/src/test/java/de/julielab/jcore/reader/msdoc/main/MSdocReaderTest.java index d3945a6db..68942199c 100644 --- a/jcore-msdoc-reader/src/test/java/de/julielab/jcore/reader/msdoc/main/MSdocReaderTest.java +++ b/jcore-msdoc-reader/src/test/java/de/julielab/jcore/reader/msdoc/main/MSdocReaderTest.java @@ -27,17 +27,17 @@ import org.apache.uima.util.CasCreationUtils; import 
org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class MSdocReaderTest { /** @@ -69,7 +69,7 @@ public class MSdocReaderTest { private static final String DOC_DUMMY_NAME = "dummy.doc"; private static final String DOC_DUMMY_FILE = "src/test/resources/" + DOC_DUMMY_NAME; - @BeforeClass + @BeforeAll public static void setUp() throws Exception { /** * Create dummies of *.doc-files. @@ -161,7 +161,7 @@ private static void writeArtifact(String file_name) throws IOException { } } - @AfterClass + @AfterAll public static void tearDown() throws Exception { /** * Delete dummies from setUp. diff --git a/jcore-mstparser-ae/component.meta b/jcore-mstparser-ae/component.meta index d58972c4e..906bf56e4 100644 --- a/jcore-mstparser-ae/component.meta +++ b/jcore-mstparser-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-mstparser-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe MST Parser AE" } diff --git a/jcore-mstparser-ae/pom.xml b/jcore-mstparser-ae/pom.xml index ddbf1449e..eec1d63d5 100644 --- a/jcore-mstparser-ae/pom.xml +++ b/jcore-mstparser-ae/pom.xml @@ -54,7 +54,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 .. 
@@ -80,8 +80,8 @@ provided - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-mstparser-ae/src/main/resources/de/julielab/jcore/ae/mstparser/desc/jcore-mstparser.xml b/jcore-mstparser-ae/src/main/resources/de/julielab/jcore/ae/mstparser/desc/jcore-mstparser.xml index 36985423b..30da736ac 100644 --- a/jcore-mstparser-ae/src/main/resources/de/julielab/jcore/ae/mstparser/desc/jcore-mstparser.xml +++ b/jcore-mstparser-ae/src/main/resources/de/julielab/jcore/ae/mstparser/desc/jcore-mstparser.xml @@ -6,7 +6,7 @@ JCoRe MST Parser Annotator - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-mstparser-ae/src/test/java/de/julielab/jcore/ae/mstparser/main/MSTParserTest.java b/jcore-mstparser-ae/src/test/java/de/julielab/jcore/ae/mstparser/main/MSTParserTest.java index 297b93cb5..46a6fe3a9 100644 --- a/jcore-mstparser-ae/src/test/java/de/julielab/jcore/ae/mstparser/main/MSTParserTest.java +++ b/jcore-mstparser-ae/src/test/java/de/julielab/jcore/ae/mstparser/main/MSTParserTest.java @@ -19,7 +19,6 @@ import de.julielab.jcore.types.DependencyRelation; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -36,7 +35,8 @@ import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Ignore; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -46,12 +46,15 @@ import java.io.FileOutputStream; import java.io.IOException; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + /** * This is the JUnit test for the MST Parser Annotator. 
* * @author Lichtenwald */ -public class MSTParserTest extends TestCase { +public class MSTParserTest { private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; public static final String PARAM_MAX_NUM_TOKENS = "MaxNumTokens"; @@ -68,7 +71,7 @@ public class MSTParserTest extends TestCase { /*--------------------------------------------------------------------------------------------*/ - @Ignore + @Disabled // public void testCAS() throws Exception { // // String[] heads = new String[] { "have", "Migrants", "drown", "coast", "off", "40", "40", "migrants", "have", // // "have", "drowned", "Sea", "Sea", "in", "drowned", "coast", "coast", "off", "coast", "of", "drowned", @@ -174,6 +177,7 @@ public class MSTParserTest extends TestCase { // jcas.reset(); // } // of initCas + @Test public void testThreads() throws Exception { try { int count = 3; @@ -188,7 +192,7 @@ public void testThreads() throws Exception { x.run(); Thread.sleep(5000); } catch (RuntimeException e) { - fail("Errorin Threads"); + fail("Error in Threads"); } } @@ -230,6 +234,7 @@ public void testThreads() throws Exception { * @throws AnalysisEngineProcessException * @throws SAXException */ + @Test public void testProcess() throws IOException, InvalidXMLException, ResourceInitializationException, CASException, AnalysisEngineProcessException, SAXException { XMLInputSource descriptor = new XMLInputSource(DESCRIPTOR_MST_PARSER); @@ -245,9 +250,10 @@ public void testProcess() throws IOException, InvalidXMLException, ResourceIniti FileOutputStream fos = new FileOutputStream(OUTPUT_DIR + File.separator + "test.xmi"); XmiCasSerializer.serialize(jcas.getCas(), fos); - assertTrue("Invalid JCas!", checkAnnotations(jcas, null)); + assertTrue(checkAnnotations(jcas, null), "Invalid JCas!"); } // of testProcess + @Test public void testProcessWithNumTokensRestriction() throws IOException, InvalidXMLException, ResourceInitializationException, CASException, AnalysisEngineProcessException, 
SAXException, ResourceConfigurationException { @@ -263,7 +269,7 @@ public void testProcessWithNumTokensRestriction() ae.process(jcas); FileOutputStream fos = new FileOutputStream(OUTPUT_DIR + File.separator + "test.xmi"); XmiCasSerializer.serialize(jcas.getCas(), fos); - assertTrue("Invalid JCas!", checkAnnotations(jcas, MAX_NUM_TOKENS)); + assertTrue(checkAnnotations(jcas, MAX_NUM_TOKENS), "Invalid JCas!"); } /** diff --git a/jcore-mstparser-ae/src/test/resources/de/julielab/jcore/ae/mstparser/desc/MSTParserDescriptorTest.xml b/jcore-mstparser-ae/src/test/resources/de/julielab/jcore/ae/mstparser/desc/MSTParserDescriptorTest.xml index a9b0d6b0e..9f66c4074 100644 --- a/jcore-mstparser-ae/src/test/resources/de/julielab/jcore/ae/mstparser/desc/MSTParserDescriptorTest.xml +++ b/jcore-mstparser-ae/src/test/resources/de/julielab/jcore/ae/mstparser/desc/MSTParserDescriptorTest.xml @@ -6,7 +6,7 @@ JCoRe MST Parser Annotator - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-muc7-reader/component.meta b/jcore-muc7-reader/component.meta index 882b76c87..ae898f70c 100644 --- a/jcore-muc7-reader/component.meta +++ b/jcore-muc7-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-muc7-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe MUC7 Reader" } diff --git a/jcore-muc7-reader/pom.xml b/jcore-muc7-reader/pom.xml index aeb5a81b5..9fbc80750 100644 --- a/jcore-muc7-reader/pom.xml +++ b/jcore-muc7-reader/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -22,8 +22,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-muc7-reader/scripts/muc7_SGML2XML.py b/jcore-muc7-reader/scripts/muc7_SGML2XML.py index b015b9342..9dbed485a 100644 --- a/jcore-muc7-reader/scripts/muc7_SGML2XML.py +++ b/jcore-muc7-reader/scripts/muc7_SGML2XML.py @@ -5,9 +5,7 @@ # - `
<p>`: needs to be closed with `</p>
` import re -import os import sys -import glob def close_paragraphs(line): diff --git a/jcore-muc7-reader/src/main/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml b/jcore-muc7-reader/src/main/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml index 2f6b99cc3..deecc0e59 100644 --- a/jcore-muc7-reader/src/main/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml +++ b/jcore-muc7-reader/src/main/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml @@ -5,7 +5,7 @@ JCoRe MUC7 Reader - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-muc7-reader/src/test/java/de/julielab/jcore/reader/muc7/MUC7ReaderTest.java b/jcore-muc7-reader/src/test/java/de/julielab/jcore/reader/muc7/MUC7ReaderTest.java index 77f12db5e..b2e97da26 100644 --- a/jcore-muc7-reader/src/test/java/de/julielab/jcore/reader/muc7/MUC7ReaderTest.java +++ b/jcore-muc7-reader/src/test/java/de/julielab/jcore/reader/muc7/MUC7ReaderTest.java @@ -11,7 +11,6 @@ import de.julielab.jcore.types.muc7.ENAMEX; import de.julielab.jcore.types.muc7.NUMEX; import de.julielab.jcore.types.muc7.TIMEX; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.metadata.AnalysisEngineMetaData; import org.apache.uima.cas.CAS; @@ -24,6 +23,8 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; @@ -31,7 +32,9 @@ import java.util.ArrayList; import java.util.Iterator; -public class MUC7ReaderTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class MUC7ReaderTest { /** * Path to the MedlineReader descriptor */ @@ -40,12 +43,11 @@ public class MUC7ReaderTest extends TestCase { /** * Object to be tested */ - private 
CollectionReader muc7Reader; + private static CollectionReader muc7Reader; + private static CAS cas; + - - private CAS cas; - /** * Test data */ @@ -87,12 +89,11 @@ public class MUC7ReaderTest extends TestCase { /** * * CAS array with CAS objects that where processed by the muc7Reader */ - private ArrayList cases = new ArrayList(); + private static ArrayList cases = new ArrayList(); - @Override - protected void setUp() throws Exception { - super.setUp(); + @BeforeAll + protected static void setUp() throws Exception { muc7Reader = produceCollectionReader(MUC7_READER_DESCRIPTOR); processAllCases(); } @@ -105,7 +106,7 @@ protected void setUp() throws Exception { * @throws SAXException * @throws ParserConfigurationException */ - private void processAllCases() throws CASException, SAXException, ParserConfigurationException { + private static void processAllCases() throws CASException, SAXException, ParserConfigurationException { try { while (muc7Reader.hasNext()) { cas = CasCreationUtils.createCas((AnalysisEngineMetaData) muc7Reader.getMetaData()); @@ -123,20 +124,21 @@ private void processAllCases() throws CASException, SAXException, ParserConfigur } /** * Test if method getNextCas() has done its job - */ + */ + @Test public void testGetNextCas() { //check for a TIMEX entity String[] timexData = getTimexData(DOC_ID); - assertTrue("TIMEX", checkTimex(timexData)); + assertTrue(checkTimex(timexData), "TIMEX"); //check for a ENAMEX entity String[] enamexData = getEnamexData(DOC_ID); - assertTrue("ENAMEX", checkEnamex(enamexData)); + assertTrue(checkEnamex(enamexData), "ENAMEX"); //check for a NUMEX entity String[] numexData = getNumexData(DOC_ID); - assertTrue("NUMEX", checkNumex(numexData)); + assertTrue(checkNumex(numexData), "NUMEX"); //TODO coreference doesn't works as of now //check for a coref chain @@ -337,7 +339,7 @@ private void buildCorefChain(int corefID, ArrayList corefChain, JCas jca /** * Gets an Iterator over the the CAS for the specific type * - * @param 
cas (the CAS) + * @param jcas (the CAS) * @param type (the type) * @return the iterator */ @@ -371,7 +373,7 @@ private String[] toStringArray(ArrayList stringArray) { * @throws InvalidXMLException * @throws ResourceInitializationException */ - private CollectionReader produceCollectionReader(String descriptor) throws InvalidXMLException, IOException, ResourceInitializationException { + private static CollectionReader produceCollectionReader(String descriptor) throws InvalidXMLException, IOException, ResourceInitializationException { CollectionReader collectionReader; ResourceSpecifier spec; spec = UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(descriptor)); diff --git a/jcore-muc7-reader/src/test/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml b/jcore-muc7-reader/src/test/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml index 87e9f1679..d3359b86a 100644 --- a/jcore-muc7-reader/src/test/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml +++ b/jcore-muc7-reader/src/test/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml @@ -5,7 +5,7 @@ JCoRe MUC7 Reader - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-mutationfinder-ae/component.meta b/jcore-mutationfinder-ae/component.meta index c0df6eb43..a72f76a2c 100644 --- a/jcore-mutationfinder-ae/component.meta +++ b/jcore-mutationfinder-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-mutationfinder-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Mutation Finder AE" } diff --git a/jcore-mutationfinder-ae/pom.xml b/jcore-mutationfinder-ae/pom.xml index bc0ff3ecb..bf2fc7bd4 100644 --- a/jcore-mutationfinder-ae/pom.xml +++ b/jcore-mutationfinder-ae/pom.xml @@ -5,7 +5,7 @@ jcore-base de.julielab - 2.5.1-SNAPSHOT + 2.6.0 4.0.0 JCoRe Mutation Finder AE @@ -23,8 +23,8 @@ 2.0.8 - junit - junit + org.junit.jupiter + junit-jupiter-engine 
de.julielab @@ -35,7 +35,16 @@ de.julielab jcore-descriptor-creator + + org.slf4j + slf4j-api + + + org.assertj + assertj-core + + BSD-2-Clause diff --git a/jcore-mutationfinder-ae/src/main/resources/de/julielab/jcore/ae/mutationfinder/desc/jcore-mutationfinder-ae.xml b/jcore-mutationfinder-ae/src/main/resources/de/julielab/jcore/ae/mutationfinder/desc/jcore-mutationfinder-ae.xml index d43c2caba..7cf388c99 100644 --- a/jcore-mutationfinder-ae/src/main/resources/de/julielab/jcore/ae/mutationfinder/desc/jcore-mutationfinder-ae.xml +++ b/jcore-mutationfinder-ae/src/main/resources/de/julielab/jcore/ae/mutationfinder/desc/jcore-mutationfinder-ae.xml @@ -7,7 +7,7 @@ JCoRe Mutation Annotator An analysis engine to recognize mentions of gene point mutations in document text. This is a wrapper around the original MutationFinder (http://mutationfinder.sourceforge.net/), published in the following paper: MutationFinder: A high-performance system for extracting point mutation mentions from text J. Gregory Caporaso, William A. Baumgartner Jr., David A. Randolph, K. 
Bretonnel Cohen, and Lawrence Hunter; Bioinformatics, 2007 23(14):1862-1865; doi:10.1093/bioinformatics/btm235; - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab, Germany diff --git a/jcore-mutationfinder-ae/src/test/java/de/julielab/jcore/ae/mutationfinder/MutationAnnotatorTest.java b/jcore-mutationfinder-ae/src/test/java/de/julielab/jcore/ae/mutationfinder/MutationAnnotatorTest.java index 5291c51fa..c877fdc14 100644 --- a/jcore-mutationfinder-ae/src/test/java/de/julielab/jcore/ae/mutationfinder/MutationAnnotatorTest.java +++ b/jcore-mutationfinder-ae/src/test/java/de/julielab/jcore/ae/mutationfinder/MutationAnnotatorTest.java @@ -6,12 +6,11 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.hamcrest.CoreMatchers; -import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Collection; +import static org.assertj.core.api.Assertions.assertThat; public class MutationAnnotatorTest { @Test @@ -21,8 +20,8 @@ public void testAnnotator() throws Exception { jCas.setDocumentText("A covalently bound catalytic intermediate in Escherichia coli asparaginase: crystal structure of a Thr-89-Val mutant."); annotator.process(jCas); final Collection mutations = JCasUtil.select(jCas, PointMutation.class); - Assert.assertThat(mutations.size(), CoreMatchers.is(1)); - Assert.assertThat(mutations.stream().findAny().get().getCoveredText(), CoreMatchers.equalTo("Thr-89-Val")); - Assert.assertThat(mutations.stream().findAny().get().getSpecificType(), CoreMatchers.equalTo("T89V")); + assertThat(mutations).hasSize(1); + assertThat(mutations.stream().findAny().get().getCoveredText()).isEqualTo("Thr-89-Val"); + assertThat(mutations.stream().findAny().get().getSpecificType()).isEqualTo("T89V"); } } diff --git a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationFinderTest.java 
b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationFinderTest.java index 51aa04218..4bc918ef2 100644 --- a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationFinderTest.java +++ b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationFinderTest.java @@ -1,24 +1,27 @@ package edu.uchsc.ccp.nlp.ei.mutation; -import junit.framework.TestCase; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; import java.util.*; +import static org.junit.jupiter.api.Assertions.assertEquals; + /* * Copyright (c) 2007 Regents of the University of Colorado * Please refer to the licensing agreement at MUTATIONFINDER_HOME/doc/license.txt */ -public class MutationFinderTest extends TestCase { +public class MutationFinderTest { - private List regularExpressions; + private static List regularExpressions; - private MutationFinder mf; + private static MutationFinder mf; - @Override - protected void setUp() throws Exception { + @BeforeAll + protected static void setUp() { /* The first four default regular expressions */ regularExpressions = new ArrayList(); regularExpressions @@ -32,8 +35,6 @@ protected void setUp() throws Exception { .add("(^|[\\s\\(\\[\\'\"/,\\-])(?P(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR)|(GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE))(?P[1-9][0-9]*) to (?P(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR)|(GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE))(?=([.,\\s)\\]\\'\":;\\-?!/]|$))"); mf = new MutationFinder(new 
HashSet(regularExpressions)); - - super.setUp(); } /** @@ -41,6 +42,7 @@ protected void setUp() throws Exception { * * @throws Exception */ + @Test public void testConstructor() throws Exception { mf = new MutationFinder(new HashSet()); mf = new MutationFinder(new HashSet(regularExpressions)); @@ -62,6 +64,7 @@ public void testConstructor() throws Exception { * * @throws Exception */ + @Test public void testExtractMappingsFromPythonRegex() throws Exception { Map groupMappings = MutationFinder.extractMappingsFromPythonRegex(regularExpressions.get(0)); assertEquals(new Integer(2), groupMappings.get(MutationFinder.WT_RES)); @@ -80,6 +83,7 @@ public void testExtractMappingsFromPythonRegex() throws Exception { * * @throws Exception */ + @Test public void testRemoveTagsFromPythonRegex() throws Exception { String regex0WithoutTags = "(^|[\\s\\(\\[\\'\"/,\\-])([CISQMNPKDTFAGHLRWVEY])([1-9][0-9]+)([CISQMNPKDTFAGHLRWVEY])(?=([.,\\s)\\]\\'\":;\\-?!/]|$))[CASE_SENSITIVE]"; assertEquals(regex0WithoutTags, MutationFinder.removeTagsFromPythonRegex(regularExpressions.get(0))); @@ -95,6 +99,7 @@ public void testRemoveTagsFromPythonRegex() throws Exception { * * @throws Exception */ + @Test public void testExtractionNoMutations() throws Exception { Map> mutations = mf.extractMutations(""); assertEquals(0, mutations.size()); @@ -117,6 +122,7 @@ public void testExtractionNoMutations() throws Exception { * * @throws Exception */ + @Test public void testExtractSingleMutation() throws Exception { Map> mutations = mf.extractMutations("S42T"); Set expectedPMs = new HashSet(); @@ -141,6 +147,7 @@ public void testExtractSingleMutation() throws Exception { * * @throws Exception */ + @Test public void testExtractMultipleMutations() throws Exception { Map> mutations = mf.extractMutations("S42T and W36Y"); Set expectedPMs = new HashSet(); @@ -173,6 +180,7 @@ public void testExtractMultipleMutations() throws Exception { * * @throws Exception */ + @Test public void 
testExtractMultipleMutationsWithPositiveLookahead() throws Exception { Map> mutations = mf.extractMutations("S42T W36Y"); Set expectedPMs = new HashSet(); @@ -191,6 +199,7 @@ public void testExtractMultipleMutationsWithPositiveLookahead() throws Exception * * @throws Exception */ + @Test public void testExtractionSpanCalculations() throws Exception { Map> mutations = mf.extractMutations("S42T and W36Y"); Mutation expectedPM = new PointMutation(42, "S", "T"); @@ -248,6 +257,7 @@ public void testExtractionSpanCalculations() throws Exception { * * @throws Exception */ + @Test public void testExtractionOfVariousFormats() throws Exception { Map> mutations = mf.extractMutations("The A42G mutation was made."); Mutation expectedPM = new PointMutation(42, "A", "G"); @@ -296,6 +306,7 @@ public void testExtractionOfVariousFormats() throws Exception { * * @throws Exception */ + @Test public void testRegexCaseInsensitiveFlag() throws Exception { Map> mutations = mf.extractMutations("a64t"); assertEquals(0, mutations.size()); @@ -323,6 +334,7 @@ public void testRegexCaseInsensitiveFlag() throws Exception { * * @throws Exception */ + @Test public void testCaseInsensitiveCases() throws Exception { Map> mutations = mf.extractMutations("ala64gly"); assertEquals(1, mutations.size()); @@ -346,6 +358,7 @@ public void testCaseInsensitiveCases() throws Exception { * * @throws Exception */ + @Test public void testPostProcessing() throws Exception { Map> mutations = mf.extractMutations("A64G"); assertEquals(1, mutations.size()); @@ -366,6 +379,7 @@ public void testPostProcessing() throws Exception { * * @throws Exception */ + @Test public void testVariedDigitLength() throws Exception { Map> mutations = mf.extractMutations("ala64gly"); assertEquals(1, mutations.size()); @@ -388,6 +402,7 @@ public void testVariedDigitLength() throws Exception { * * @throws Exception */ + @Test public void testUnacceptableGeneralWordBoundaries() throws Exception { String startCharacters = 
"abcdefghijklmnopqrstuvwxyz0123456789~@#$%^&*_+=])"; String endCharacters = "abcdefghijklmnopqrstuvwxyz0123456789~@#$%^&*_+=(['"; @@ -408,6 +423,7 @@ public void testUnacceptableGeneralWordBoundaries() throws Exception { * * @throws Exception */ + @Test public void testAcceptableGeneralWordBoundaries() throws Exception { char[] endCharacters = { '.', ',', ' ', '\t', '\n', ')', ']', '"', '\'', ':', ';', '?', '!', '/', '-' }; char[] startCharacters = { ' ', '\t', '\n', '"', '\'', '(', '[', '/', ',', '-' }; @@ -429,6 +445,7 @@ public void testAcceptableGeneralWordBoundaries() throws Exception { * * @throws Exception */ + @Test public void testMixOneAndThreeLetterStrings() throws Exception { Map> mutations = mf.extractMutations("A64Gly"); assertEquals(0, mutations.size()); @@ -442,6 +459,7 @@ public void testMixOneAndThreeLetterStrings() throws Exception { * * @throws Exception */ + @Test public void testFullNameMethods() throws Exception { Map> mutations = mf.extractMutations("alanine64-->Gly"); assertEquals(1, mutations.size()); @@ -455,6 +473,7 @@ public void testFullNameMethods() throws Exception { * * @throws Exception */ + @Test public void testOneLetterAbbreviationFailsNon_wNmFormat() throws Exception { Map> mutations = mf.extractMutations("A64-->glycine"); assertEquals(0, mutations.size()); @@ -471,6 +490,7 @@ public void testOneLetterAbbreviationFailsNon_wNmFormat() throws Exception { * * @throws Exception */ + @Test public void testTextBasedMatches() throws Exception { String[] mutationTexts = { "Ala64 to Gly", "Alanine64 to Glycine", "Ala64 to Glycine", "alanine64 to Gly", "The Ala64 to Gly substitution", "The Ala64 to glycine substitution", "The Ala64 to Gly substitution" }; @@ -490,6 +510,7 @@ public void testTextBasedMatches() throws Exception { * * @throws Exception */ + @Test public void testTextMatchSpacing() throws Exception { Map> mutations = mf.extractMutations("TheAla40toGlymutation"); assertEquals(0, mutations.size()); diff --git 
a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationTest.java b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationTest.java index 671baf314..465898ff9 100644 --- a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationTest.java +++ b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationTest.java @@ -1,19 +1,23 @@ package edu.uchsc.ccp.nlp.ei.mutation; -import junit.framework.TestCase; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; /* * Copyright (c) 2007 Regents of the University of Colorado * Please refer to the licensing agreement at MUTATIONFINDER_HOME/doc/license.txt */ -public class MutationTest extends TestCase { +public class MutationTest { /** * Test the the constructor works for input of both int's and String's * * @throws Exception */ + @Test public void testConstructor() throws Exception { Mutation m = new Mutation(42); assertEquals(42, m.getPosition()); @@ -32,6 +36,7 @@ public void testConstructor() throws Exception { * * @throws Exception */ + @Test public void testUnsupportedMethods() throws Exception { Mutation m = new Mutation(42); try { diff --git a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/PointMutationTest.java b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/PointMutationTest.java index ec5704846..73bb0df0b 100644 --- a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/PointMutationTest.java +++ b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/PointMutationTest.java @@ -1,23 +1,26 @@ package edu.uchsc.ccp.nlp.ei.mutation; -import junit.framework.TestCase; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.util.HashMap; import java.util.Map; import java.util.Set; +import static org.junit.jupiter.api.Assertions.*; + /* * 
Copyright (c) 2007 Regents of the University of Colorado * Please refer to the licensing agreement at MUTATIONFINDER_HOME/doc/license.txt */ -public class PointMutationTest extends TestCase { - private PointMutation pointMutation; +public class PointMutationTest { + private static PointMutation pointMutation; - private Map aminoAcidCodeLookup; + private static Map aminoAcidCodeLookup; - @Override - protected void setUp() throws Exception { + @BeforeAll + protected static void setUp() throws Exception { pointMutation = new PointMutation(42, "W", "G"); aminoAcidCodeLookup = new HashMap(); @@ -84,7 +87,6 @@ protected void setUp() throws Exception { aminoAcidCodeLookup.put("D", "D"); aminoAcidCodeLookup.put("E", "E"); - super.setUp(); } /** @@ -92,6 +94,7 @@ protected void setUp() throws Exception { * * @throws Exception */ + @Test public void testConstructor() throws Exception { PointMutation pm = new PointMutation(42, "A", "C"); assertEquals(42, pm.getPosition()); @@ -125,6 +128,7 @@ public void testConstructor() throws Exception { * * @throws Exception */ + @Test public void testHashcode() throws Exception { PointMutation pm = new PointMutation(42, "W", "G"); assertEquals((pm.getClass().getName() + pm.toString()).hashCode(), pm.hashCode()); @@ -135,6 +139,7 @@ public void testHashcode() throws Exception { * * @throws Exception */ + @Test public void testInvalidInit() throws Exception { PointMutation pm; try { @@ -178,6 +183,7 @@ public void testInvalidInit() throws Exception { * * @throws Exception */ + @Test public void testEquals() throws Exception { PointMutation pm = new PointMutation(42, "W", "G"); assertTrue(pointMutation.equals(pm)); @@ -200,6 +206,7 @@ public void testEquals() throws Exception { * * @throws Exception */ + @Test public void testNormalizationOfResidue() throws Exception { Set residuesToNormalize = aminoAcidCodeLookup.keySet(); for (String residue : residuesToNormalize) { @@ -212,6 +219,7 @@ public void testNormalizationOfResidue() throws 
Exception { * * @throws Exception */ + @Test public void testNormalizationOfInvalidResidue() throws Exception { try { pointMutation.normalizeResidueIdentity(""); @@ -260,6 +268,7 @@ public void testNormalizationOfInvalidResidue() throws Exception { * Test the static method which enables creation of a PointMutation object from a String in the wNm format * @throws Exception */ + @Test public void testCreateNewPointMutationFrom_wNm() throws Exception { PointMutation pm = PointMutation.createPointMutationFrom_wNm("W42G"); assertEquals(pointMutation, pm); diff --git a/jcore-neo4j-relations-consumer/LICENSE b/jcore-neo4j-relations-consumer/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-neo4j-relations-consumer/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-neo4j-relations-consumer/README.md b/jcore-neo4j-relations-consumer/README.md new file mode 100644 index 000000000..7b8a2a0a9 --- /dev/null +++ b/jcore-neo4j-relations-consumer/README.md @@ -0,0 +1,34 @@ +# JCoRe Neo4j Relations Consumer + +**Descriptor Path**: +``` +de.julielab.jcore.consumer.neo4jrelations.desc.jcore-neo4j-relations-consumer +``` + +Writes EventMentions to Neo4j. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? 
diff --git a/jcore-neo4j-relations-consumer/component.meta b/jcore-neo4j-relations-consumer/component.meta new file mode 100644 index 000000000..43cb60101 --- /dev/null +++ b/jcore-neo4j-relations-consumer/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "consumer" + ], + "description": "Writes EventMentions to Neo4j.", + "descriptors": [ + { + "category": "consumer", + "location": "de.julielab.jcore.consumer.neo4jrelations.desc.jcore-neo4j-relations-consumer" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-neo4j-relations-consumer", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe Neo4j Relations Consumer" +} diff --git a/jcore-neo4j-relations-consumer/pom.xml b/jcore-neo4j-relations-consumer/pom.xml new file mode 100644 index 000000000..670b0449c --- /dev/null +++ b/jcore-neo4j-relations-consumer/pom.xml @@ -0,0 +1,80 @@ + + + + 4.0.0 + jcore-neo4j-relations-consumer + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0 + + + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-types-version} + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + de.julielab + jcore-db-checkpoint-ae + 2.6.0 + + + org.neo4j.test + neo4j-harness + 4.4.2 + test + + + de.julielab + julielab-neo4j-plugins-concepts-representation + 3.0.1 + + + de.julielab + julielab-neo4j-plugins-concepts + 3.0.1 + test + + + org.assertj + assertj-core + + + de.julielab + jcore-descriptor-creator + + + junit + junit + 4.13.1 + + + JCoRe Neo4j Relations Consumer + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-neo4j-relations-consumer + Writes EventMentions to Neo4j. 
+ diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java new file mode 100644 index 000000000..190cf30cd --- /dev/null +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -0,0 +1,335 @@ +package de.julielab.jcore.consumer.neo4jrelations; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.HashMultiset; +import com.google.common.collect.Multiset; +import de.julielab.java.utilities.IOStreamUtilities; +import de.julielab.jcore.ae.checkpoint.DocumentId; +import de.julielab.jcore.ae.checkpoint.DocumentReleaseCheckpoint; +import de.julielab.jcore.types.ArgumentMention; +import de.julielab.jcore.types.ConceptMention; +import de.julielab.jcore.types.ResourceEntry; +import de.julielab.jcore.types.ext.DBProcessingMetaData; +import de.julielab.jcore.types.ext.FlattenedRelation; +import de.julielab.jcore.utility.JCoReTools; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelation; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationArgument; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationDocument; +import de.julielab.neo4j.plugins.datarepresentation.ImportIETypedRelations; +import de.julielab.neo4j.plugins.datarepresentation.constants.ImportIERelations; +import org.apache.commons.codec.binary.Base64; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import 
org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.ws.rs.HttpMethod; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URI; +import java.net.URL; +import java.util.*; +import java.util.stream.StreamSupport; + +@ResourceMetaData(name = "JCoRe Neo4j Relations Consumer", description = "This component assumes that a Neo4j server with the julielab-neo4j-plugins-concepts plugin installed is available. It then sends FlattenedRelation instances with more than one argument to Neo4j. Note that this requires the event arguments to have a ResourceEntry list to obtain database concept IDs from.", vendor = "JULIE Lab, Germany", copyright = "JULIE Lab", version = "2.6.0-SNAPSHOT") +@TypeCapability(inputs = {"de.julielab.jcore.types.EventMention"}) +public class Neo4jRelationsConsumer extends JCasAnnotator_ImplBase { + + public static final String PARAM_URL = "URL"; + public static final String PARAM_ID_PROPERTY = "IdProperty"; + public static final String PARAM_SOURCE = "ConceptSource"; + public static final String PARAM_NEO4J_USER = "Neo4jUser"; + public static final String PARAM_NEO4J_PASSWORD = "Neo4jPassword"; + public static final String PARAM_WRITE_BATCH_SIZE = "WriteBatchSize"; + private final static Logger log = LoggerFactory.getLogger(Neo4jRelationsConsumer.class); + @ConfigurationParameter(name = PARAM_URL, description = "The complete URL to the endpoint of the Neo4j server for relation insertion.") + private String url; + @ConfigurationParameter(name = PARAM_ID_PROPERTY, description = "The ID property to look up concept nodes in the Neo4j graph. 
Common options are 'id', 'sourceIds' and 'originalId'. You must know to which ID type the ResourceEntry objects of the relation arguments refer to.") + private String idProperty; + @ConfigurationParameter(name = PARAM_SOURCE, mandatory = false, description = "Optional. Sets the global source for the concept IDs taken from the ResourceEntry instances of the relation arguments. This causes the 'source' feature of the ResourceEntry objects to be omitted and to globally use the specified source instead. This causes the Neo4j database plugin to resolve the provided argument IDs against the source specified here.") + private String globalSource; + @ConfigurationParameter(name = PARAM_NEO4J_USER, mandatory = false, description = "Optional. The Neo4j server user name.") + private String neo4jUser; + @ConfigurationParameter(name = PARAM_NEO4J_PASSWORD, mandatory = false, description = "Optional. The Neo4j server password.") + private String neo4jPassword; + @ConfigurationParameter(name = PARAM_WRITE_BATCH_SIZE, mandatory = false, defaultValue = "50", description = + "The number of processed CASes after which the relation data should be flushed into the database. Defaults to 50.") + private int writeBatchSize; + + private ImportIERelations importIERelations; + private ObjectMapper om; + + private Set documentIds; + + private long docNum; + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. 
+ */ + @Override + public void initialize(final UimaContext aContext) throws ResourceInitializationException { + try { + url = (String) aContext.getConfigParameterValue(PARAM_URL); + idProperty = (String) aContext.getConfigParameterValue(PARAM_ID_PROPERTY); + globalSource = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_SOURCE)).orElse(null); + neo4jUser = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_USER)).orElse(null); + neo4jPassword = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_PASSWORD)).orElse(null); + writeBatchSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_WRITE_BATCH_SIZE)).orElse(50); + om = new ObjectMapper(); + om.setSerializationInclusion(JsonInclude.Include.NON_NULL); + om.setSerializationInclusion(JsonInclude.Include.NON_EMPTY); + initImportRelations(); + DocumentReleaseCheckpoint.get().register(Neo4jRelationsConsumer.class.getCanonicalName()); + documentIds = new HashSet<>(); + docNum = 0; + } catch (Throwable e) { + log.error("Could not initialize", e); + throw new ResourceInitializationException(e); + } + } + + private void initImportRelations() { + importIERelations = globalSource != null ? new ImportIERelations(idProperty, globalSource) : new ImportIERelations(idProperty); + } + + /** + * This method is called for each document going through the component. This + * is where the actual work happens. + */ + @Override + public void process(final JCas aJCas) throws AnalysisEngineProcessException { + try { + ImportIERelationDocument document = convertRelations(aJCas); + if (!document.getRelations().isEmpty()) + importIERelations.addRelationDocument(document); + + Optional metaOpt = JCasUtil.select(aJCas, DBProcessingMetaData.class).stream().findAny(); + documentIds.add(metaOpt.isPresent() ? 
new DocumentId(metaOpt.get()) : new DocumentId(JCoReTools.getDocId(aJCas))); + + if (documentIds.size() % writeBatchSize == 0) { + log.trace("Document nr {} processed, sending batch nr {} of size {} to database.", docNum, docNum / writeBatchSize, writeBatchSize); + batchProcessComplete(); + } + } catch (Throwable e) { + log.error("Exception occurred in document {}", JCoReTools.getDocId(aJCas), e); + if (!(e instanceof AnalysisEngineProcessException)) + throw new AnalysisEngineProcessException(e); + throw e; + } + } + + private ImportIERelationDocument convertRelations(JCas aJCas) { + Map> relationCounts = getEquivalentRelationGroups(aJCas); + ImportIERelationDocument relDoc = new ImportIERelationDocument(); + relDoc.setDb(false); + String docId = JCoReTools.getDocId(aJCas); + relDoc.setName(docId); + ImportIETypedRelations typedRelations = new ImportIETypedRelations(); + for (String relationType : relationCounts.keySet()) { + Multiset unificationRelations = relationCounts.get(relationType); + List ieRelations4relationType = new ArrayList<>(); + for (UnificationRelation rel : unificationRelations.elementSet()) { + ieRelations4relationType.add(rel.toImportRelation(unificationRelations.count(rel))); + } + typedRelations.put(relationType, ieRelations4relationType); + } + relDoc.setRelations(typedRelations); + return relDoc; + } + + @Override + public void batchProcessComplete() throws AnalysisEngineProcessException { + super.batchProcessComplete(); + sendRelationsToNeo4j(); + } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException { + super.collectionProcessComplete(); + log.info("Collection processing finished."); + sendRelationsToNeo4j(); + DocumentReleaseCheckpoint.get().unregister(Neo4jRelationsConsumer.class.getCanonicalName()); + } + + private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { + try { + if (!importIERelations.getDocuments().isEmpty()) { + URL url = URI.create(this.url).toURL(); + 
HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); + urlConnection.addRequestProperty("Content-Type", "application/json"); + String authorizationToken = neo4jUser != null && neo4jPassword != null + ? "Basic " + Base64.encodeBase64URLSafeString((neo4jUser + ":" + neo4jPassword).getBytes()) + : null; + if (authorizationToken != null) + urlConnection.setRequestProperty("Authorization", authorizationToken); + urlConnection.setRequestMethod(HttpMethod.POST); + urlConnection.setDoOutput(true); + try (OutputStream outputStream = urlConnection.getOutputStream()) { + JsonFactory jf = new JsonFactory(om); + JsonGenerator g = jf.createGenerator(outputStream); + g.writeStartObject(); + g.writeObjectField(ImportIERelations.NAME_ID_PROPERTY, idProperty); + g.writeObjectField(ImportIERelations.NAME_ID_SOURCE, globalSource); + + List documents = importIERelations.getDocuments(); + g.writeFieldName(ImportIERelations.NAME_DOCUMENTS); + g.writeStartArray(); + log.debug("Converting {} relation documents to JSON.", documents.size()); + for (ImportIERelationDocument document : (Iterable) documents::iterator) { + g.writeObject(document); + } + g.writeEndArray(); + g.writeEndObject(); + g.close(); + } + try (InputStream inputStream = urlConnection.getInputStream()) { + log.debug("Response from Neo4j: {}", IOStreamUtilities.getStringFromInputStream(inputStream)); + } catch (IOException e) { + log.error("Exception occurred while sending relation data to Neo4j server."); + try (InputStream inputStream = urlConnection.getErrorStream()) { + if (inputStream != null) + log.error("Error from Neo4j: {}", IOStreamUtilities.getStringFromInputStream(inputStream)); + } + throw e; + } + importIERelations.clear(); + } + log.debug("Releasing {} document IDs that have successfully been sent to Neo4j", documentIds.size()); + DocumentReleaseCheckpoint.get().release(Neo4jRelationsConsumer.class.getCanonicalName(), documentIds.stream()); + documentIds.clear(); + } catch (IOException 
e) { + log.error("Could not send relations to Neo4j endpoint {}", url, e); + throw new AnalysisEngineProcessException(e); + } + } + + /** + *

Iterates through the FlattenedRelations in the JCas and creates an intermediate representation that is primarily meant to group relations together that are basically the same. Then we can just count them instead of sending duplicates to the server.

+ * + * @param aJCas The JCas to get relations from. + * @return The grouped relations. + */ + private Map> getEquivalentRelationGroups(JCas aJCas) { + // Maps relation types to the complete relations. + Map> relationCounts = new HashMap<>(); + for (FlattenedRelation fr : aJCas.getAnnotationIndex(FlattenedRelation.type)) { + Iterator cmIt = StreamSupport.stream(fr.getArguments().spliterator(), false) + .map(ArgumentMention.class::cast) + .map(ArgumentMention::getRef) + .map(ConceptMention.class::cast) + .iterator(); + Set unificationArgs = new HashSet<>(); + // Add all arguments to the relation object. So there could be 1, 2, 3 or even more arguments. + while (cmIt.hasNext()) { + ConceptMention cm = cmIt.next(); + FSArray resourceEntryList = cm.getResourceEntryList(); + if (resourceEntryList != null) { + ResourceEntry resourceEntry = (ResourceEntry) resourceEntryList.get(0); + String id = resourceEntry.getEntryId(); + String source = resourceEntry.getSource(); + if (globalSource == null) + unificationArgs.add(new UnificationArgument(id, source)); + else + unificationArgs.add(new UnificationArgument(id)); + } + } + if (unificationArgs.size() > 1) { + UnificationRelation rel = new UnificationRelation(fr.getRootRelation().getSpecificType(), unificationArgs); + relationCounts.compute(rel.getRelationType(), (k, v) -> v != null ? 
v : HashMultiset.create()).add(rel); + } + } + return relationCounts; + } + + private class UnificationRelation { + private String relationType; + private Set args; + + public UnificationRelation(String relationType, Set args) { + this.relationType = relationType; + this.args = args; + } + + public ImportIERelation toImportRelation(int count) { + return ImportIERelation.of(count, () -> args.stream().map(UnificationArgument::toImportArgument).iterator()); + } + + public String getRelationType() { + return relationType; + } + + public Set getArgs() { + return args; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + UnificationRelation that = (UnificationRelation) o; + return relationType.equals(that.relationType) && + args.equals(that.args); + } + + @Override + public int hashCode() { + return Objects.hash(relationType, args); + } + } + + private class UnificationArgument { + private String id; + private String source; + + public UnificationArgument(String id) { + this.id = id; + } + + public UnificationArgument(String id, String source) { + this.id = id; + this.source = source; + } + + public ImportIERelationArgument toImportArgument() { + return source != null ? 
ImportIERelationArgument.of(id, source) : ImportIERelationArgument.of(id); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + UnificationArgument that = (UnificationArgument) o; + return id.equals(that.id) && + Objects.equals(source, that.source); + } + + @Override + public int hashCode() { + return Objects.hash(id, source); + } + + public String getId() { + return id; + } + + public String getSource() { + return source; + } + } + + +} diff --git a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml new file mode 100644 index 000000000..4e1449c27 --- /dev/null +++ b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml @@ -0,0 +1,88 @@ + + + org.apache.uima.java + true + de.julielab.jcore.consumer.neo4jrelations.Neo4jRelationsConsumer + + JCoRe Neo4j Relations Consumer + This component assumes that a Neo4j server with the julielab-neo4j-plugins-concepts plugin installed is available. It then sends FlattenedRelation instances with more than one argument to Neo4j. Note that this requires the event arguments to have a ResourceEntry list to obtain database concept IDs from. + 2.6.0 + JULIE Lab, Germany + JULIE Lab + + + URL + The complete URL to the endpoint of the Neo4j server for relation insertion. + String + false + true + + + IdProperty + The ID property to look up concept nodes in the Neo4j graph. Common options are 'id', 'sourceIds' and 'originalId'. You must know to which ID type the ResourceEntry objects of the relation arguments refer to. + String + false + true + + + ConceptSource + Optional. 
Sets the global source for the concept IDs taken from the ResourceEntry instances of the relation arguments. This causes the 'source' feature of the ResourceEntry objects to be omitted and to globally use the specified source instead. This causes the Neo4j database plugin to resolve the provided argument IDs against the source specified here. + String + false + false + + + Neo4jUser + Optional. The Neo4j server user name. + String + false + false + + + Neo4jPassword + Optional. The Neo4j server password. + String + false + false + + + WriteBatchSize + The number of processed CASes after which the relation data should be flushed into the database. Defaults to 50. + Integer + false + false + + + + + WriteBatchSize + + 50 + + + + + + + + + + + + + + + + de.julielab.jcore.types.EventMention + + + + + + + true + true + false + + + \ No newline at end of file diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java new file mode 100644 index 000000000..174a19537 --- /dev/null +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java @@ -0,0 +1,107 @@ + +package de.julielab.jcore.consumer.neo4jrelations; + +import de.julielab.jcore.types.pubmed.Header; +import de.julielab.neo4j.plugins.Indexes; +import de.julielab.neo4j.plugins.concepts.ConceptLookup; +import de.julielab.neo4j.plugins.concepts.ConceptManager; +import de.julielab.neo4j.plugins.datarepresentation.*; +import de.julielab.neo4j.plugins.datarepresentation.constants.FacetConstants; +import de.julielab.neo4j.plugins.datarepresentation.util.ConceptsJsonSerializer; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import 
org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.neo4j.graphdb.GraphDatabaseService; +import org.neo4j.graphdb.Node; +import org.neo4j.graphdb.RelationshipType; +import org.neo4j.graphdb.Transaction; +import org.neo4j.harness.junit.rule.Neo4jRule; +import org.neo4j.test.server.HTTP; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.stream.Stream; + +import static de.julielab.jcore.consumer.neo4jrelations.Neo4jRelationsConsumerTest.addFlattenedRelation1ToCas; +import static de.julielab.jcore.consumer.neo4jrelations.Neo4jRelationsConsumerTest.addFlattenedRelation2ToCas; +import static de.julielab.neo4j.plugins.constants.semedico.SemanticRelationConstants.PROP_DOC_IDS; +import static de.julielab.neo4j.plugins.datarepresentation.constants.ConceptConstants.PROP_SRC_IDS; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.neo4j.configuration.GraphDatabaseSettings.DEFAULT_DATABASE_NAME; + +/** + * Unit tests for jcore-neo4j-relations-consumer. 
+ * + */ +public class Neo4jRelationsConsumerIntegrationTest { + private final static Logger log = LoggerFactory.getLogger(Neo4jRelationsConsumerIntegrationTest.class); + @ClassRule + public static Neo4jRule neo4j = new Neo4jRule() + .withUnmanagedExtension("/concepts", ConceptManager.class).withFixture(graphDatabaseService -> { + new Indexes(null).createIndexes(graphDatabaseService); + return null; + }); + + @BeforeClass + public static void beforeClass() throws Exception { + ImportFacet facet = new ImportFacet(new ImportFacetGroup("FG"), "myfacet", "myfacet", "myfacet", FacetConstants.SRC_TYPE_HIERARCHICAL); + ImportConcept c11 = new ImportConcept("concept11", new ConceptCoordinates("id11", "source11", CoordinateType.SRC)); + ImportConcept c12 = new ImportConcept("concept12", new ConceptCoordinates("id12", "source12", CoordinateType.SRC)); + ImportConcept c13 = new ImportConcept("concept13", new ConceptCoordinates("id13", "source13", CoordinateType.SRC)); + ImportConcept c21 = new ImportConcept("concept21", new ConceptCoordinates("id21", "source21", CoordinateType.SRC)); + ImportConcept c22 = new ImportConcept("concept22", new ConceptCoordinates("id22", "source22", CoordinateType.SRC)); + ImportConcepts importConcepts = new ImportConcepts(Stream.of(c11, c12, c13, c21, c22), facet); + String uri = neo4j.httpURI().resolve("concepts/" + ConceptManager.CM_REST_ENDPOINT+"/"+ConceptManager.INSERT_CONCEPTS).toString(); + log.debug("Sending concepts to {}", uri); + HTTP.Response response = HTTP.POST(uri, ConceptsJsonSerializer.toJsonTree(importConcepts)); + log.debug("Response to test concepts import: {}", response); + assertEquals(200, response.status()); + } + + @Test + public void insertEventMentions() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", 
"de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + Header h = new Header(jCas); + h.setDocId("testdoc"); + h.addToIndexes(); + addFlattenedRelation1ToCas(jCas); + // Here is a duplicate. It should be recognized and just be counted up + addFlattenedRelation2ToCas(jCas); + addFlattenedRelation2ToCas(jCas); + + AnalysisEngine engine = AnalysisEngineFactory.createEngine( + "de.julielab.jcore.consumer.neo4jrelations.desc.jcore-neo4j-relations-consumer", + Neo4jRelationsConsumer.PARAM_URL, neo4j.httpURI().resolve("concepts/" + ConceptManager.CM_REST_ENDPOINT+"/"+ConceptManager.INSERT_IE_RELATIONS).toString(), + Neo4jRelationsConsumer.PARAM_ID_PROPERTY, "sourceIds"); + + engine.process(jCas); + engine.collectionProcessComplete(); + + GraphDatabaseService graphDb = neo4j.databaseManagementService().database(DEFAULT_DATABASE_NAME); + try (Transaction tx = graphDb.beginTx()) { + Node id11 = ConceptLookup.lookupSingleConceptBySourceId(tx, "id11"); + // There should be connections to 12 and 13. + assertThat(id11.getRelationships(RelationshipType.withName("regulation"))).hasSize(2); + assertThat(id11.getRelationships(RelationshipType.withName("regulation"))).flatExtracting(r -> List.of((String[]) r.getProperty(PROP_DOC_IDS))).containsExactly("testdoc", "testdoc"); + assertThat(id11.getRelationships(RelationshipType.withName("regulation"))).extracting(r -> r.getOtherNode(id11).getProperty(PROP_SRC_IDS+0)).containsExactlyInAnyOrder("id12", "id13"); + + Node id13 = ConceptLookup.lookupSingleConceptBySourceId(tx, "id13"); + // There should be connections to 11 and 12. 
+ assertThat(id13.getRelationships(RelationshipType.withName("regulation"))).hasSize(2); + assertThat(id13.getRelationships(RelationshipType.withName("regulation"))).flatExtracting(r -> List.of((String[]) r.getProperty(PROP_DOC_IDS))).containsExactly("testdoc", "testdoc"); + assertThat(id13.getRelationships(RelationshipType.withName("regulation"))).extracting(r -> r.getOtherNode(id13).getProperty(PROP_SRC_IDS+0)).containsExactlyInAnyOrder("id11", "id12"); + + Node id22 = ConceptLookup.lookupSingleConceptBySourceId(tx, "id22"); + // There should be connections to 21 + assertThat(id22.getRelationships(RelationshipType.withName("regulation"))).hasSize(1); + assertThat(id22.getRelationships(RelationshipType.withName("regulation"))).flatExtracting(r -> List.of((String[]) r.getProperty(PROP_DOC_IDS))).containsExactly("testdoc"); + assertThat(id22.getRelationships(RelationshipType.withName("regulation"))).extracting(r -> r.getOtherNode(id22).getProperty(PROP_SRC_IDS+0)).containsExactlyInAnyOrder("id21"); + } + } +} diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java new file mode 100644 index 000000000..6e242d25d --- /dev/null +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java @@ -0,0 +1,147 @@ + +package de.julielab.jcore.consumer.neo4jrelations; + +import de.julielab.jcore.types.ArgumentMention; +import de.julielab.jcore.types.ConceptMention; +import de.julielab.jcore.types.EventMention; +import de.julielab.jcore.types.ResourceEntry; +import de.julielab.jcore.types.ext.FlattenedRelation; +import de.julielab.jcore.types.pubmed.Header; +import de.julielab.jcore.utility.JCoReTools; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelation; +import 
de.julielab.neo4j.plugins.datarepresentation.ImportIERelationArgument; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationDocument; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.factory.UimaContextFactory; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import java.lang.reflect.Method; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Unit tests for jcore-neo4j-relations-consumer. + * + */ +public class Neo4jRelationsConsumerTest { + + + @Test + public void insertEventMentions() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + Header h = new Header(jCas); + h.setDocId("testdoc"); + h.addToIndexes(); + Neo4jRelationsConsumer engine = new Neo4jRelationsConsumer(); + engine.initialize(UimaContextFactory.createUimaContext(Neo4jRelationsConsumer.PARAM_URL, "", Neo4jRelationsConsumer.PARAM_ID_PROPERTY, "sourceIds")); + addFlattenedRelation1ToCas(jCas); + // Here is a duplicate. 
It should be recognized and just be counted up + addFlattenedRelation2ToCas(jCas); + addFlattenedRelation2ToCas(jCas); + + Method m = Neo4jRelationsConsumer.class.getDeclaredMethod("convertRelations", JCas.class); + m.setAccessible(true); + ImportIERelationDocument relations = (ImportIERelationDocument) m.invoke(engine, jCas); + assertThat(relations).extracting(ImportIERelationDocument::getRelations).isNotNull(); + assertThat(relations.getRelations()).hasSize(1); + List regulations = relations.getRelations().get("regulation"); + assertThat(regulations).hasSize(2); + assertThat(regulations.get(0)).extracting(ImportIERelation::getCount).isEqualTo(1); + assertThat(regulations.get(1)).extracting(ImportIERelation::getCount).isEqualTo(2); + assertThat(regulations).flatExtracting(ImportIERelation::getArgs).flatExtracting(ImportIERelationArgument::getId).containsExactlyInAnyOrder("id11", "id12", "id13", "id21", "id22"); + assertThat(regulations).flatExtracting(ImportIERelation::getArgs).flatExtracting(ImportIERelationArgument::getSource).containsExactlyInAnyOrder("source11", "source12", "source13", "source21", "source22"); + } + + @Test + public void insertEventMentionsGlobalSource() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + Header h = new Header(jCas); + h.setDocId("testdoc"); + h.addToIndexes(); + Neo4jRelationsConsumer engine = new Neo4jRelationsConsumer(); + engine.initialize(UimaContextFactory.createUimaContext(Neo4jRelationsConsumer.PARAM_URL, "", Neo4jRelationsConsumer.PARAM_ID_PROPERTY, "sourceIds", Neo4jRelationsConsumer.PARAM_SOURCE, "globalSource")); + addFlattenedRelation1ToCas(jCas); + addFlattenedRelation2ToCas(jCas); + + Method m = Neo4jRelationsConsumer.class.getDeclaredMethod("convertRelations", JCas.class); + m.setAccessible(true); + 
ImportIERelationDocument relations = (ImportIERelationDocument) m.invoke(engine, jCas); + assertThat(relations).extracting(ImportIERelationDocument::getRelations).isNotNull(); + assertThat(relations.getRelations()).hasSize(1); + List regulations = relations.getRelations().get("regulation"); + assertThat(regulations).hasSize(2); + // With the global source set, the individual sources are left out + assertThat(regulations).flatExtracting(ImportIERelation::getArgs).flatExtracting(ImportIERelationArgument::getSource).containsExactlyInAnyOrder(null, null, null, null, null); + } + + /** + * Adds a FlattenedRelation with three arguments. + * @param jCas The CAS. + */ + public static void addFlattenedRelation1ToCas(JCas jCas) { + FlattenedRelation fr = new FlattenedRelation(jCas); + EventMention rootEm = new EventMention(jCas); + rootEm.setSpecificType("regulation"); + fr.setRootRelation(rootEm); + + ArgumentMention am1 = new ArgumentMention(jCas); + ConceptMention cm1 = new ConceptMention(jCas); + ResourceEntry re1 = new ResourceEntry(jCas); + re1.setEntryId("id11"); + re1.setSource("source11"); + cm1.setResourceEntryList(JCoReTools.addToFSArray(null, re1)); + am1.setRef(cm1); + + ArgumentMention am2 = new ArgumentMention(jCas); + ConceptMention cm2 = new ConceptMention(jCas); + ResourceEntry re2 = new ResourceEntry(jCas); + re2.setEntryId("id12"); + re2.setSource("source12"); + cm2.setResourceEntryList(JCoReTools.addToFSArray(null, re2)); + am2.setRef(cm2); + + ArgumentMention am3 = new ArgumentMention(jCas); + ConceptMention cm3 = new ConceptMention(jCas); + ResourceEntry re3 = new ResourceEntry(jCas); + re3.setEntryId("id13"); + re3.setSource("source13"); + cm3.setResourceEntryList(JCoReTools.addToFSArray(null, re3)); + am3.setRef(cm3); + + fr.setArguments(JCoReTools.addToFSArray(null, List.of(am1, am2, am3))); + fr.addToIndexes(); + } + + /** + * Adds a FlattenedRelation with two arguments. + * @param jCas The CAS. 
+ */ + public static void addFlattenedRelation2ToCas(JCas jCas) { + FlattenedRelation fr = new FlattenedRelation(jCas); + EventMention rootEm = new EventMention(jCas); + rootEm.setSpecificType("regulation"); + fr.setRootRelation(rootEm); + + ArgumentMention am1 = new ArgumentMention(jCas); + ConceptMention cm1 = new ConceptMention(jCas); + ResourceEntry re1 = new ResourceEntry(jCas); + re1.setEntryId("id21"); + re1.setSource("source21"); + cm1.setResourceEntryList(JCoReTools.addToFSArray(null, re1)); + am1.setRef(cm1); + + ArgumentMention am2 = new ArgumentMention(jCas); + ConceptMention cm2 = new ConceptMention(jCas); + ResourceEntry re2 = new ResourceEntry(jCas); + re2.setEntryId("id22"); + re2.setSource("source22"); + cm2.setResourceEntryList(JCoReTools.addToFSArray(null, re2)); + am2.setRef(cm2); + + fr.setArguments(JCoReTools.addToFSArray(null, List.of(am1, am2))); + fr.addToIndexes(); + } + +} diff --git a/jcore-nlmgene-reader/BioC.dtd b/jcore-nlmgene-reader/BioC.dtd new file mode 100644 index 000000000..8bd0d55ca --- /dev/null +++ b/jcore-nlmgene-reader/BioC.dtd @@ -0,0 +1,158 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/jcore-nlmgene-reader/LICENSE b/jcore-nlmgene-reader/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-nlmgene-reader/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-nlmgene-reader/README.md b/jcore-nlmgene-reader/README.md new file mode 100644 index 000000000..4b24b0fe9 --- /dev/null +++ b/jcore-nlmgene-reader/README.md @@ -0,0 +1,48 @@ +# JCoRe Component Skeleton +`Text that describes the component in brevity...` + +**Descriptor Path**: +``` +de.julielab.jcore.{reader, ae, consumer}.NAME.desc.ARTIFACT-NAME +``` + +`More thorough description` +`Are there any requirements or dependencies for this component?` + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. 
Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +NLM-Gene annotation code meanings (taken from the file `NLM-Gene-Annotation-Guidelines.docx` on the FTP server linked in the paper): + +* 000: Mention is not explicitly linked to a species; use the gene ID of the mention at another text position where the species is specified. +* 111: The given ID is actually the ID of an ortholog of it because the gene does not yet have an ID in NCBI Gene. The used ID should stem from the article, if such an ortholog is mentioned there. +* 222: This is a family/group/class of genes. Annotate with all the gene IDs of that family/group/class that appear in the same article. +* 333: This is a family/group/class but none of its members were used in the abstract. Use some family member gene that belongs to the main organism discussed in the article. This code is also used for references to protein domains. +* 444: This is a protein complex. Analogous to families, use the ID of the subunits mentioned in the article. +* 555: This is a protein complex without mentions of subunits in the same article. Use the ID of some subunit that belongs to the main organism of the abstract. + +Gene annotations with multiple IDs: +* for enumerations with ellipsis, IDs are separated by semicolons +* for other text phrases that have multiple IDs, IDs are separated by commas +* for some IDs, their homologene-ID is also given, separated by a pipe (this does not seem to be documented anywhere; for this reason, the homologene-ID is stripped by this reader) + +[1] Islamaj, R., Wei, C. H., Cissel, D., Miliaras, N., Printseva, O., Rodionov, O., … Lu, Z. (2021). NLM-Gene, a richly annotated gold standard dataset for gene entities that addresses ambiguity and multi-species gene recognition. Journal of Biomedical Informatics, 118(March), 103779. 
https://doi.org/10.1016/j.jbi.2021.103779 diff --git a/jcore-nlmgene-reader/component.meta b/jcore-nlmgene-reader/component.meta new file mode 100644 index 000000000..57b636559 --- /dev/null +++ b/jcore-nlmgene-reader/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "reader" + ], + "description": "Collection reader for the BioC format of the NLM-Gene corpus.", + "descriptors": [ + { + "category": "reader", + "location": "de.julielab.jcore.reader.nlmgene.desc.jcore-nlmgene-reader" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-nlmgene-reader", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe NLM-Gene Reader" +} diff --git a/jcore-nlmgene-reader/pom.xml b/jcore-nlmgene-reader/pom.xml new file mode 100644 index 000000000..ce98b1697 --- /dev/null +++ b/jcore-nlmgene-reader/pom.xml @@ -0,0 +1,64 @@ + + + + 4.0.0 + jcore-nlmgene-reader + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0 + + + + + + + org.assertj + assertj-core + + + com.pengyifan.bioc + pengyifan-bioc + 1.0.3 + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-descriptor-creator + + + de.julielab + jcore-types + ${jcore-types-version} + + + org.junit.jupiter + junit-jupiter-engine + + + JCoRe NLM-Gene Reader + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-nlmgene-reader + Collection reader for the BioC format of the NLM-Gene corpus. 
+ + + BSD 2-Clause + https://opensource.org/licenses/BSD-2-Clause + + + diff --git a/jcore-nlmgene-reader/src/main/java/de/julielab/jcore/reader/nlmgene/NLMGeneReader.java b/jcore-nlmgene-reader/src/main/java/de/julielab/jcore/reader/nlmgene/NLMGeneReader.java new file mode 100644 index 000000000..f7aaba55f --- /dev/null +++ b/jcore-nlmgene-reader/src/main/java/de/julielab/jcore/reader/nlmgene/NLMGeneReader.java @@ -0,0 +1,187 @@ +package de.julielab.jcore.reader.nlmgene; + +import com.pengyifan.bioc.BioCAnnotation; +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.BioCDocument; +import com.pengyifan.bioc.BioCPassage; +import com.pengyifan.bioc.io.BioCCollectionReader; +import de.julielab.jcore.types.Gene; +import de.julielab.jcore.types.ResourceEntry; +import de.julielab.jcore.types.Title; +import de.julielab.jcore.types.pubmed.AbstractText; +import de.julielab.jcore.types.pubmed.Header; +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collections; +import java.util.Iterator; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +@ResourceMetaData(name = "JCoRe NLM-Gene Reader", description = "Collection reader for the BioC format of the NLM-Gene corpus.", vendor = "JULIE Lab Jena, Germany") 
+@TypeCapability(inputs = {}, outputs = {"de.julielab.jcore.types.Gene", "de.julielab.jcore.types.ResourceEntry"}) +public class NLMGeneReader extends JCasCollectionReader_ImplBase { + + public static final String PARAM_INPUT_DIR = "InputDirectory"; + public static final String PARAM_ID_LIST_PATH = "IdList"; + private final static Logger log = LoggerFactory.getLogger(NLMGeneReader.class); + @ConfigurationParameter(name = PARAM_INPUT_DIR, description = "Path to the directory that contains the BioC XML files of the NLM-Gene corpus.") + private String inputDir; + @ConfigurationParameter(name = PARAM_ID_LIST_PATH, mandatory = false, description = "Path to a file with a list of IDs to restrict the read files to. This will typically be the list with IDs for the training or for the test set of the corpus. When no list is specified, the whole corpus is read.") + private String idList; + private Iterator corpusFileIterator; + private int numRead; + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. + */ + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + super.initialize(context); + inputDir = (String) context.getConfigParameterValue(PARAM_INPUT_DIR); + idList = (String) context.getConfigParameterValue(PARAM_ID_LIST_PATH); + try { + corpusFileIterator = readInputFiles(inputDir, idList); + } catch (IOException e) { + log.error("Could not read NLM-Gene corpus input files.", e); + throw new ResourceInitializationException(e); + } + numRead = 0; + } + + private Iterator readInputFiles(String inputDir, String idList) throws IOException { + Path inputPath = Path.of(inputDir); + Path idListPath = idList != null ? Path.of(idList) : null; + Set ids = idListPath != null && Files.exists(idListPath) ? 
Files.readAllLines(idListPath).stream().collect(Collectors.toSet()) : Collections.emptySet(); + return Files.list(inputPath) + .filter(p -> p.toString().toLowerCase().endsWith(".xml") || p.toString().toLowerCase().endsWith(".xml.gz")) + .filter(p -> ids.isEmpty() ? true : ids.contains(p.getFileName().toString().replaceAll("(?i)\\.bioc\\.xml(\\.gz)?", ""))) + .iterator(); + } + + /** + * This method is called for each document going through the component. This + * is where the actual work happens. + */ + @Override + public void getNext(JCas jCas) throws CollectionException { + final Path nextFile = corpusFileIterator.next(); + try { + final BioCCollectionReader reader = new BioCCollectionReader(nextFile); + final BioCCollection collection = reader.readCollection(); + if (collection.getDocmentCount() > 1) + throw new IllegalArgumentException("A single document per BioC collection is expected but the collection of file " + nextFile + " has " + collection.getDocmentCount() + " documents. This case is not supported."); + final BioCDocument document = collection.getDocument(0); + + handleHeader(jCas, document); + StringBuilder textBuilder = new StringBuilder(); + for (BioCPassage p : document.getPassages()) { + int previousTextLength = textBuilder.length(); + textBuilder.append(p.getText().get()); + handlePassageStructureType(jCas, textBuilder, p, previousTextLength); + handleAnnotation(jCas, document, p, textBuilder); + textBuilder.append(System.getProperty("line.separator")); + } + + jCas.setDocumentText(textBuilder.toString()); + } catch (XMLStreamException | IOException e) { + log.error("Could not read NLM-Gene corpus file {}", nextFile, e); + throw new CollectionException(e); + } + } + + private void handleHeader(JCas jCas, BioCDocument document) { + final Header h = new Header(jCas); + h.setDocId(document.getID()); + h.setComponentId(getClass().getSimpleName()); + h.setSource("NLM-Gene"); + h.addToIndexes(); + } + + private void handleAnnotation(JCas jCas, 
BioCDocument document, BioCPassage p, StringBuilder textBuilder) { + for (BioCAnnotation a : p.getAnnotations()) { + final Gene g = new Gene(jCas, a.getTotalLocation().getOffset(), a.getTotalLocation().getOffset() + a.getTotalLocation().getLength()); + g.setComponentId(getClass().getSimpleName()); + final Optional typeInfon = a.getInfon("type"); + final Optional codeInfon = a.getInfon("code"); + handleErrors(document, p, a, g, typeInfon, textBuilder); + handleGeneId(jCas, a, g); + handleSpecificType(g, typeInfon, codeInfon); + g.addToIndexes(); + } + } + + private void handleSpecificType(Gene g, Optional typeInfon, Optional codeInfon) { + g.setSpecificType(typeInfon.get()); + if (codeInfon.isPresent()) + g.setSpecificType(typeInfon.get() + "-" + codeInfon.get()); + } + + private void handleErrors(BioCDocument document, BioCPassage p, BioCAnnotation a, Gene g, Optional typeInfon, StringBuilder textBuilder) { +// if (typeInfon.isPresent() && !(typeInfon.get().equals("Gene") || typeInfon.get().equals("GENERIF"))) +// throw new IllegalStateException("The annotation " + a.getID() + " of passage " + p.getInfon("type").get() + " of document " + document.getID() + " was neither of type Gene nor GENERIF. 
but '" + typeInfon.get() + "'"); + if (!typeInfon.isPresent()) + throw new IllegalStateException("The annotation " + a.getID() + " of passage " + p.getInfon("type").get() + " of document " + document.getID() + " does not specify a type."); +// if (!textBuilder.substring(g.getBegin(), g.getEnd()).equals(a.getText().get())) +// throw new IllegalStateException("The annotation " + a.getID() + " of passage " + p.getInfon("type").get() + " of document " + document.getID() + " has the covered text " + textBuilder.substring(g.getBegin(), g.getEnd()) + " but should have the text " + a.getText().get() + " according to the BioC XML information."); + } + + private void handleGeneId(JCas jCas, BioCAnnotation a, Gene g) { + final Optional ncbiGeneId = a.getInfon("NCBI Gene identifier"); + if (ncbiGeneId.isPresent()) { + final ResourceEntry re = new ResourceEntry(jCas, g.getBegin(), g.getEnd()); + re.setEntryId(ncbiGeneId.get()); + // for a few cases, the ID looks like this: 8074|10771 (gene name FGF23) + // it seems that the first number is the NCBI Gene ID and the second is the homologene ID. 
We omit the + // homologene ID for know, we don't use it + if (ncbiGeneId.get().contains("|")) + re.setEntryId(ncbiGeneId.get().split("\\|")[0]); + re.setComponentId(getClass().getSimpleName()); + final FSArray resourceEntryList = new FSArray(jCas, 1); + resourceEntryList.set(0, re); + g.setResourceEntryList(resourceEntryList); + } + } + + private void handlePassageStructureType(JCas jCas, StringBuilder textBuilder, BioCPassage p, int previousTextLength) { + final Optional typeInfon = p.getInfon("type"); + if (typeInfon.isPresent() && typeInfon.get().equals("title")) { + final Title t = new Title(jCas, previousTextLength, textBuilder.length()); + t.setTitleType("document"); + t.setComponentId(getClass().getSimpleName()); + t.addToIndexes(); + } else if (typeInfon.isPresent() && typeInfon.get().equals("abstract")) { + final AbstractText abstractText = new AbstractText(jCas, previousTextLength, textBuilder.length()); + abstractText.setComponentId(getClass().getSimpleName()); + abstractText.addToIndexes(); + } + } + + @Override + public Progress[] getProgress() { + return new Progress[]{new ProgressImpl(numRead, 0, "documents")}; + } + + @Override + public boolean hasNext() { + return corpusFileIterator.hasNext(); + } + +} diff --git a/jcore-nlmgene-reader/src/main/resources/de/julielab/jcore/reader/nlmgene/desc/PLACEHOLDER b/jcore-nlmgene-reader/src/main/resources/de/julielab/jcore/reader/nlmgene/desc/PLACEHOLDER new file mode 100644 index 000000000..e4b0b196a --- /dev/null +++ b/jcore-nlmgene-reader/src/main/resources/de/julielab/jcore/reader/nlmgene/desc/PLACEHOLDER @@ -0,0 +1,4 @@ +The actual descriptor must be created by UIMA fit. +For this purpose, use UIMAfit annotations to annotate the reader component class. +Then employ the jcore-descriptor-creator's main method to build the descriptor from the reader class. +The jcore-descriptor-creator is already on the classpath as a Maven dependency. 
diff --git a/jcore-nlmgene-reader/src/main/resources/de/julielab/jcore/reader/nlmgene/desc/jcore-nlmgene-reader.xml b/jcore-nlmgene-reader/src/main/resources/de/julielab/jcore/reader/nlmgene/desc/jcore-nlmgene-reader.xml new file mode 100644 index 000000000..9bf087ce7 --- /dev/null +++ b/jcore-nlmgene-reader/src/main/resources/de/julielab/jcore/reader/nlmgene/desc/jcore-nlmgene-reader.xml @@ -0,0 +1,51 @@ + + + org.apache.uima.java + de.julielab.jcore.reader.nlmgene.NLMGeneReader + + JCoRe NLM-Gene Reader + Collection reader for the BioC format of the NLM-Gene corpus. + 2.6.0 + JULIE Lab Jena, Germany + + + InputDirectory + Path to the directory that contains the BioC XML files of the NLM-Gene corpus. + String + false + true + + + IdList + Path to a file with a list of IDs to restrict the read files to. This will typically be the list with IDs for the training or for the test set of the corpus. When no list is specified, the whole corpus is read. + String + false + false + + + + + + + + + + + + + + + + de.julielab.jcore.types.Gene + de.julielab.jcore.types.ResourceEntry + + + + + + true + false + true + + + \ No newline at end of file diff --git a/jcore-nlmgene-reader/src/test/java/de/julielab/jcore/reader/nlmgene/NLMGeneReaderTest.java b/jcore-nlmgene-reader/src/test/java/de/julielab/jcore/reader/nlmgene/NLMGeneReaderTest.java new file mode 100644 index 000000000..a346cdb75 --- /dev/null +++ b/jcore-nlmgene-reader/src/test/java/de/julielab/jcore/reader/nlmgene/NLMGeneReaderTest.java @@ -0,0 +1,75 @@ + +package de.julielab.jcore.reader.nlmgene; + +import de.julielab.jcore.types.Gene; +import de.julielab.jcore.types.ResourceEntry; +import de.julielab.jcore.types.Title; +import de.julielab.jcore.types.pubmed.AbstractText; +import de.julielab.jcore.types.pubmed.Header; +import org.apache.uima.UIMAException; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; 
+import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +/** + * Unit tests for jcore-nlmgene-reader. + * @author + * + */ +public class NLMGeneReaderTest{ + + private final static Logger log = LoggerFactory.getLogger(NLMGeneReaderTest.class); + + @Test + public void testReader() throws Exception { + final JCas jCas = getJCas(); + final CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.reader.nlmgene.desc.jcore-nlmgene-reader", NLMGeneReader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input").toString()); + assertThat(reader.hasNext()).isTrue(); + reader.getNext(jCas.getCas()); + assertThat(reader.hasNext()).isFalse(); + final Header header = JCasUtil.selectSingle(jCas, Header.class); + assertThat(header.getDocId()).isEqualTo("12461077"); + final Title title = JCasUtil.selectSingle(jCas, Title.class); + assertThat(title).extracting(Title::getBegin, Title::getEnd).isEqualTo(List.of(0, 151)); + final AbstractText abstractText = JCasUtil.selectSingle(jCas, AbstractText.class); + assertThat(abstractText).extracting(AbstractText::getBegin, AbstractText::getEnd).isEqualTo(List.of(152, 2168)); + final List genes = new ArrayList<>(JCasUtil.select(jCas, Gene.class)); + assertThat(genes).hasSize(39); + final Gene firstGene = genes.get(0); + assertThat(firstGene).extracting(Gene::getCoveredText).isEqualTo("ICSBP"); + assertThat(firstGene.getResourceEntryList()).isNotNull().isNotEmpty(); + assertThat(firstGene.getResourceEntryList(0)).extracting(ResourceEntry::getEntryId).isEqualTo("15900"); + + final Gene secondGene = genes.get(9); + assertThat(secondGene).extracting(Gene::getCoveredText).isEqualTo("CD11c"); + 
assertThat(secondGene.getResourceEntryList(0)).extracting(ResourceEntry::getEntryId).isEqualTo("16411"); + } + + private JCas getJCas() throws UIMAException { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-semantics-biology-types"); + return jCas; + } + + @Test + public void testReadFilesOnList() throws Exception{ + final CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.reader.nlmgene.desc.jcore-nlmgene-reader", + NLMGeneReader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input").toString(), + NLMGeneReader.PARAM_ID_LIST_PATH, Path.of("src", "test", "resources", "input", "listWithTestDoc.txt").toString()); + assertThat(reader.hasNext()).isTrue(); + + final CollectionReader reader2 = CollectionReaderFactory.createReader("de.julielab.jcore.reader.nlmgene.desc.jcore-nlmgene-reader", + NLMGeneReader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input").toString(), + NLMGeneReader.PARAM_ID_LIST_PATH, Path.of("src", "test", "resources", "input", "listWithoutTestDoc.txt").toString()); + assertThat(reader2.hasNext()).isFalse(); + } +} diff --git a/jcore-nlmgene-reader/src/test/resources/input/12461077.BioC.XML b/jcore-nlmgene-reader/src/test/resources/input/12461077.BioC.XML new file mode 100644 index 000000000..5347ff6f6 --- /dev/null +++ b/jcore-nlmgene-reader/src/test/resources/input/12461077.BioC.XML @@ -0,0 +1,3 @@ +PubTator2020-12-04BioC.key +12461077title0ICSBP is essential for the development of mouse type I interferon-producing cells and for the generation and activation of CD8alpha(+) dendritic cells.15900GENERIFICSBP33315977Genetype I interferon12525GeneCD8alphaabstract152Interferon (IFN) consensus sequence-binding protein (ICSBP) is a transcription factor playing a critical role in the regulation of lineage commitment, especially in myeloid cell differentiation. 
In this study, we have characterized the phenotype and activation pattern of subsets of dendritic cells (DCs) in ICSBP(-/-) mice. Remarkably, the recently identified mouse IFN-producing cells (mIPCs) were absent in all lymphoid organs from ICSBP(-/-) mice, as revealed by lack of CD11c(low)B220(+)Ly6C(+)CD11b(-) cells. In parallel, CD11c(+) cells isolated from ICSBP(-/-) spleens were unable to produce type I IFNs in response to viral stimulation. ICSBP(-/-) mice also displayed a marked reduction of the DC subset expressing the CD8alpha marker (CD8alpha(+) DCs) in spleen, lymph nodes, and thymus. Moreover, ICSBP(-/-) CD8alpha(+) DCs exhibited a markedly impaired phenotype when compared with WT DCs. They expressed very low levels of costimulatory molecules (intercellular adhesion molecule [ICAM]-1, CD40, CD80, CD86) and of the T cell area-homing chemokine receptor CCR7, whereas they showed higher levels of CCR2 and CCR6, as revealed by reverse transcription PCR. In addition, these cells were unable to undergo full phenotypic activation upon in vitro culture in presence of maturation stimuli such as lipopolysaccharide or poly (I:C), which paralleled with lack of Toll-like receptor (TLR)3 mRNA expression. Finally, cytokine expression pattern was also altered in ICSBP(-/-) DCs, as they did not express interleukin (IL)-12p40 or IL-15, but they displayed detectable IL-4 mRNA levels. 
On the whole, these results indicate that ICSBP is a crucial factor in the regulation of two possibly linked processes: (a) the development and activity of mIPCs, whose lack in ICSBP(-/-) mice may explain their high susceptibility to virus infections; (b) the generation and activation of CD8alpha(+) DCs, whose impairment in ICSBP(-/-) mice can be responsible for the defective generation of a Th1 type of immune response.00015900GeneInterferon (IFN) consensus sequence-binding protein15900GENERIFICSBP22215900Genetranscription factor15900GENERIFICSBP33315978GeneIFN15900GENERIFICSBP16411GeneCD11c19264GeneB22017067GeneLy6C16409GeneCD11b16411GeneCD11c15900GENERIFICSBP33315977Genetype I IFNs15900GENERIFICSBP12525GeneCD8alpha12525GeneCD8alpha15900GENERIFICSBP12525GeneCD8alpha15894Geneintercellular adhesion molecule [ICAM]-121939GeneCD4012519GeneCD8012524GeneCD8622212458,12772,12775Genechemokine receptor12775GeneCCR712772GeneCCR212458GeneCCR6142980GeneToll-like receptor (TLR)322216160,16168,16189Genecytokine15900GENERIFICSBP16160Geneinterleukin (IL)-12p4016168GeneIL-1516189GeneIL-415900GENERIFICSBP15900GENERIFICSBP12525GeneCD8alpha15900GENERIFICSBP + diff --git a/jcore-nlmgene-reader/src/test/resources/input/listWithTestDoc.txt b/jcore-nlmgene-reader/src/test/resources/input/listWithTestDoc.txt new file mode 100644 index 000000000..32547ffca --- /dev/null +++ b/jcore-nlmgene-reader/src/test/resources/input/listWithTestDoc.txt @@ -0,0 +1 @@ +12461077 \ No newline at end of file diff --git a/jcore-nlmgene-reader/src/test/resources/input/listWithoutTestDoc.txt b/jcore-nlmgene-reader/src/test/resources/input/listWithoutTestDoc.txt new file mode 100644 index 000000000..7b4d68d70 --- /dev/null +++ b/jcore-nlmgene-reader/src/test/resources/input/listWithoutTestDoc.txt @@ -0,0 +1 @@ +empty \ No newline at end of file diff --git a/jcore-opennlp-chunk-ae/component.meta b/jcore-opennlp-chunk-ae/component.meta index 202885b41..472b579c9 100644 --- a/jcore-opennlp-chunk-ae/component.meta 
+++ b/jcore-opennlp-chunk-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-chunk-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Open NLP Chunker" } diff --git a/jcore-opennlp-chunk-ae/pom.xml b/jcore-opennlp-chunk-ae/pom.xml index d691531cd..6ef500507 100644 --- a/jcore-opennlp-chunk-ae/pom.xml +++ b/jcore-opennlp-chunk-ae/pom.xml @@ -14,7 +14,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -53,8 +53,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-opennlp-chunk-ae/src/main/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotator.java b/jcore-opennlp-chunk-ae/src/main/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotator.java index ff1ba4fdc..42a163349 100644 --- a/jcore-opennlp-chunk-ae/src/main/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotator.java +++ b/jcore-opennlp-chunk-ae/src/main/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotator.java @@ -38,6 +38,7 @@ import java.io.*; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Map; @@ -210,23 +211,26 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { FSIterator tokenIterator = tokenIndex.subiterator(sentence); //get number of Tokens contained in Sentence and move iterator back to beginning - int numTokens = 0; - while (tokenIterator.isValid()){ - numTokens++; - tokenIterator.moveToNext(); - } - tokenIterator.moveToFirst(); - Token[] tokenArray = new Token[numTokens]; - String[] tokenTextArray = new String[numTokens]; - String[] tagArray = new String[numTokens]; +// int numTokens = 0; +// while (tokenIterator.isValid()){ +// numTokens++; +// tokenIterator.moveToNext(); +// } +// tokenIterator.moveToFirst(); +// Token[] tokenArray = new Token[numTokens]; +// String[] tokenTextArray = new String[numTokens]; +// String[] 
tagArray = new String[numTokens]; + java.util.List tokensInSentence = new ArrayList<>(); + java.util.List tokenTags = new ArrayList<>(); int i = 0; // iterate over Tokens in current sentence while (tokenIterator.hasNext()) { Token token = (Token) tokenIterator.next(); - tokenArray[i] = token; - tokenTextArray[i] = token.getCoveredText(); + tokensInSentence.add(token); +// tokenArray[i] = token; +// tokenTextArray[i] = token.getCoveredText(); POSTag postag = null; // if a POS TagSet preference exists try to get a correspondent POSTag for the current token if (posTagSetPreference != null) { @@ -241,14 +245,15 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { LOGGER.error("Token has no POS tag annotation: " + token.getCoveredText()); throw new AnalysisEngineProcessException(); } - tagArray[i] = postag.getValue(); +// tagArray[i] = postag.getValue(); + tokenTags.add(postag.getValue()); i++; } // OpenNLP Chunker predicts chunks - String[] chunks = chunker.chunk(tokenTextArray, tagArray); - - createChunkAnnotations(chunks, tokenArray, aJCas); +// String[] chunks = chunker.chunk(tokenTextArray, tagArray); + String[] chunks = chunker.chunk(tokensInSentence.stream().map(Token::getCoveredText).toArray(String[]::new), tokenTags.toArray(String[]::new)); + createChunkAnnotations(chunks, tokensInSentence.toArray(Token[]::new), aJCas); } } diff --git a/jcore-opennlp-chunk-ae/src/test/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotatorTest.java b/jcore-opennlp-chunk-ae/src/test/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotatorTest.java index b87f1ab61..08be7f7ab 100644 --- a/jcore-opennlp-chunk-ae/src/test/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotatorTest.java +++ b/jcore-opennlp-chunk-ae/src/test/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotatorTest.java @@ -21,7 +21,6 @@ import de.julielab.jcore.types.PennBioIEPOSTag; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; 
import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.AnalysisEngineFactory; @@ -33,6 +32,7 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,13 +41,13 @@ import java.util.function.BiConsumer; import java.util.stream.Collectors; -public class ChunkAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ChunkAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(ChunkAnnotatorTest.class); - protected void setUp() throws Exception { - super.setUp(); - } String text = "A study on the Prethcamide hydroxylation system in rat hepatic microsomes ."; @@ -84,6 +84,7 @@ private void initCas(JCas jcas) { } } + @Test public void testProcess() { XMLInputSource chunkerXML = null; @@ -134,7 +135,7 @@ public void testProcess() { assertEquals(chunks, predictedChunks); } - + @Test public void testProcessWithDefaultMappings() { XMLInputSource chunkerXML = null; @@ -185,7 +186,7 @@ public void testProcessWithDefaultMappings() { assertEquals(chunks, predictedChunks); } - + @Test public void testPunctuation() throws Exception { JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); AnalysisEngine chunker = AnalysisEngineFactory.createEngine("ChunkAnnotatorTest"); diff --git a/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTest.xml b/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTest.xml index a9d5953cf..a3e373d28 100644 --- a/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTest.xml +++ b/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTest.xml @@ -6,7 +6,7 @@ jcore-openlp-chunk-ae - 2.5.1-SNAPSHOT + 
2.6.0 julielab diff --git a/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTestDefaultMappings.xml b/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTestDefaultMappings.xml index 8d522d208..d4281c635 100644 --- a/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTestDefaultMappings.xml +++ b/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTestDefaultMappings.xml @@ -6,7 +6,7 @@ jcore-openlp-chunk-ae - 2.5.1-SNAPSHOT + 2.6.0 julielab diff --git a/jcore-opennlp-parser-ae/component.meta b/jcore-opennlp-parser-ae/component.meta index 8233a2b6f..cdb9e0e2f 100644 --- a/jcore-opennlp-parser-ae/component.meta +++ b/jcore-opennlp-parser-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-parser-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe OpenNLP Constituency Parser" } diff --git a/jcore-opennlp-parser-ae/pom.xml b/jcore-opennlp-parser-ae/pom.xml index 87af33491..18be8ac33 100644 --- a/jcore-opennlp-parser-ae/pom.xml +++ b/jcore-opennlp-parser-ae/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -102,8 +102,8 @@ 1.6.0 - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-opennlp-parser-ae/src/main/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser.xml b/jcore-opennlp-parser-ae/src/main/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser.xml index ca499d279..e98a8e50f 100644 --- a/jcore-opennlp-parser-ae/src/main/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser.xml +++ b/jcore-opennlp-parser-ae/src/main/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser.xml @@ -6,7 +6,7 @@ JCoRe OpenNLP Constituency Parser AE -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-opennlp-parser-ae/src/test/java/de/julielab/jcore/ae/opennlpparser/main/ParseAnnotatorTest.java 
b/jcore-opennlp-parser-ae/src/test/java/de/julielab/jcore/ae/opennlpparser/main/ParseAnnotatorTest.java index 0f0cd1315..6955ce7c3 100644 --- a/jcore-opennlp-parser-ae/src/test/java/de/julielab/jcore/ae/opennlpparser/main/ParseAnnotatorTest.java +++ b/jcore-opennlp-parser-ae/src/test/java/de/julielab/jcore/ae/opennlpparser/main/ParseAnnotatorTest.java @@ -18,7 +18,6 @@ package de.julielab.jcore.ae.opennlpparser.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -26,23 +25,19 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Iterator; -public class ParseAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ParseAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(ParseAnnotatorTest.class); private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; - @Override - protected void setUp() throws Exception { - super.setUp(); - // set log4j properties file - // PropertyConfigurator.configure(LOGGER_PROPERTIES); - } - String text = "A study on the Prethcamide hydroxylation system in rat hepatic microsomes ."; String wantedCons = "NP NP PP NP NP PP NP "; @@ -68,6 +63,7 @@ public void initCas(JCas jcas) { } } + @Test public void testProcess() { boolean annotationsOK = true; diff --git a/jcore-opennlp-parser-ae/src/test/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser-test.xml b/jcore-opennlp-parser-ae/src/test/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser-test.xml index 5943431f4..cacd88573 100644 --- 
a/jcore-opennlp-parser-ae/src/test/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser-test.xml +++ b/jcore-opennlp-parser-ae/src/test/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser-test.xml @@ -6,7 +6,7 @@ JCoRe OpenNLP Parser Test -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-opennlp-postag-ae/component.meta b/jcore-opennlp-postag-ae/component.meta index 4f3b87ffb..792b8ced1 100644 --- a/jcore-opennlp-postag-ae/component.meta +++ b/jcore-opennlp-postag-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-postag-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe OpenNLP POS Tagger" } diff --git a/jcore-opennlp-postag-ae/pom.xml b/jcore-opennlp-postag-ae/pom.xml index 77abc3243..d59112f75 100644 --- a/jcore-opennlp-postag-ae/pom.xml +++ b/jcore-opennlp-postag-ae/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -76,8 +76,8 @@ provided - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe OpenNLP POS Tagger diff --git a/jcore-opennlp-postag-ae/src/main/resources/de/julielab/jcore/ae/opennlppostag/desc/jcore-opennlppostag.xml b/jcore-opennlp-postag-ae/src/main/resources/de/julielab/jcore/ae/opennlppostag/desc/jcore-opennlppostag.xml index 9a7640c32..b7927c192 100644 --- a/jcore-opennlp-postag-ae/src/main/resources/de/julielab/jcore/ae/opennlppostag/desc/jcore-opennlppostag.xml +++ b/jcore-opennlp-postag-ae/src/main/resources/de/julielab/jcore/ae/opennlppostag/desc/jcore-opennlppostag.xml @@ -6,7 +6,7 @@ JCoRe OpenNLP POS Tagger - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagAnnotatorTest.java b/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagAnnotatorTest.java index d2db4293f..ebdeb2c5b 100644 --- 
a/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagAnnotatorTest.java +++ b/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagAnnotatorTest.java @@ -33,15 +33,15 @@ import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Collection; import java.util.Iterator; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class PosTagAnnotatorTest { diff --git a/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagDictCreatorTest.java b/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagDictCreatorTest.java index 22dd88ad2..d7b8f6742 100644 --- a/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagDictCreatorTest.java +++ b/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagDictCreatorTest.java @@ -13,7 +13,7 @@ import opennlp.tools.postag.POSDictionary; import opennlp.tools.postag.POSSample; import org.apache.commons.io.FileUtils; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -23,8 +23,8 @@ import java.util.List; import java.util.Set; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class PosTagDictCreatorTest { @Test diff --git a/jcore-opennlp-postag-ae/src/test/resources/PosTagAnnotatorTest.xml b/jcore-opennlp-postag-ae/src/test/resources/PosTagAnnotatorTest.xml index 1c6b115ca..c40b894e7 100644 
--- a/jcore-opennlp-postag-ae/src/test/resources/PosTagAnnotatorTest.xml +++ b/jcore-opennlp-postag-ae/src/test/resources/PosTagAnnotatorTest.xml @@ -6,7 +6,7 @@ jcore-opennlp-postag-ae - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-opennlp-sentence-ae/component.meta b/jcore-opennlp-sentence-ae/component.meta index 33e67cb8d..15519490d 100644 --- a/jcore-opennlp-sentence-ae/component.meta +++ b/jcore-opennlp-sentence-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-sentence-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe OpenNLP Sentence Splitter" } diff --git a/jcore-opennlp-sentence-ae/pom.xml b/jcore-opennlp-sentence-ae/pom.xml index d2e778487..bfbbabdf0 100644 --- a/jcore-opennlp-sentence-ae/pom.xml +++ b/jcore-opennlp-sentence-ae/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -41,8 +41,8 @@ slf4j-api - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe OpenNLP Sentence Splitter diff --git a/jcore-opennlp-sentence-ae/src/test/java/de/julielab/jcore/ae/jsentsplit/SentenceAnnotatorTest.java b/jcore-opennlp-sentence-ae/src/test/java/de/julielab/jcore/ae/jsentsplit/SentenceAnnotatorTest.java index 3dcbbef41..6aacdf297 100644 --- a/jcore-opennlp-sentence-ae/src/test/java/de/julielab/jcore/ae/jsentsplit/SentenceAnnotatorTest.java +++ b/jcore-opennlp-sentence-ae/src/test/java/de/julielab/jcore/ae/jsentsplit/SentenceAnnotatorTest.java @@ -18,7 +18,6 @@ package de.julielab.jcore.ae.jsentsplit; import de.julielab.jcore.types.Sentence; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -26,12 +25,15 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; import java.util.Iterator; -public class SentenceAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class SentenceAnnotatorTest { /** * Logger for this class @@ -43,10 +45,7 @@ public class SentenceAnnotatorTest extends TestCase { String offsets = "0-15;16-32;"; - protected void setUp() throws Exception { - super.setUp(); - } - + @Test public void testProcess() { XMLInputSource sentenceXML = null; diff --git a/jcore-opennlp-sentence-ae/src/test/resources/SentenceAnnotatorTest.xml b/jcore-opennlp-sentence-ae/src/test/resources/SentenceAnnotatorTest.xml index 127ce56d8..49bd07b2d 100644 --- a/jcore-opennlp-sentence-ae/src/test/resources/SentenceAnnotatorTest.xml +++ b/jcore-opennlp-sentence-ae/src/test/resources/SentenceAnnotatorTest.xml @@ -6,7 +6,7 @@ jcore-opennlp-sentence-ae sentence splitter based on opennlp -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-opennlp-token-ae/component.meta b/jcore-opennlp-token-ae/component.meta index 373b7c246..02ee26d5d 100644 --- a/jcore-opennlp-token-ae/component.meta +++ b/jcore-opennlp-token-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-token-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe OpenNLP Tokenizer" } diff --git a/jcore-opennlp-token-ae/desc/TokenAnnotator.xml b/jcore-opennlp-token-ae/desc/TokenAnnotator.xml index a8eecd2b1..459b6dac9 100644 --- a/jcore-opennlp-token-ae/desc/TokenAnnotator.xml +++ b/jcore-opennlp-token-ae/desc/TokenAnnotator.xml @@ -6,7 +6,7 @@ jcore-opennlp-token-ae -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-opennlp-token-ae/pom.xml b/jcore-opennlp-token-ae/pom.xml index 3145d63a6..5343425aa 100644 --- a/jcore-opennlp-token-ae/pom.xml +++ b/jcore-opennlp-token-ae/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -40,8 +40,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe OpenNLP Tokenizer 
diff --git a/jcore-opennlp-token-ae/src/test/java/de/julielab/jcore/ae/opennlp/token/TokenAnnotatorTest.java b/jcore-opennlp-token-ae/src/test/java/de/julielab/jcore/ae/opennlp/token/TokenAnnotatorTest.java index 04ab72c43..f42582429 100644 --- a/jcore-opennlp-token-ae/src/test/java/de/julielab/jcore/ae/opennlp/token/TokenAnnotatorTest.java +++ b/jcore-opennlp-token-ae/src/test/java/de/julielab/jcore/ae/opennlp/token/TokenAnnotatorTest.java @@ -19,7 +19,6 @@ import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -32,7 +31,9 @@ import java.util.Iterator; -public class TokenAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TokenAnnotatorTest { private static final Logger LOGGER = LoggerFactory .getLogger(TokenAnnotatorTest.class); diff --git a/jcore-opennlp-token-ae/src/test/resources/TokenAnnotatorTest.xml b/jcore-opennlp-token-ae/src/test/resources/TokenAnnotatorTest.xml index 2ab75743c..490c73e69 100644 --- a/jcore-opennlp-token-ae/src/test/resources/TokenAnnotatorTest.xml +++ b/jcore-opennlp-token-ae/src/test/resources/TokenAnnotatorTest.xml @@ -6,7 +6,7 @@ jcore-opennlp-token-ae -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-pmc-db-reader/LICENSE b/jcore-pmc-db-reader/LICENSE new file mode 100644 index 000000000..d0f946a29 --- /dev/null +++ b/jcore-pmc-db-reader/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2022, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-pmc-db-reader/README.md b/jcore-pmc-db-reader/README.md new file mode 100644 index 000000000..f97bc30d2 --- /dev/null +++ b/jcore-pmc-db-reader/README.md @@ -0,0 +1,34 @@ +# JCoRe Pubmed Central DB Reader + +**Descriptor Path**: +``` +de.julielab.jcore.reader.pmc.desc.jcore-pmc-db-reader +``` + +JeDIS database reader for PMC base documents. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. 
Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? diff --git a/jcore-pmc-db-reader/component.meta b/jcore-pmc-db-reader/component.meta new file mode 100644 index 000000000..3cb02b74f --- /dev/null +++ b/jcore-pmc-db-reader/component.meta @@ -0,0 +1,25 @@ +{ + "categories": [ + "multiplier", + "reader" + ], + "description": "JeDIS database reader for PMC base documents.", + "descriptors": [ + { + "category": "multiplier", + "location": "de.julielab.jcore.multiplier.pmc.desc.jcore-pmc-db-multiplier" + }, + { + "category": "reader", + "location": "de.julielab.jcore.multiplier.pmc.desc.jcore-pmc-db-multiplier-reader" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-pmc-db-reader", + "groupId": "de.julielab", + "version": "2.6.0" + }, + "name": "JCoRe PubMed Central DB Reader" +} diff --git a/jcore-pmc-db-reader/pom.xml b/jcore-pmc-db-reader/pom.xml new file mode 100644 index 000000000..65e909de5 --- /dev/null +++ b/jcore-pmc-db-reader/pom.xml @@ -0,0 +1,71 @@ + + + + 4.0.0 + jcore-pmc-db-reader + jar + de.julielab + + + de.julielab + jedis-parent + 2.6.0 + ../jedis-parent + + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-descriptor-creator + + + de.julielab + jcore-db-reader + 2.6.0 + + + de.julielab + jcore-pmc-reader + 2.6.0 + + + de.julielab + jcore-types + ${jcore-types-version} + + + org.junit.jupiter + junit-jupiter-engine + + + de.julielab + jcore-db-test-utilities + + + org.assertj + assertj-core + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + JCoRe PubMed Central DB Reader + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-pmc-db-reader + JeDIS database reader for PMC base documents. 
+ diff --git a/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java new file mode 100644 index 000000000..4648f81a2 --- /dev/null +++ b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java @@ -0,0 +1,227 @@ +package de.julielab.jcore.multiplier.pmc; + +import de.julielab.costosys.configuration.FieldConfig; +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.jcore.reader.db.DBMultiplier; +import de.julielab.jcore.reader.db.DBReader; +import de.julielab.jcore.reader.pmc.CasPopulator; +import de.julielab.jcore.reader.pmc.NoDataAvailableException; +import de.julielab.jcore.reader.pmc.PMCReaderBase; +import de.julielab.jcore.reader.pmc.parser.ElementParsingException; +import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.types.pubmed.Header; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.*; +import java.util.stream.Collectors; + +public class PMCDBMultiplier extends DBMultiplier { + public static final String PARAM_OMIT_BIB_REFERENCES = 
PMCReaderBase.PARAM_OMIT_BIB_REFERENCES; + public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; + public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; + public static final String PARAM_TABLE_DOCUMENT_SCHEMA = "DocumentTableSchema"; + public static final String PARAM_TO_VISIT_KEYS = "ToVisitKeys"; + public static final String PARAM_TRUNCATE_AT_SIZE = "TruncateAtSize"; + private final static Logger log = LoggerFactory.getLogger(PMCDBMultiplier.class); + @ConfigurationParameter(name = PARAM_OMIT_BIB_REFERENCES, mandatory = false, defaultValue = "false", description = "If set to true, references to the bibliography are omitted from the CAS text.") + protected boolean omitBibReferences; + @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS is directly routed to the components specified in the " + PARAM_TO_VISIT_KEYS + " parameter, skipping all other components. Note that this only works with AAEs where the first component is an 'AnnotationControlledFlow'.") + private String documentItemToHash; + @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, mandatory = false, description = "For use with AnnotationDefinedFlowController. String parameter indicating the name of the " + + "table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. 
The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController.") + private String xmiStorageDataTable; + @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT_SCHEMA, mandatory = false, description = "For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the " + PARAM_TABLE_DOCUMENT + " parameter - adheres to. Only the primary key part is required for hash value retrieval.") + private String xmiStorageDataTableSchema; + @ConfigurationParameter(name = PARAM_TO_VISIT_KEYS, mandatory = false, description = "For use with AnnotationDefinedFlowController. The delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. This is, however, the task of the AnnotationDefinedFlowController.") + private String[] toVisitKeys; + @ConfigurationParameter(name = PARAM_TRUNCATE_AT_SIZE, mandatory = false, description = "The maximum number of characters allowed in the document text. Characters exceeding this size are discarded. This can be necessary when large documents cannot be handled by subsequent components in the pipeline. 
Defaults to Integer.MAX_VALUE.") + private int truncationSize; + + private CasPopulator casPopulator; + private Map docId2HashMap; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + xmiStorageDataTable = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT); + xmiStorageDataTableSchema = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT_SCHEMA); + documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); + toVisitKeys = (String[]) aContext.getConfigParameterValue(PARAM_TO_VISIT_KEYS); + omitBibReferences = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_OMIT_BIB_REFERENCES)).orElse(false); + truncationSize = Optional.ofNullable((Integer)aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(Integer.MAX_VALUE); + // We don't know yet which tables to read. Thus, we leave the row mapping out. + // We will now once the DBMultiplier#process(JCas) will have been run. + initialized = false; + + if (!(xmiStorageDataTable == null && xmiStorageDataTableSchema == null) && !(xmiStorageDataTable != null && xmiStorageDataTableSchema != null && documentItemToHash != null)) { + String errorMsg = String.format("From the parameters '%s' and '%s' some are specified and some aren't. To activate hash value comparison in order to add aggregate component keys for CAS visit, specify all those parameters. 
Otherwise, specify none.", PARAM_TABLE_DOCUMENT, PARAM_TABLE_DOCUMENT_SCHEMA); + log.error(errorMsg); + throw new ResourceInitializationException(new IllegalArgumentException(errorMsg)); + } + + try { + casPopulator = new CasPopulator(omitBibReferences, truncationSize); + } catch (IOException e) { + String errorMsg = "Could not initialize the PMC CasPopulator."; + log.error(errorMsg); + throw new ResourceInitializationException(e); + } + } + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException { + docId2HashMap = fetchCurrentHashesFromDatabase(JCasUtil.selectSingle(aJCas, RowBatch.class)); + super.process(aJCas); + } + + @Override + public AbstractCas next() throws AnalysisEngineProcessException { + JCas jCas = getEmptyJCas(); + try { + if (documentDataIterator.hasNext()) { + byte[][] documentData = documentDataIterator.next(); + String pkString = DBReader.setDBProcessingMetaData(dbc, readDataTable, tableName, documentData, jCas); + populateCas(jCas, documentData, pkString); + setToVisitAnnotation(jCas, pkString); + } + } catch (Exception e) { + log.error("Exception occurred: ", e); + throw new AnalysisEngineProcessException(e); + } + return jCas; + } + + private void populateCas(JCas jCas, byte[][] documentData, String pkString) throws NoDataAvailableException, ElementParsingException { + List pkIndices = dbc.getPrimaryKeyIndices(); + + // get index of xmlData; + // assumes that only one byte[] in arrayArray contains this data + // and that this byte[] is at the only index position that holds no + // primary key + List allIndices = new ArrayList(); + for (int i = 0; i < documentData.length; i++) { + allIndices.add(i); + } + List xmlIndices = new ArrayList<>(allIndices); + for (Integer pkIndex : pkIndices) + xmlIndices.remove(pkIndex); + int xmlIndex = xmlIndices.get(0); + try { + casPopulator.populateCas(new ByteArrayInputStream(documentData[xmlIndex]), jCas); + } catch (Exception e) { + log.error("Could not parse document {}.", 
pkString, e); + throw e; + } + // It actually happens that some PMC XML documents do not contain their own ID. We can use the ID obtained + // via the database primary key, which in turn might be derived from the original file name or some meta file. + Header header = JCasUtil.selectSingle(jCas, Header.class); + if (StringUtils.isBlank(header.getDocId())) { + log.debug("Document has no docId set. Derived the ID {} from the primary key and setting it as the Header#docId feature.", pkString); + header.setDocId(pkString); + } + } + + /** + *

Fetches the hashes of the currently stored documents in the database.

+ * + * @param rowBatch The annotation specifying which documents should be fetched by the multiplier and then be processed by the aggregate. + * @return A map from a string representation of the RowBatches document IDs to the hashes for the respective IDs. + * @throws AnalysisEngineProcessException If the SQL request fails. + */ + private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { + if (dbc == null) + dbc = getDataBaseConnector(rowBatch.getCostosysConfiguration()); + if (xmiStorageDataTable != null && dbc.tableExists(xmiStorageDataTable) && rowBatch.getIdentifiers() != null && rowBatch.getIdentifiers().size() > 0) { + String hashColumn = documentItemToHash + "_sha256"; + // Extract the document IDs in this RowBatch. The IDs could be composite keys. + List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); + Iterator documentIDsIt = rowBatch.getIdentifiers().iterator(); + while (documentIDsIt.hasNext()) { + StringArray pkArray = (StringArray) documentIDsIt.next(); + documentIds.add(pkArray.toStringArray()); + } + Map id2hash = new HashMap<>(documentIds.size()); + // This is the map we want to fill that lets us look up the hash of the document text by document ID. + String sql = null; + // Query the database for the document IDs in the current RowBatch and retrieve hashes. 
+ try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + FieldConfig xmiTableSchema = dbc.getFieldConfiguration(xmiStorageDataTableSchema); + String idQuery = documentIds.stream() + .map(key -> Arrays.stream(key).map(part -> "%s='" + part + "'").toArray(String[]::new)) + .map(xmiTableSchema::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) + .collect(Collectors.joining(" OR ")); + sql = String.format("SELECT %s,%s FROM %s WHERE %s", xmiTableSchema.getPrimaryKeyString(), hashColumn, xmiStorageDataTable, idQuery); + ResultSet rs = conn.createStatement().executeQuery(sql); + while (rs.next()) { + StringBuilder pkSb = new StringBuilder(); + for (int i = 0; i < xmiTableSchema.getPrimaryKey().length; i++) + pkSb.append(rs.getString(i + 1)).append(','); + // Remove trailing comma + pkSb.deleteCharAt(pkSb.length() - 1); + String hash = rs.getString(xmiTableSchema.getPrimaryKey().length + 1); + id2hash.put(pkSb.toString(), hash); + } + } catch (SQLException e) { + log.error("Could not retrieve hashes from the database. SQL query was '{}':", sql, e); + throw new AnalysisEngineProcessException(e); + } + return id2hash; + } + return null; + } + + /** + *

Creates a {@link ToVisit} annotation based on document text hash comparison and the defined parameter values.

+ *

Computes the hash of the newly read CAS and compares it to the hash for the same document retrieved from the + * database, if present. If there was a hash in the database and the hash values are equal, creates the ToVisit + * annotation and adds the toVisitKeys passed in the configuration of this component.

+ * + * @param jCas The newly read JCas. + * @param pkString + */ + private void setToVisitAnnotation(JCas jCas, String pkString) { + if (xmiStorageDataTable != null && xmiStorageDataTable != null) { + String existingHash = docId2HashMap.get(pkString); + if (existingHash != null) { + String newHash = getHash(jCas); + if (existingHash.equals(newHash)) { + if (log.isTraceEnabled()) + log.trace("Document {} has a document text hash that equals the one present in the database. Creating a ToVisit annotation routing it only to the components with delegate keys {}.", pkString, toVisitKeys); + ToVisit toVisit = new ToVisit(jCas); + if (toVisitKeys != null && toVisitKeys.length != 0) { + StringArray keysArray = new StringArray(jCas, toVisitKeys.length); + keysArray.copyFromArray(toVisitKeys, 0, 0, toVisitKeys.length); + toVisit.setDelegateKeys(keysArray); + } + toVisit.addToIndexes(); + } + } else { + log.trace("No existing hash was found for document {}", pkString); + } + } + } + + private String getHash(JCas newCas) { + final String documentText = newCas.getDocumentText(); + final byte[] sha = DigestUtils.sha256(documentText.getBytes()); + return Base64.encodeBase64String(sha); + } +} diff --git a/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml new file mode 100644 index 000000000..66c46729a --- /dev/null +++ b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml @@ -0,0 +1,191 @@ + + + org.apache.uima.java + de.julielab.jcore.reader.db.DBMultiplierReader + + JCoRe PMC Database Multiplier Reader + A collection reader that receives the IDs of documents from a database table. Additional tables may + be specified which will, together with the IDs, be sent to a CAS multiplier extending the DBMultiplierReader. 
+ The multiplier will read documents and the joined additional tables according to the list of document IDs + sent by this reader. The component leverages the corpus storage system (CoStoSys) for this purpose and is + part of the Jena Document Information System, JeDIS. + + 2.6.0 + JULIE Lab Jena, Germany + + + ResetTable + If set to true and the parameter 'Table' is set to a subset table, the subset table will be + reset at the initialization of the reader to be ready for processing of the whole subset. Do not use + when multiple readers read the same subset table. + + Boolean + false + false + + + Timestamp + PostgreSQL timestamp expression that is evaluated against the data table. The data table + schema, which must be the active data table schema in the CoStoSys configuration as always, must + specify a single timestamp field for this parameter to work. Only data rows with a timestamp value + larger than the given timestamp expression will be processed. Note that when reading from a subset + table, there may be subset rows indicated to be in process which are finally not read from the data + table. This is an implementational shortcoming and might be addressed if respective feature requests + are given through the JULIE Lab GitHub page or JCoRe issues. + + String + false + false + + + FetchIdsProactively + If set to true and when reading from a subset table, batches of document IDs will be + retrieved in a background thread while the previous batch is already in process. This is meant to + minimize waiting time for the database. Deactivate this feature if you encounter issues with + database connections. + + Boolean + false + true + + + AdditionalTables + An array of table names. By default, the table names will be resolved against the active + data postgres schema configured in the CoStoSys configuration file. If a name is already schema + qualified, i.e. contains a dot, the active data schema will be ignored. 
When reading documents from + the document data table, the additional tables will be joined onto the data table using the primary + keys of the queried documents. Using the table schema for the additional documents defined by the + 'AdditionalTableSchema' parameter, the columns that are marked as 'retrieve=true' in the table + schema, are returned together with the main document data. This mechanism is most prominently used + to retrieve annotation table data together with the original document text in XMI format for the + JeDIS system. + + String + true + false + + + AdditionalTableSchemas + The table schemas that corresponds to the additional tables given with the + 'AdditionalTables' parameter. If only one schema name is given, that schema must apply to all + additional tables. + + String + true + false + + + BatchSize + + Integer + false + true + + + DBDriver + Currently unused because the Hikari JDBC library should recognize the correct driver. + However, there seem to be cases where this doesn't work (HSQLDB). So we keep the parameter for + later. When this issue comes up, the driver would have to be set manually. This isn't done right + now. + + String + false + false + + + Table + The data or subset database table to read from. The name will be resolved against the + active Postgres schema defined in the CoStoSys configuration file.However, if the name contains a + schema qualification (i.e. 'schemaname.tablename), the configuration file will be ignored in this + point. + + String + false + true + + + SelectionOrder + WARNING: Potential SQL injection vulnerability. Do not let unknown users interact with your + database with this component. An SQL ORDER clause specifying in which order the documents in the + target database table should be processed. Only the clause itself must be specified, the ORDER + keyword is automatically added. + + String + false + false + + + WhereCondition + WARNING: Potential SQL injection vulnerability. 
Do not let unknown users interact with your + database with this component. Only used when reading data tables directly. No effect when the + 'tableName' parameter specifies a subset table. The parameter value should be an SQL WHERE clause + restricting the documents to be read. Only the clause itself must be specified, the WHERE keyword is + added automatically. + + String + false + false + + + Limit + + Integer + false + false + + + CostosysConfigFile + File path or classpath resource location to the CoStoSys XML configuration. This + configuration must specify the table schema of the table referred to by the 'Table' parameter as + active table schema. The active table schema is always the schema of the data table that is either + queried directly for documents or, if 'tableName' points to a subset table, indirectly through the + subset table. Make also sure that the active database connection in the configuration points to the + correct database. + + String + false + true + + + + + ResetTable + + false + + + + FetchIdsProactively + + true + + + + BatchSize + + 50 + + + + SelectionOrder + + + + + + + + + + + + + + + + true + false + true + + + \ No newline at end of file diff --git a/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml new file mode 100644 index 000000000..c9f9ca13d --- /dev/null +++ b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml @@ -0,0 +1,79 @@ + + + org.apache.uima.java + true + de.julielab.jcore.multiplier.pmc.PMCDBMultiplier + + JCoRe PMC Database Multiplier + A multiplier that receives document IDs to read from a database table from the DBMultiplierReader. The reader also delivers the path to the corpus storage system (CoStoSys) configuration and additional tables for joining with the main data table. 
This multiplier class is abstract and cannot be used directly. Extending classes must implement the next() method to actually read documents from the database and populate CASes with them. This component is a part of the Jena Document Information System, JeDIS. + 2.6.0 + JULIE Lab Jena, Germany + JULIE Lab Jena, Germany + + + OmitBibliographyReferences + If set to true, references to the bibliography are omitted from the CAS text. + Boolean + false + false + + + AddShaHash + For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all components except the DBCheckpointAE to mark the document as processed. + String + false + false + + + DocumentTable + For use with AnnotationDefinedFlowController. String parameter indicating the name of the table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController. + String + false + false + + + DocumentTableSchema + For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the DocumentTable parameter - adheres to. Only the primary key part is required for hash value retrieval. + String + false + false + + + ToVisitKeys + For use with AnnotationDefinedFlowController. The delegate AE keys of the AEs this CAS should still be applied to although the hash has not changed. 
Can be null or empty indicating that no component should be applied to the CAS. This is, however, the task of the AnnotationDefinedFlowController. + String + true + false + + + TruncateAtSize + The maximum number of characters allowed in the document text. Characters exceeding this size are discarded. This can be necessary when large documents cannot be handled by subsequent components in the pipeline. Defaults to Integer.MAX_VALUE. + Integer + false + false + + + + + OmitBibliographyReferences + + false + + + + + + + + + + + + + + true + true + true + + + \ No newline at end of file diff --git a/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/ErrorTest.java b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/ErrorTest.java new file mode 100644 index 000000000..674d61685 --- /dev/null +++ b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/ErrorTest.java @@ -0,0 +1,39 @@ +package de.julielab.jcore.multiplier.pmc; + +import de.julielab.jcore.reader.db.DBMultiplierReader; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; + +/** + * This is not as much a test as it is a facility to check error cases in isolation. 
The existing code + * reads from an XML database table and parses the PMC document from there + */ +@Disabled +public class ErrorTest { + + @Test + public void errorTest() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types"); + CollectionReader reader = CollectionReaderFactory.createReader(DBMultiplierReader.class, DBMultiplierReader.PARAM_COSTOSYS_CONFIG_NAME, Path.of("src", "test", "resources", "costosys-errortest.xml").toString(), DBMultiplierReader.PARAM_TABLE, "_data.errordoc", DBMultiplierReader.PARAM_RESET_TABLE, true); + AnalysisEngine engine = AnalysisEngineFactory.createEngine(PMCDBMultiplier.class, PMCDBMultiplier.PARAM_OMIT_BIB_REFERENCES, true); + while (reader.hasNext()) { + reader.getNext(jCas.getCas()); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + while (jCasIterator.hasNext()) { + JCas next = jCasIterator.next(); + System.out.println(JCoReTools.getDocId(next)); + next.release(); + } + } + } +} diff --git a/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java new file mode 100644 index 000000000..83d7cf9f6 --- /dev/null +++ b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java @@ -0,0 +1,217 @@ +package de.julielab.jcore.multiplier.pmc; + + +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.jcore.db.test.DBTestUtils; +import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.commons.codec.binary.Base64; +import 
org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.PostgreSQLContainer; + +import java.io.File; +import java.nio.file.Path; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * This test is an adaption of the XMLDBMultiplierTest in jcore-xml-db-reader. It tests whether the hash code comparison + * works as intended. 
+ */ +public class PMCDBMultiplierHashComparisonTest { + + private static final String SOURCE_XML_TABLE = "source_xml_table"; + private static final String TARGET_XMI_TABLE = "target_xmi_table"; + private static final String PMCID_FIELD_NAME = "pmcid"; + private static final String DOCID_FIELD_NAME = "docid"; + private static final String XML_FIELD_NAME = "xml"; + private static final String BASE_DOCUMENT_FIELD_NAME = "base_document"; + private static final String HASH_FIELD_NAME = "documentText_sha256"; + private static final String MAX_XMI_ID_FIELD_NAME = "max_xmi_id"; + private static final String SOFA_MAPPING_FIELD_NAME = "sofa_mapping"; + private static final String SUBSET_TABLE = "test_subset"; + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); + private static String costosysConfig; + + @BeforeAll + public static void setup() throws SQLException, ConfigurationException { + postgres.start(); + DBTestUtils.createAndSetHiddenConfig(Path.of("src", "test", "resources", "hiddenConfig").toString(), postgres); + + DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("pmc"); + costosysConfig = DBTestUtils.createTestCostosysConfig("pmc", 2, postgres); + new File(costosysConfig).deleteOnExit(); + try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + // We create two tables. One is the XML table the multiplier reads from and maps the contents to the JCas. + // The other is a simulation of an XMI table used to serialize CAS instances via the jcore-xmi-db-writer. + // We need that target table to test the hash value comparison mechanism: If a document does not exist + // in the target table or has a non-matching hash on its document text, proceed as normal. + // But if the hash matches, we want to reserve the possibility to skip most part of the subsequent pipeline. 
 + // For this, we could use the AnnotationDefinedFlowController from jcore-flow-controllers. This controller + // looks for annotations of the ToVisit type that specify which exact components in an aggregate should + // be applied to the CAS carrying the ToVisit annotation. + prepareSourceXMLTable(dbc, conn); + prepareTargetXMITable(dbc, conn); + } + dbc.defineSubset(SUBSET_TABLE, SOURCE_XML_TABLE, "Test subset"); + assertThat(dbc.getNumRows(SOURCE_XML_TABLE)).isEqualTo(10); + assertThat(dbc.getNumRows(TARGET_XMI_TABLE)).isEqualTo(5); + + dbc.close(); + } + + private static void prepareSourceXMLTable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + String xmlFmt = "\n" + + "
%d42\n" + + "

This is text nr %d.

\n" + + "
"; + dbc.createTable(SOURCE_XML_TABLE, "Test table for hash comparison test."); + String sql = String.format("INSERT INTO %s (%s,%s) VALUES (?,XMLPARSE(CONTENT ?))", SOURCE_XML_TABLE, PMCID_FIELD_NAME, XML_FIELD_NAME); + PreparedStatement ps = conn.prepareStatement(sql); + for (int i = 0; i < 10; i++) { + String xml = String.format(xmlFmt, i, i); + ps.setString(1, String.valueOf(i)); + ps.setString(2, xml); + ps.addBatch(); + } + ps.executeBatch(); + } + + private static void prepareTargetXMITable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + // The PMC parser tries to format blocks of content using newlines which makes the test a bit awkward. + // The test might break if this formatting is changed. + String documentTextFmt = "This is text nr %d.\n"; + dbc.createTable(TARGET_XMI_TABLE, "xmi_text", "Test table for hash comparison test."); + dbc.assureColumnsExist(TARGET_XMI_TABLE, List.of(HASH_FIELD_NAME), "text"); + String sql = String.format("INSERT INTO %s (%s,%s,%s,%s,%s) VALUES (?,XMLPARSE(CONTENT ?),?,?,?)", TARGET_XMI_TABLE, DOCID_FIELD_NAME, BASE_DOCUMENT_FIELD_NAME, HASH_FIELD_NAME, MAX_XMI_ID_FIELD_NAME, SOFA_MAPPING_FIELD_NAME); + PreparedStatement ps = conn.prepareStatement(sql); + // Note that we only add half of the documents compared to the source XML import. This way we test + // if the code behaves right when the target document does not yet exist at all. + for (int i = 0; i < 5; i++) { + String xml = String.format(documentTextFmt, i); + ps.setString(1, String.valueOf(i)); + ps.setString(2, xml); + // For one document in the "target XMI" table we put in a wrong hash. Thus, this document should not trigger + // the "toVisit" mechanism. 
+ if (i != 3) + ps.setString(3, getHash(xml)); + else ps.setString(3, "someanotherhash"); + ps.setInt(4, 0); + ps.setString(5, "dummy"); + ps.addBatch(); + } + ps.executeBatch(); + } + + @AfterAll + public static void tearDown() { + postgres.stop(); + } + + private static String getHash(String str) { + final byte[] sha = DigestUtils.sha256(str.getBytes()); + return Base64.encodeBase64String(sha); + } + + /** + * Creates a JCas and adds a RowBatch for all 10 documents in the source XML table as well as the data table and subset table and schema names. + * + * @return A JCas prepared for the tests in this class. + * @throws UIMAException If some UIMA operation fails. + */ + private JCas prepareCas() throws UIMAException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.jcore-casflow-types"); + RowBatch rowBatch = new RowBatch(jCas); + StringArray dataTable = new StringArray(jCas, 1); + dataTable.set(0, SOURCE_XML_TABLE); + rowBatch.setTables(dataTable); + StringArray tableSchema = new StringArray(jCas, 1); + tableSchema.set(0, "pmc"); + rowBatch.setTableSchemas(tableSchema); + rowBatch.setTableName(SUBSET_TABLE); + FSArray pks = new FSArray(jCas, 10); + // Read all documents + for (int i = 0; i < 10; i++) { + StringArray pk = new StringArray(jCas, 1); + pk.set(0, String.valueOf(i)); + pks = JCoReTools.addToFSArray(pks, pk); + } + rowBatch.setIdentifiers(pks); + rowBatch.setCostosysConfiguration(costosysConfig); + rowBatch.addToIndexes(); + return jCas; + } + + @Test + public void testHashComparison() throws Exception { + // This simulates the PMC DB reader output: a cas that lists the primary keys of the 10 source XML documents, + // the names of the source XML table, the XMI target table etc. 
 + JCas jCas = prepareCas(); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); + AnalysisEngine engine = AnalysisEngineFactory.createEngine(PMCDBMultiplier.class, tsDesc, + PMCDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", + PMCDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + PMCDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text", + PMCDBMultiplier.PARAM_TO_VISIT_KEYS, "ThisIsTheVisitKey" + ); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List toVisitKeys = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + // Collect the ToVisitKeys from each CAS. We expect four CASes to have one, i.e. that the document text + // hash is the same as already existing in the target XMI document table, we added 5 XMI documents + // to the target table and for one we changed the hash code. 
 + Collection select = JCasUtil.select(newCas, ToVisit.class); + select.forEach(tv -> tv.getDelegateKeys().forEach(k -> toVisitKeys.add(k))); + newCas.release(); + } + // There are 4 documents in the target table with the correct hash, so we expect the delegate key 4 times + assertThat(toVisitKeys).containsExactly("ThisIsTheVisitKey", "ThisIsTheVisitKey", "ThisIsTheVisitKey", "ThisIsTheVisitKey"); + } + + @Test + public void testHashComparison2() throws Exception { + JCas jCas = prepareCas(); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); + // In this test, we do not specify the keys to visit; the whole subsequent pipeline should be skipped. + // To indicate that, there should be ToVisit annotations but their delegate keys should be null. 
+ AnalysisEngine engine = AnalysisEngineFactory.createEngine(PMCDBMultiplier.class, tsDesc, + PMCDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", + PMCDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + PMCDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text" + ); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List emptyToVisitAnnotation = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + Collection select = JCasUtil.select(newCas, ToVisit.class); + select.stream().filter(tv -> tv.getDelegateKeys() == null).forEach(emptyToVisitAnnotation::add); + newCas.release(); + } + // There are 4 documents in the target table with the correct hash so we expect the delegate key 4 times + assertThat(emptyToVisitAnnotation).hasSize(4); + } +} diff --git a/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierTest.java b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierTest.java new file mode 100644 index 000000000..dfba24034 --- /dev/null +++ b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierTest.java @@ -0,0 +1,121 @@ +package de.julielab.jcore.multiplier.pmc; + +import de.julielab.costosys.Constants; +import de.julielab.costosys.dbconnection.DBCIterator; +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.jcore.db.test.DBTestUtils; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.types.pubmed.Header; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import 
org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.cas.StringArray; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.PostgreSQLContainer; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Spliterators; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import static org.assertj.core.api.Assertions.assertThat; + +class PMCDBMultiplierTest { + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:" + DataBaseConnector.POSTGRES_VERSION); + private static String costosysConfig; + + @BeforeAll + public static void setup() throws ConfigurationException { + postgres.start(); + DBTestUtils.createAndSetHiddenConfig(Path.of("src", "test", "resources", "hiddenConfig").toString(), postgres); + + DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("pmc"); + costosysConfig = DBTestUtils.createTestCostosysConfig("pmc", 2, postgres); + new File(costosysConfig).deleteOnExit(); + dbc.withConnectionExecute(d -> d.createTable(Constants.DEFAULT_DATA_TABLE_NAME, "Test data table.")); + dbc.withConnectionExecute(d -> d.importFromXMLFile(Path.of("src", "test", "resources", "testdocs").toString(), Constants.DEFAULT_DATA_TABLE_NAME)); + dbc.withConnectionExecute(d -> d.createSubsetTable("testsubset", Constants.DEFAULT_DATA_TABLE_NAME, "Test subset.")); + dbc.withConnectionExecute(d -> d.initSubset("testsubset", Constants.DEFAULT_DATA_TABLE_NAME)); + assertThat(dbc.countRowsOfDataTable(Constants.DEFAULT_DATA_TABLE_NAME, null)); + DBCIterator documentIterator = (DBCIterator) dbc.withConnectionQuery(d -> d.queryDataTable(Constants.DEFAULT_DATA_TABLE_NAME, null)); + // check that the documents are actually in the database as expected + List docIds = 
StreamSupport.stream(Spliterators.spliteratorUnknownSize(documentIterator, 0), false).map(b -> new String(b[0], StandardCharsets.UTF_8)).collect(Collectors.toList()); + assertThat(docIds).containsExactlyInAnyOrder("PMC6949206", "PMC7511315"); + } + + @Test + public void next() throws Exception { + AnalysisEngine engine = AnalysisEngineFactory.createEngine(PMCDBMultiplier.class); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(prepareCas()); + List documentTexts = new ArrayList<>(); + List docIds = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + documentTexts.add(newCas.getDocumentText()); + docIds.add(JCasUtil.selectSingle(newCas, Header.class).getDocId()); + newCas.release(); + } + assertThat(docIds).containsExactlyInAnyOrder("PMC6949206", "PMC7511315"); + } + + @Test + public void truncateText() throws Exception { + AnalysisEngine engine = AnalysisEngineFactory.createEngine(PMCDBMultiplier.class, PMCDBMultiplier.PARAM_TRUNCATE_AT_SIZE, 20); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(prepareCas()); + List documentTexts = new ArrayList<>(); + List docIds = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + documentTexts.add(newCas.getDocumentText()); + final String docId = JCasUtil.selectSingle(newCas, Header.class).getDocId(); + docIds.add(docId); + newCas.release(); + } + assertThat(documentTexts).containsExactlyInAnyOrder("pmc\n" + + "Rescue of premat", "pmc\n" + + "Transcriptomic p"); + } + + /** + * Creates a JCas and adds a RowBatch for the test documents in the source XML table as well as the data table and subset table and schema names. + * + * @return A JCas prepared for the tests in this class. + * @throws UIMAException If some UIMA operation fails. 
+ */ + private JCas prepareCas() throws UIMAException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.jcore-casflow-types"); + RowBatch rowBatch = new RowBatch(jCas); + StringArray dataTable = new StringArray(jCas, 1); + dataTable.set(0, Constants.DEFAULT_DATA_TABLE_NAME); + rowBatch.setTables(dataTable); + StringArray tableSchema = new StringArray(jCas, 1); + tableSchema.set(0, "pmc"); + rowBatch.setTableSchemas(tableSchema); + rowBatch.setTableName("testsubset"); + FSArray pks = new FSArray(jCas, 2); + // Read all documents + List pkStrings = List.of("PMC6949206", "PMC7511315"); + for (String pkString : pkStrings) { + StringArray pk = new StringArray(jCas, 1); + pk.set(0, pkString); + pks = JCoReTools.addToFSArray(pks, pk); + } + rowBatch.setIdentifiers(pks); + rowBatch.setCostosysConfiguration(costosysConfig); + rowBatch.addToIndexes(); + return jCas; + } +} \ No newline at end of file diff --git a/jcore-pmc-db-reader/src/test/resources/costosys-errortest.xml b/jcore-pmc-db-reader/src/test/resources/costosys-errortest.xml new file mode 100644 index 000000000..e9788a2fa --- /dev/null +++ b/jcore-pmc-db-reader/src/test/resources/costosys-errortest.xml @@ -0,0 +1,24 @@ + + + + public + pmc_bulk_gzip + + + + + + + + + + + + + pmc_xml + 5 + + + + + \ No newline at end of file diff --git a/jcore-pmc-db-reader/src/test/resources/testdocs/PMC6949206.xml b/jcore-pmc-db-reader/src/test/resources/testdocs/PMC6949206.xml new file mode 100644 index 000000000..d7bbf8d2e --- /dev/null +++ b/jcore-pmc-db-reader/src/test/resources/testdocs/PMC6949206.xml @@ -0,0 +1,6 @@ + +
pmcProtein CellProtein CellProtein & Cell1674-800X1674-8018Higher Education PressBeijing31037510PMC694920662310.1007/s13238-019-0623-2Research ArticleRescue of premature aging defects in Cockayne syndrome stem cells by CRISPR/Cas9-mediated gene correctionWangSi125MinZheying113JiQianzhao24GengLingling5SuYao5LiuZunpeng34HuHuifang34WangLixia24ZhangWeiqi24567SuzuikiKeiichiro910HuangYu11ZhangPuyao1TangTie-Shan4612QuJing
qujing@ioz.ac.cn
346
YuYang
yuyang5012@hotmail.com
1
LiuGuang-Hui
ghliu@ibp.ac.cn
24568
QiaoJie
jie.qiao@263.net
113
grid.411642.40000 0004 0605 3760Department of Obstetrics and Gynecology, Center for Reproductive Medicine, Peking University Third Hospital, Beijing, 100191 China grid.9227.e0000000119573309National Laboratory of Biomacromolecules, CAS Center for Excellence in Biomacromolecules, Institute of Biophysics, Chinese Academy of Sciences, Beijing, 100101 China grid.9227.e0000000119573309State Key Laboratory of Stem Cell and Reproductive Biology, Institute of Zoology, Chinese Academy of Sciences, Beijing, 100101 China grid.410726.60000 0004 1797 8419University of Chinese Academy of Sciences, Beijing, 100049 China grid.413259.80000 0004 0632 3337Advanced Innovation Center for Human Brain Protection, National Clinical Research Center for Geriatric Disorders, Xuanwu Hospital Capital Medical University, Beijing, 100053 China grid.9227.e0000000119573309Institute for Stem cell and Regeneration, Chinese Academy of Sciences, Beijing, 100101 China grid.9227.e0000000119573309Key Laboratory of Genomic and Precision Medicine, Beijing Institute of Genomics, Chinese Academy of Sciences, Beijing, 100101 China grid.24696.3f0000 0004 0369 153XBeijing Institute for Brain Disorders, Beijing, 100069 China grid.136593.b0000 0004 0373 3971Institute for Advanced Co-Creation Studies, Osaka University, Osaka, 560-8531 Japan grid.136593.b0000 0004 0373 3971Graduate School of Engineering Science, Osaka University, Osaka, 560-8531 Japan grid.11135.370000 0001 2256 9319Department of Medical Genetics, School of Basic Medical Sciences, Peking University Health Science Center, Beijing, 100191 China grid.458458.00000 0004 1792 6416State Key Laboratory of Membrane Biology, Institute of Zoology, Chinese Academy of Sciences, Beijing, 100101 China grid.11135.370000 0001 2256 9319Peking-Tsinghua Center for Life Sciences, Academy for Advanced Interdisciplinary Studies, Peking University, Beijing, 100871 China
304201930420191202011112219220191232019© The Author(s) 2019https://creativecommons.org/licenses/by/4.0/Open AccessThis article is distributed under the terms of the Creative Commons Attribution 4.0 International License (http://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons license, and indicate if changes were made.

Cockayne syndrome (CS) is a rare autosomal recessive inherited disorder characterized by a variety of clinical features, including increased sensitivity to sunlight, progressive neurological abnormalities, and the appearance of premature aging. However, the pathogenesis of CS remains unclear due to the limitations of current disease models. Here, we generate integration-free induced pluripotent stem cells (iPSCs) from fibroblasts from a CS patient bearing mutations in CSB/ERCC6 gene and further derive isogenic gene-corrected CS-iPSCs (GC-iPSCs) using the CRISPR/Cas9 system. CS-associated phenotypic defects are recapitulated in CS-iPSC-derived mesenchymal stem cells (MSCs) and neural stem cells (NSCs), both of which display increased susceptibility to DNA damage stress. Premature aging defects in CS-MSCs are rescued by the targeted correction of mutant ERCC6. We next map the transcriptomic landscapes in CS-iPSCs and GC-iPSCs and their somatic stem cell derivatives (MSCs and NSCs) in the absence or presence of ultraviolet (UV) and replicative stresses, revealing that defects in DNA repair account for CS pathologies. Moreover, we generate autologous GC-MSCs free of pathogenic mutation under a cGMP (Current Good Manufacturing Practice)-compliant condition, which hold potential for use as improved biomaterials for future stem cell replacement therapy for CS. Collectively, our models demonstrate novel disease features and molecular mechanisms and lay a foundation for the development of novel therapeutic strategies to treat CS.

Electronic supplementary material

The online version of this article (10.1007/s13238-019-0623-2) contains supplementary material, which is available to authorized users.

Keywords: Cockayne syndrome; CRISPR/Cas9; gene correction; disease modelling; mesenchymal stem cell; neural stem cell. © The Author(s) 2020
INTRODUCTION

Cockayne syndrome (CS) is an autosomal recessive disorder characterized by progressive multisystem clinical features, including cachectic dwarfism, clinical photosensitivity, progressive neurological degeneration, and premature aging (Karikkineth et al., 2017). Two genes that are defective in Cockayne syndrome, CSA/ERCC8 (ERCC excision repair 8, CSA ubiquitin ligase complex subunit) and CSB/ERCC6 (ERCC excision repair 6, chromatin remodeling factor), have been identified. To date, two-thirds of CS patients have been linked to mutations in the CSB/ERCC6 gene, and one-third of CS patients have been linked to mutations in the CSA/ERCC8 gene. At least 78 different mutations in ERCC6, including typical missense mutations, frameshifts, and deletions, have been identified (Cleaver et al., 2009; Laugel, 2013). However, the underlying molecular mechanisms linking genotype to phenotype need to be clarified.

DNA damage caused by exogenous ultraviolet (UV) radiation-induced photoproducts or similar chemically induced products is sensed by the cellular nucleotide excision repair (NER) system (Friedberg, 2001, 2003; Cleaver et al., 2009; McKay and Cabrita, 2013). The NER system consists of two pathways: global genomic repair (GGR), in which damage to DNA regions not undergoing transcription is repaired, and transcription-coupled repair (TCR), in which damage to transcribed DNA regions is repaired. Bulky DNA adducts usually block transcription elongation by RNA polymerase II (RNAPII); then, the arrested RNAPII initiates the repair of transcription-blocking DNA lesions by TCR to permit the efficient recovery of mRNA synthesis. If TCR cannot be executed, widespread sustained transcription blockage eventually leads to apoptosis (McKay and Cabrita, 2013). ERCC6 is an ATP-stimulated ATPase that is required for the ubiquitylation of the carboxyterminal domain of RNAPII in TCR and the recovery of mRNA synthesis. In addition, ERCC6 has been reported as a member of the SWI/SNF family of proteins that contain a nucleotide-binding site and play a role in chromatin maintenance and remodelling by modulating the negative supercoiling of DNA and facilitating DNA strand exchange, possibly through the recruitment of the histone acetyltransferase p300 (Newman et al., 2006; Cleaver et al., 2009; Velez-Cruz and Egly, 2013).

Mice deficient for Ercc6 or Ercc8 have been generated and used to mimic mild CS symptoms, including fat tissue reduction, photoreceptor cell loss, and mild but characteristic nervous system pathology (van der Horst et al., 1997, 2002; Gorgels et al., 2007; Jaarsma et al., 2011). These mild CS mouse models are converted to severe CS models with short life spans, progressive nervous system degeneration and cachectic dwarfism after synergistic complete inactivation of global genome NER. For example, previous studies have demonstrated the simultaneous deleterious effects of intercrossing xeroderma pigmentosum (XP) (Xpa−/− or Xpc−/−) mice with CS (Csa−/−, Csb−/−, XpdXPCS) mice, which results in double mutants with very short life spans and dramatic progeroid features (Murai et al., 2001; Andressoo et al., 2006; van der Pluijm et al., 2007). Due to the differences in genetic and anatomic features between humans and mice, a human CS model needs to be established to reveal the cellular defects and molecular mechanisms for translation into a CS treatment.

In this study, we report the generation of induced pluripotent stem cells (iPSCs) from the fibroblasts of a CS patient bearing two novel heterogeneous mutations in the ERCC6 gene: c.643G>T in exon 4 and c.3776C>A in exon 18. We further derived gene-corrected CS-iPSCs (GC-iPSCs) using the CRISPR/Cas9-mediated gene editing technique. CS-iPSCs and GC-iPSCs were further differentiated into mesenchymal stem cells (MSCs) and neural stem cells (NSCs). Gene correction resulted in the effective restoration of DNA repair abilities and the alleviation of apoptosis and premature senescence, especially after exposure to UV irradiation or replicative stress (Fig. 1A). RNA sequencing analysis indicated that the compromised DNA repair and cell cycle deregulation observed in CS cells account for various CS cellular pathologies. Finally, we obtained gene-corrected CS-iPSC-derived MSCs under a cGMP (Current Good Manufacturing Practice)-compliant condition, which display promising potential in autologous stem cell therapy.

Generation of CS-iPSCs and gene-corrected CS-iPSCs. (A) Schematic diagram of the generation of CS-iPSCs and GC-iPSCs, as well as their adult stem cell derivatives, for modelling Cockayne syndrome. “Mut” represents mutant, “GC” represents gene corrected. (B) Genotype validation of two heterozygous mutations in the ERCC6 gene by genomic DNA sequencing. Fibroblasts isolated from a healthy individual were used as a control. (C) Strategy for correcting the ERCC6+/G643T mutation by the CRISPR/Cas9 system. The sequence of the gRNA is shown with the PAM sequence. Red crosses represent mutations in exon 4 and exon 18. The single-stranded oligodeoxynucleotide (ssODN) carrying a silent mutation (marked in green) was used as a repair template. (D) The correction of the ERCC6+/G643T mutation was verified by genomic DNA sequencing. The red arrow highlights the corrected base pair. The green arrow indicates the inclusion of silent mutation introduced by the exogenous ssODN template. ERCC6mut represents CS-iPSCs, ERCC6GC represents GC-iPSCs. (E) Karyotyping analysis of CS-iPSCs and GC-iPSCs indicating their normal karyotypes. (F) No residual episomal vector element EBNA-1 was observed in CS-iPSCs or GC-iPSCs by qPCR analysis. CS-fibroblasts were electroporated with pCXLE-hOCT3/4-shp53-F, pCXLE-hSK and pCXLE-hUL. The fibroblasts were cultured for 4 more days after electroporation and then collected as the positive control, and human ESCs (line H9), GM00038-iPSCs and HFF-iPSCs were used as negative controls. Data are shown as the mean ± SEM, n = 3. (G) No off-target mutations were observed in GC-iPSCs. Whole-genome sequencing was applied to detect potential off-target mutations in the GC-iPSC sample. NA, not applicable

RESULTS

Generation of non-integrative iPSCs from a CS patient

We first isolated human primary fibroblasts from a Chinese CS patient and verified the presence of two nonsense mutations, c.643G>T (p.E215X) in exon 4 and c.3776C>A (p.S1259X) in exon 18, located at different alleles of the ERCC6 gene by genomic DNA sequencing analysis (Fig. 1B). To generate patient-specific iPSCs (CS-iPSCs), a cocktail of integration-free episomal vectors expressing the reprogramming factors OCT4, SOX2, KLF4, L-MYC, LIN28, and sh-p53 was electroporated into fibroblasts according to a modified reprogramming protocol, as previously described (Hishiya and Watanabe, 2004; Okita et al., 2011; Liu et al., 2014; Ding et al., 2015; Fu et al., 2016; Wang et al., 2017; Ling et al., 2019). The derived iPSCs displayed normal karyotypes, and no residual episomal reprogramming vector element was detected in established CS-iPSCs (Fig. 1E and 1F). In addition, CS-iPSCs expressed comparable levels of pluripotency markers, including NANOG, OCT4, and SOX2 (Fig. 2B and 2C). After being implanted subcutaneously into immunocompromised mice, CS-iPSCs were able to form teratomas comprising cells from three germ lineages, as indicated by TUJ1, SMA and FOXA2 expression (Fig. 2D). These observations indicated that iPSCs bearing the CS-specific ERCC6 mutation display normal pluripotency.

Characterization of CS-iPSCs and gene-corrected CS-iPSCs. (A) Western blot analysis showing increased protein levels of ERCC6 in GC-iPSCs. β-Actin was used as the loading control. (B) RT-PCR analysis of the pluripotency markers SOX2, OCT4, and NANOG in the CS-iPSCs and GC-iPSCs. 18S rRNA was used as the loading control. (C) Immunostaining of CS-iPSCs and GC-iPSCs for the pluripotency markers OCT4, NANOG, and SOX2. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. (D) Immunostaining of TUJ1 (ectoderm), SMA (mesoderm), and FOXA2 (endoderm) in teratomas derived from CS-iPSCs and GC-iPSCs. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. (E) The percentages of Ki67-positive cells in CS-iPSCs and GC-iPSCs were determined and compared. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. Data are presented as the mean ± SEM, n = 3, ns, not significant. (F) Cell cycle profiles showing comparable percentages of different cell cycle phases in CS-iPSCs and GC-iPSCs by PI staining. Data are presented as the mean ± SEM, n = 3

Targeted gene correction of the ERCC6 mutation by the CRISPR/Cas9 system

To better elucidate the pathogenic mechanism underlying CS, we generated isogenic gene-corrected iPSC lines by targeted gene editing of one of the two compound heterozygous ERCC6 mutations. Using the CRISPR/Cas9 system, we electroporated an expression vector encoding mCherry and a guide RNA targeting the mutation in exon 4, a plasmid for Cas9-2A-GFP, and the single-stranded oligodeoxynucleotide (ssODN) template into CS-iPSCs (Wang et al., 2017). After fluorescence-activated cell sorting (FACS) for mCherry (guide RNA) and GFP (Cas9) double-positive cells, gene-corrected CS-iPSC clones were successfully obtained (Fig. 1C). Site-specific gene correction of the c.643G>T mutation was confirmed by genomic DNA sequencing (Fig. 1D). As the exogenous repair template ssODN was designed to contain a silent mutation, the introduced silent mutation was also found in the GC-iPSC clones, further confirming successful gene editing at the corresponding genomic target sites (Fig. 1D). Similar to CS-iPSCs, we did not detect any residual episomal reprogramming vectors in GC-iPSCs (Fig. 1F). Whole-genome DNA sequencing indicated no mutations in potential off-target sites after gene editing (Fig. 1G). GC-iPSCs also showed a normal karyotype (Fig. 1E). Western blots demonstrated elevated levels of the ERCC6 protein in GC-iPSCs (Fig. 2A), implying that the correction of the pathogenic mutation recovered the protein expression of ERCC6. Additionally, GC-iPSCs normally expressed pluripotency markers, including OCT4, NANOG, and SOX2 (Fig. 2B and 2C), and formed teratomas in vivo (Fig. 2D). CS-iPSCs and GC-iPSCs were cultured for more than 50 passages without showing abnormal growth kinetics (Fig. 2E and 2F). Unlike the previous study (Andrade et al., 2012), we did not observe elevated cellular reactive oxygen species (ROS) in CS-iPSCs compared to GC-iPSCs (Fig. S3A). 
In addition, RT-qPCR demonstrated that the expression levels of genes involved in the oxidative stress response were comparable between GC-iPSCs and CS-iPSCs (Fig. S3B). Taken together, these results indicated that we successfully generated GC-iPSCs exhibiting normal pluripotency.

Alleviation of aging defects in gene-corrected CS-MSCs

CS patients frequently exhibit musculoskeletal abnormalities, such as kyphosis, contracture and osteoporosis (Hishiya and Watanabe, 2004; Karikkineth et al., 2017). MSCs are multipotent mesodermal cells that can differentiate into a variety of mesodermal cell types, including osteoblasts, chondrocytes, and adipocytes, which serve as a good cell model for investigating the accelerated degeneration of mesodermal tissues caused by genetic mutations (Liu et al., 2014; Zhang et al., 2015, 2019; Kubben et al., 2016; Li et al., 2016; Pan et al., 2016; Geng et al., 2018; Wang et al., 2018b; Wu et al., 2018; Yan et al., 2019). Therefore, we first differentiated CS-iPSCs and GC-iPSCs into MSCs to investigate whether ERCC6 mutations could result in accelerated attrition of the MSC pool. Both CS-MSCs and GC-MSCs were positive for mesenchymal progenitor markers, including CD73, CD90 and CD105 (Fig. 3A). Consistent with the successful correction of ERCC6 gene mutation, increased ERCC6 protein content was observed in GC-MSCs (Fig. 3B). Next, we investigated whether normal ERCC6 activity is required for maintaining the cellular homeostasis of MSCs. Compared to isogenic gene-corrected control cells, CS-MSCs displayed features characteristic of premature senescence under replicative stress, including the early onset of cell growth arrest, reduced Ki67-positive cells, and increased senescence-associated (SA)-β-Gal activity (Fig. 3C–E). In addition, the expression levels of senescence markers, including P16, P21 and IL-8, were upregulated, while the geroprotective proteins Lamin B1 and LAP2 were downregulated in CS-MSCs relative to GC-MSCs at late passages (Fig. 3F–H). In line with the essential role of ERCC6 in NER, CS-MSCs exhibited increased expression of the DNA damage marker γH2AX (Fig. 3I), indicating compromised DNA repair in ERCC6-deficient MSCs. Next, we investigated whether CS-MSCs underwent accelerated attrition in vivo. 
Implanting CS-MSCs and GC-MSCs expressing luciferase into the tibialis anterior (TA) muscle of immunodeficient mice resulted in accelerated in vivo decay in CS-MSCs compared to GC-MSCs (Fig. 3J). Furthermore, we compared the multipotent differentiation potential of CS-MSCs and GC-MSCs. Relative to GC-MSCs, CS-MSCs exhibited impaired differentiation abilities towards osteoblasts, chondrocytes and white adipocytes (Fig. 3K and 3L). Altogether, these results showed that CS-MSCs displayed typical premature cellular senescence, which was rescued by the targeted correction of mutant ERCC6.

Alleviated cellular senescence in gene-corrected CS-MSCs. (A) FACS analysis indicating the expression of the cell surface markers CD73, CD90 and CD105 in CS-MSCs and GC-MSCs. ERCC6mut represents CS-MSCs, ERCC6GC represents GC-MSCs. (B) Western blot analysis showing increased protein levels of ERCC6 in GC-MSCs. β-Actin was used as the loading control. (C) Growth curves showing the cumulative population doublings of CS-MSCs and GC-MSCs. (D) Immunostaining of Ki67 showing the decreased cell proliferation of CS-MSCs compared to GC-MSCs. The percentages of Ki67-positive cells are shown in the right panel. Scale bar, 20 μm. Data are presented as the mean ± SEM, n = 3, **P < 0.01, ***P < 0.001. EP, early passage (P6); LP, late passage (P28). (E) SA-β-Gal staining of CS-MSCs and GC-MSCs at EP (P6) and LP (P28), respectively. The percentages of SA-β-Gal-positive cells are shown in the right panel. Scale bar, 50 μm. Data are presented as the mean ± SEM, n = 3, **P < 0.01, ns, not significant. (F) RT-qPCR analysis of the expression of senescence markers in CS-MSCs and GC-MSCs at passage 28. The mRNA levels were normalized to CS-MSCs. (G) Western blot analysis of P16, LAP2 and Lamin B1 in CS-MSCs and GC-MSCs. GAPDH was used as the loading control. (H) Immunostaining of LAP2 and Lamin B in CS-MSCs and GC-MSCs. The relative intensity of LAP2 was measured with ImageJ software, and the data are shown as the mean ± SEM, ***P < 0.001. More than 300 nuclei for each group were used for calculations. Scale bar, 20 μm. a.u., arbitrary units. (I) Immunostaining of γH2AX in CS-MSCs and GC-MSCs. The relative intensity of γH2AX was measured with ImageJ software, and the data are shown as the mean ± SEM, ***P < 0.001. More than 300 nuclei for each group were used for calculations. Scale bar, 20 μm. a.u., arbitrary units. (J) Accelerated attrition of CS-MSCs in vivo was detected by an in vivo imaging system (IVIS). 
CS-MSCs (1 × 10⁶, left) and GC-MSCs (1 × 10⁶, right) (passage 25) infected with luciferase lentivirus were injected into the tibialis anterior (TA) muscles of immunodeficient mice. Luciferase activities were imaged and quantified at days 0, 2, 4, and 6 after transplantation. Data are presented as the ratios of the luciferase intensity of CS-MSCs to that of GC-MSCs (fold), mean ± SD, n = 3, **P < 0.01, ***P < 0.001. (K) Comparative analysis of the osteogenic, chondrogenic and adipogenic differentiation potential of CS-MSCs and GC-MSCs. Von Kossa, Alcian blue, and oil red O staining were used to characterize osteoblasts, chondrocytes, and adipocytes, respectively. Scale bar, 50 μm. (L) The intensity of von Kossa staining was calculated by ImageJ and compared in the left panel. Data are presented as the mean ± SEM, n = 3, **P < 0.01. The cross-sectional area of chondrocyte spheres was measured and is shown in the middle panel. Data are presented as the mean ± SD, n = 14, ***P < 0.001. The relative intensity of oil red O was measured and is shown in the right panel. Data are presented as the mean ± SEM, n = 3, ***P < 0.001

Gene-corrected CS-MSCs display recovered DNA repair ability and resistance to UV-induced apoptosis and cell cycle arrest

Next, we investigated whether mutations in ERCC6 genes lead to impaired DNA damage repair ability after UV irradiation in MSCs. UV radiation usually results in the covalent dimerization of adjacent pyrimidines, typically thymine residues (thymine dimers), including cyclobutane pyrimidine dimers (CPDs) and (6-4) photoproducts (6-4PPs), in DNA (Setlow and Setlow, 1962; Friedberg, 2003; Cadet et al., 2005). Accordingly, we treated CS-MSCs and GC-MSCs with 10 J/m2 UV irradiation and examined the levels of intranuclear CPDs by immunostaining. Both CS-MSCs and GC-MSCs showed low levels of CPDs in the absence of UV irradiation; however, CS-MSCs exhibited more CPD-positive cells than GC-MSCs did at 48 h after UV irradiation (Fig. 4A). These results demonstrated that CS-MSCs were deficient in eliminating CPD photolesions after UV-induced DNA damage, and this ability was restored by ERCC6 correction. We then explored whether CS-MSCs are hypersensitive to UV-induced cellular apoptosis. CS-MSCs and GC-MSCs were cultured in the presence or absence of 10 J/m2 UV irradiation. UV irradiation induced marked cellular apoptosis in CS-MSCs relative to GC-MSCs at 48 h after UV irradiation (Fig. 4B). Western blot analysis showed increased levels of cleaved PARP (c-PARP) in CS-MSCs following UV treatment (Fig. 4C). In addition, we treated MSCs with a lower dose (1 J/m2) of UV light at each passage starting from passage 4. In this context, relative to GC-MSCs, CS-MSCs displayed compromised self-renewal ability and increased SA-β-Gal-positive cells (Fig. 4D–F), indicating that the ERCC6 deficiency rendered MSCs sensitive to replicative stress under low-dose chronic UV irradiation. Thus, CS-specific MSCs exhibited impaired DNA repair ability and increased susceptibility to UV-induced injury, and these phenotypes were rescued by the genetic correction of the pathogenic mutation.

Gene-corrected CS-MSCs display recovered DNA repair ability and counteract UV-induced apoptosis and senescence. (A) CPD immunostaining in CS-MSCs and GC-MSCs in the absence or presence of 10 J/m2 UV exposure. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. More than 300 nuclei for each group were used for calculation. The data are shown as the mean ± SEM, ns, not significant, ***P < 0.001. a.u., arbitrary units. (B) Apoptosis analysis of CS-MSCs and GC-MSCs at 48 h after 10 J/m2 UV irradiation. Quantitative data are presented as the mean ± SEM, n = 3, **P < 0.01, ***P < 0.001. (C) Western blots showing PARP cleavage in CS-MSCs and GC-MSCs in the absence or presence of 10 J/m2 UV exposure. GAPDH was used as a loading control. Quantitative data are presented as the mean ± SD, n = 3, ns, not significant, *P < 0.05. (D) Growth curves showing the cumulative population doublings of CS-MSCs and GC-MSCs in the absence (control) or presence (UV) of 1 J/m2 UV exposure at each passage starting from passage 4. (E) Clonal expansion assay showing the cell proliferation ability of CS-MSCs and GC-MSCs in the absence (control) or presence (UV) of 1 J/m2 UV exposure at passage 10. The cells were stained with crystal violet after two weeks of culture, and the relative intensity of the crystal violet staining was quantified. Data are presented as the mean ± SEM, n = 3, *P < 0.05, ***P < 0.001. (F) SA-β-Gal staining of CS-MSCs and GC-MSCs in the absence (control) or presence (UV) of 1 J/m2 UV exposure at passage 10. The percentages of SA-β-Gal-positive cells are shown in the right panel. Data are presented as the mean ± SEM, n = 3, **P < 0.01, ns, not significant

Gene-corrected CS-NSCs display improved NER ability and reduced susceptibility to UV-induced apoptosis

Due to the presence of obvious symptoms of neurodegeneration in CS patients (Cleaver et al., 2009; Natale, 2011; Laugel, 2013; Shehata et al., 2014), we next differentiated CS-iPSCs and GC-iPSCs into NSCs (referred to as CS-NSCs and GC-NSCs, respectively). Both CS-NSCs and GC-NSCs showed typical neural progenitor morphology and expressed the NSC markers Nestin, PAX6 and SOX2 (Fig. 5A). Western blots confirmed the increased protein expression of ERCC6 in GC-NSCs compared to that in uncorrected CS-NSCs (Fig. 5B). To investigate whether mutations in the ERCC6 gene impair the DNA repair ability of NSCs, we treated CS-NSCs and GC-NSCs with 5 J/m2 UV irradiation and then examined the levels of intranuclear CPDs. Similar to the results obtained with MSCs, higher levels of CPDs were observed in CS-NSCs than in GC-NSCs at 48 h after UV irradiation, indicating that targeted gene correction effectively rescued the hypersensitivity of CS-NSCs to UV irradiation (Fig. 5C). Consistent with this finding, gene correction resulted in decreased cellular apoptosis in CS-NSCs in the presence of UV treatment (Fig. 5D and 5E). Altogether, these results indicated that CS-NSCs, which are characterized by a DNA repair deficit, were prone to UV-induced apoptosis, while genetic correction resulted in the restoration of these phenotypic defects.

Gene-corrected CS-NSCs show increased NER ability and decreased susceptibility to UV-induced apoptosis. (A) Immunostaining of the NSC markers Nestin, PAX6, and SOX2 in the CS-NSCs and GC-NSCs. The nuclei were stained with Hoechst 33342. Scale bar, 50 μm. ERCC6mut represents CS-NSCs, ERCC6GC represents GC-NSCs. (B) Western blot analysis showing increased protein levels of ERCC6 in GC-NSCs. β-Actin was used as the loading control. (C) CPD immunostaining in CS-NSCs and GC-NSCs in the absence or presence of 5 J/m2 UV exposure. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. Over 300 nuclei were used for calculations. The data are shown as the mean ± SEM, ***P < 0.001. a.u., arbitrary units. (D) Apoptosis analysis of CS-NSCs and GC-NSCs at 48 h after 5 J/m2 UV irradiation. Quantitative data are presented as the mean ± SEM, n = 3, *P < 0.05, ***P < 0.001. (E) Western blots showing PARP cleavage in CS-NSCs and GC-NSCs in the absence or presence of 5 J/m2 UV exposure. GAPDH was used as a loading control. Quantitative data are presented as the mean ± SD, n = 3, *P < 0.05, ns, not significant

The ERCC6 mutation results in gene expression changes associated with impaired DNA damage repair, chromatin disorganization, and compromised cell proliferation

To investigate whether gene expression profiles were disrupted in CS-specific iPSCs, MSCs and NSCs, we performed genome-wide RNA sequencing (RNA-seq) analysis (Figs. 6, S1 and S2). Principal component analysis (PCA) showed that the RNA profiles of MSCs, iPSCs and NSCs were separated as three independent subgroups (Fig. 6A), implying the existence of unique RNA expression patterns in each cell type. While there were minimal gene expression changes between CS-iPSCs and GC-iPSCs and between CS-NSCs and GC-NSCs, the mutation of ERCC6 resulted in marked changes in the transcriptome of MSCs (Figs. 6B and S1C). These observations were in line with the most striking phenotypes in CS-MSCs relative to their gene-corrected counterparts under basal culture conditions (Fig. 3C–E). UV treatment results in an increased difference in transcriptional profiles between GC-MSCs and CS-MSCs and between GC-NSCs and CS-NSCs (Figs. 6B and S1C). Notably, UV treatment induced dramatic gene expression changes in CS-specific MSCs and CS-specific NSCs (Fig. S1E), which were associated with increased DNA damage, impaired transcription, and compromised cell growth; these changes, however, became insensitive in ERCC6-corrected MSCs and NSCs, indicating that gene correction resulted in the restoration of normal transcriptional and DNA repair activity under DNA damage stress (Fig. 6C). After extensive passaging, we also observed a panel of upregulated genes related to cell division and DNA damage repair in ERCC6-corrected MSCs compared to diseased MSCs (Fig. 6D), which is in line with the rescue of premature cellular senescence in gene-corrected MSCs (Fig. 3C–J). Collectively, these transcriptomic changes support the improved cell proliferation and increased DNA damage repair ability in ERCC6-corrected adult stem cells.

The global gene expression profiles of CS-iPSCs and gene-corrected CS-iPSCs and their adult stem cell derivatives. (A) PCA of CS cells and GC cells in the absence or presence of UV (Ctrl or UV), as well as under replicative senescence (RS) stress. Each point represents a sample. Data points were computed based on Log2(FPKM + 1). (B) Volcano plots showing the differentially expressed genes between CS-iPSCs and GC-iPSCs, between CS-MSCs and GC-MSCs, and between CS-NSCs and GC-NSCs in the absence of UV (the upper panel) or in the presence of UV (the lower panel, UV), or under RS stress (the lower panel, RS). Red represents upregulated genes, and blue represents downregulated genes. (C) Gene Ontology Biological Process (GO-BP) enrichment analysis of significantly upregulated/downregulated genes in GC-MSCs compared to CS-MSCs upon UV treatment. Red represents upregulated genes, and blue represents downregulated genes. (D) Gene Ontology Biological Process (GO-BP) enrichment analysis of significantly upregulated/downregulated genes in GC-MSCs compared to CS-MSCs under RS stress. Red represents upregulated genes, and blue represents downregulated genes

Gene-corrected CS-MSCs produced in accordance with cGMP compliance guidelines show alleviated senescence and increased resistance to UV-induced apoptosis

Human mesenchymal stem cells hold the potential to be used for the treatment of aging-related disorders (Orozco et al., 2011, 2013, 2014; Golpanian et al., 2016, 2017; Tompkins et al., 2017; Yang et al., 2017; Yan et al., 2019). We next tested whether ERCC6-corrected CS-MSCs can be produced under a cGMP-compliant condition. Accordingly, we derived MSCs from iPSCs using a serum-free, animal component-free differentiation medium. The differentiation protocol was slightly modified from the serum-containing procedure (see experimental method). FACS analysis demonstrated that the derived MSCs expressed the mesenchymal progenitor cell-specific markers CD73, CD90 and CD105 (Fig. 7A). The absence of pluripotent stem cell contamination in the derived MSCs was verified by RT-qPCR and immunostaining assays (Fig. 7B and 7C). Whole-genome DNA sequencing further validated the genomic integrity during somatic cell reprogramming, gene correction, and directed differentiation to MSCs (Fig. 7D and 7E). Sterility and pathogen testing demonstrated that there was no endotoxin, mycoplasma, bacteria, or virus contamination in the culture medium of the GC-MSCs (Fig. 7F). To evaluate any potential risk of tumorigenesis in vivo, immunodeficient mice were subcutaneously injected with the ERCC6-corrected MSCs. Human ESC (line H9) and U2-OS osteosarcoma cell lines were implanted independently as positive controls. We observed that the GC-MSCs failed to form tumors, even at 8 months after implantation, in contrast with the teratomas formed from hESCs and tumors formed from U2-OS cells at 2 months post-injection (Fig. 7G).

Safety analysis of gene-corrected CS-MSCs obtained under a cGMP-compliant condition. (A) FACS analysis indicated the expression of the cell surface markers CD73, CD90 and CD105 in CS-MSCs and GC-MSCs. (B) RT-qPCR analysis of the expression of pluripotency markers OCT4, NANOG, and SOX2 in CS-MSCs and GC-MSCs. GC-iPSCs and CS-fibroblasts were used as positive and negative controls, respectively. Data are presented as the mean ± SEM, n = 3. (C) Immunostaining of the pluripotency marker NANOG in CS-MSCs and GC-MSCs. GC-iPSCs were used as a positive control, Scale bar, 50 μm. (D) Whole-genome sequencing of single-nucleotide variants (SNVs) in CS-fibroblasts, CS-iPSCs, GC-iPSCs, CS-MSCs and GC-MSCs. Sites with a heterozygosity percentage ranging between 0% and 30% were considered as SNV sites, and sites with a heterozygosity of >30% were considered as single-nucleotide polymorphisms (SNPs). (E) Whole-genome sequencing of copy number variations (CNVs) in CS-fibroblasts, CS-iPSCs, GC-iPSCs, CS-MSCs and GC-MSCs. Each point represents normalized coverage depth of each 500-kb genomic region of each chromosome. (F) Sterility and pathogen testing of the conditioned medium of GC-MSCs. a Endotoxin was identified as negative when the concentration was < 0.25 EU/mL. b CMV was identified as negative when the ratio of the OD450 value of sample to the cut-off value (S/Co) was < 1.0. c HAV was identified as negative when the ratio of the cut-off value to the OD450 nm value of the sample (Co/S) was < 0.9. d HCV was identified as negative when the ratio of the OD450 value of the sample to the cut-off value (S/Co) was < 0.9. e HIV-1 was identified as negative when the concentration = 0 pg/mL. (G) Evaluation of the potential tumorigenesis risk of GC-MSCs in vivo. A subcutaneous injection of GC-MSCs was performed in immune-deficient mice. Human ESC (line H9) and U2-OS osteosarcoma cell lines were also implanted independently as positive controls. 
Representative images in the lower panel showing the teratoma and tumor formed from positive cells two months after transplantation, Scale bar, 0.5 cm. HE staining of a teratoma and tumor were shown in the upper panel. Scale bar, 100 μm. The in vivo tumor-formation incidence of each cell type was calculated. n = 4 for each positive cell group, n = 5 for the GC-MSC group

Phenotypically, compared to diseased MSCs, gene-corrected MSCs generated following the cGMP compliance standard displayed increased cell proliferation and attenuated cellular senescence (Fig. 8A and 8B). In addition, the GC-MSCs were insensitive to UV-induced apoptosis (Fig. 8C and 8D). Consistent with an improved activity, these GC-MSCs exhibited better tri-lineage differentiation potential towards osteoblasts, chondrocytes and adipocytes (Fig. S3C–D). A fat pad implantation assay further demonstrated the superior in vivo neovascularization ability of GC-MSCs (Fig. 8E). Altogether, we successfully generated ERCC6-corrected MSCs with normal functional activity under a cGMP-compliant condition.

Gene-corrected CS-MSCs generated under a cGMP-compliant condition displayed alleviated aging defects and decreased susceptibility to UV-induced apoptosis. (A) Clonal expansion assay showing the cell proliferation ability of CS-MSCs and GC-MSCs. The cells were stained with crystal violet after a two-week culture, and the relative intensity of the crystal violet was quantified. Data are presented as the mean ± SEM, n = 4, **P < 0.01. Scale bar, 50 μm. (B) SA-β-Gal staining of CS-MSCs and GC-MSCs. The percentages of SA-β-Gal-positive cells are shown in the right panel. Data are presented as the mean ± SEM, n = 3, **P < 0.01. Scale bar, 50 μm. (C) Apoptosis analysis of CS-MSCs and GC-MSCs 48 h after 10 J/m2 UV irradiation. Quantitative data are presented as the mean ± SEM, n = 3, ***P < 0.001. (D) Western blots showing PARP cleavage of CS-MSCs and GC-MSCs in the presence of 10 J/m2 UV exposure. β-Actin was used as a loading control. (E) Fat pad transplantation with CS-MSCs and GC-MSCs. Left: representative immunofluorescent images showing neovascularization; right: the number of hCD31-positive vessels calculated based on 24 slices from inconsecutive frozen sections. Data are presented as the mean ± SD, n = 3 for each group, **P < 0.01. Scale bar, 50 μm

DISCUSSION

Although several mouse models exhibiting the clinical symptoms of CS have been generated and have provided valuable insights into the disease mechanism, there are still many differences in clinical features between CS patients and mouse models. For instance, in contrast to human CS patients, who do not develop skin cancer, ERCC6 mutant mice show increased susceptibility to skin cancer (van der Horst et al., 1997, 2002). Thus, CS mouse models do not fully mimic the pathophysiology of CS patients, and the knowledge learned from animal models may be poorly translated to the clinic. CS patient-specific iPSCs were initially obtained by reprogramming fibroblasts from CS patients using retroviral vectors, and these cells exhibited an elevated cell death rate and increased ROS production (Andrade et al., 2012). Our study, however, did not identify increased oxidative stress or altered levels of TXNIP (Fig. S3A and S3B). These differences may be attributed to the reprogramming vectors. Luciana et al. used retroviral vectors, which may result in random genomic integration and genomic instability during the reprograming process. In addition, the same research group recently reported that CS-iPSC-derived neurons display reduced synapse density and altered neural network synchrony (Vessoni et al., 2016). Again, this study was based on a retroviral vector-mediated somatic reprograming technique. More importantly, due to the lack of an isogenic “disease-free” control iPSC line, it is hard to determine whether the phenotypic differences are caused by ERCC6 gene mutations or genetic background variations between CS patients and control individuals. To faithfully recapitulate human CS pathogenesis, a reliable human iPSC-based disease model with isogenic gene-corrected cells is required. 
In this study, we generated transgene-free iPSCs from the fibroblasts of a CS patient bearing newly identified heterozygous disease-causing mutations in the ERCC6 gene and obtained isogenic gene-corrected iPSCs using the CRISPR/Cas9 system. These iPSCs were further differentiated into two types of adult stem cells, MSCs and NSCs, which presented a panel of new disease phenotypes.

Although previous studies have reported that the deficiency of functional DNA repair proteins may hinder somatic cell reprogramming and teratoma formation in vivo (i.e., WRN (Shimamoto et al., 2014; Wang et al., 2018c), p53 (Kawamura et al., 2009), and Fanconi genes (Muller et al., 2012)), we did not observe any defects in the derivation or pluripotency of CS patient-specific iPSC lines. Moreover, ERCC6 gene mutations did not compromise the chromosomal integrity of iPSCs, as indicated by karyotype analysis. Our study also provides proof of concept that CRISPR/Cas9-mediated gene editing may be amenable to correcting ERCC6 mutation in a therapeutic context. Whole-genome DNA sequencing demonstrated minimal mutational load in patient iPSCs after targeted gene correction.

Although CS patients exhibit musculoskeletal abnormalities (Hishiya and Watanabe, 2004), there are limited reports concerning mesodermal cells. Using an iPSC-based system, we have for the first time generated CS-specific MSCs that display differentiation potential towards osteoblasts, chondrocytes and white adipocytes, and these cells serve as a good cell model to study mesodermal abnormalities in CS patients. Consistent with the premature degeneration of mesenchymal progenitor cells, CS-MSCs exhibit decreased cell proliferation, accelerated senescence, and compromised differentiation ability towards osteoblasts, chondrocytes and white adipocytes, which may constitute one of the causes of the observed defects in the musculoskeletal system. In addition, in agreement with previous reports showing confounding defects in the neural system in CS patients (Cleaver et al., 2009; Natale, 2011; Laugel, 2013; Sacco et al., 2013; Ciaffardini et al., 2014; Vessoni et al., 2016), our data indicated severe DNA repair defects and increased susceptibility to UV-induced apoptosis in CS-iPSC-derived NSCs, therefore providing in-depth mechanistic insights into CS-associated neurological disorders.

Regarding the molecular mechanism, we have generated the first ERCC6 mutation-associated disease transcriptome landscapes of human MSCs and NSCs using an isogenic iPSC-based research system. Under normal culture conditions, mutation of ERCC6 resulted in the most dramatic gene expression changes in MSCs relative to NSCs and iPSCs. Consistent with this finding, CS-specific MSCs demonstrated cell type-specific accelerated senescence after serial passaging. These results suggest that the attrition of the MSC pool and the resulting mesodermal defects are a major syndrome of CS. UV radiation generates photoproducts in genomic DNA that promote genetic mutations that contribute to skin carcinogenesis or cellular senescence (Amaro-Ortiz et al., 2014; Kemp et al., 2017). In this study, we found that ERCC6 mutant MSCs and NSCs were highly susceptible to UV radiation. A defect in the initiation of transcription by RNAPII in UV-treated CS and XP/CS cells has been observed in previous studies (Rockx et al., 2000; Yamada et al., 2002; Proietti-De-Santis et al., 2006; Velez-Cruz et al., 2013). In line with these results, we observed that transcriptional blockage was rescued in gene-corrected CS-MSCs after UV irradiation. In addition, the presence of the ERCC6 mutation is associated with defects in gene expression linked to “cellular response to DNA damage”, “cellular response to stress” and “cell division”, indicating that the defective DNA repair in CS-specific adult stem cells mediates UV-induced cell phenotypic abnormalities. In addition, the mutation of ERCC6 also led to gene expression changes related to “regulation of chromatin organization” in both NSCs and MSCs. Therefore, the pathogenesis of CS may involve a complex interplay among defects in DNA damage repair, chromatin organization, and cell cycle control.

In the context of disease therapy, stem cell-based replacement therapy holds great promise toward restoring tissue homeostasis, e.g., for premature aging disorders (Golpanian et al., 2017; Tompkins et al., 2017). We and others have produced adult stem cells and other terminally differentiated cells from iPSCs derived from various human aging-related disorders, including Hutchinson-Gilford progeria syndrome (HGPS), Werner syndrome (WS), Fanconi anemia (FA), XP, amyotrophic lateral sclerosis (ALS), and Parkinson’s disease (PD) (Liu et al., 2011a, 2012, 2014; Zhang et al., 2015; Fu et al., 2016; Wang et al., 2017). Using targeted gene editing techniques, we have also edited/corrected pathogenic mutations in these patient-derived iPSCs (Liu et al., 2011b, 2012, 2014; Wang et al., 2017). MSCs can differentiate into osteoblasts, chondrocytes, myocytes and adipocytes. Previous studies have shown that MSCs ameliorate aging frailty in clinical trials (Golpanian et al., 2016, 2017; Tompkins et al., 2017). Recently, the generation of allogeneic or autologous MSCs from pluripotent stem cells has emerged as a promising new strategy for stem cell-based therapy (Yang et al., 2017; Castro-Vinuelas et al., 2018; Soontararak et al., 2018; Yan et al., 2019). In the present study, we have derived MSCs from gene-corrected CS-iPSCs under a cGMP-compliant condition. These MSCs demonstrated superior cellular activity compared to uncorrected diseased cells, retained high genomic stability, and did not form tumors in vivo. Therefore, clinical-grade GC-MSCs may represent important biomaterials for achieving autologous stem cell treatment for CS.

In summary, the isogenic CS stem cell models established in this study provide a valuable platform for studying CS pathogenesis, discovering innovative drugs, and the development of new cell replacement therapies. The transcriptomic profiles underlying disease phenotypes may be useful for discovering biomarkers for diagnosis and the development of new therapeutic approaches.

MATERIALS AND METHODS

Antibodies and reagents

The primary antibodies used were as follows (company, catalogue number): anti-ERCC6 (Abcam, ab96098), anti-NANOG (Abcam, ab21624), anti-SOX2 (Santa Cruz, sc-17320), anti-OCT4 (Santa Cruz, sc-5279), anti-SMA (Sigma, A5228), anti-TUJ1 (Sigma, T2200), anti-FOXA2 (Cell Signaling Technology, 8186S), anti-CD90-FITC (BD Bioscience, 555595), anti-CD73-PE (BD Bioscience, 550257), anti-CD105-APC (BD Bioscience, 17-1057-42), anti-IgG-FITC (BD Biosciences, 555748), anti-IgG-PE (BD Biosciences, 555749), anti-IgG-APC (BD Biosciences, 555751), anti-Lamin B (Santa Cruz, sc-6217), anti-LAP2 (BD Bioscience, 611000), anti-Ki67 (ZSGB-BIO, ZM0166), anti-P16 (BD Bioscience, 550834), anti-γ-H2AX (Millipore, 05-636), anti-Nestin (Millipore, MAB5326), anti-PAX6 (Covance, PRB-278P), anti-CPD (Cosmo Bio, TMD-2), anti-cleaved PARP (Cell Signaling Technology, 9541), anti-β-Actin (Santa Cruz, sc69879), anti-GAPDH (Santa Cruz, sc-25778), and anti-hCD31 (BD Bioscience, 555445).

Generation and genotyping of CS-specific fibroblasts

CS-specific fibroblasts were generated from the skin biopsy of a CS patient carrying two heterozygous ERCC6 mutations: c.643G>T in exon 4 and c.3776C>A in exon 18. Fibroblasts were cultured with high-glucose DMEM (HyClone) containing 10% fetal bovine serum (FBS, Gemcell), 1% penicillin/streptomycin (Gibco), and 0.1 mmol/L non-essential amino acids (Gibco). Genotyping of CS-specific fibroblasts was performed using a genomic DNA PCR assay with the primers listed in Table S1. Genomic DNA from the fibroblasts of a healthy donor was used as a control, as previously described (Fu et al., 2016).

iPSC generation and culture

CS patient-specific iPSCs were generated by the electroporation of fibroblasts with episomal vectors, including pCXLE-hSK, pCXLE-hOCT3/4-shp53-F and pCXLE-hUL, as previously described (Okita et al., 2011; Liu et al., 2012, 2014; Fu et al., 2016; Wang et al., 2017). The derived iPSC lines were cultured on mitomycin C-treated MEF feeder cells in human ESC medium or on Matrigel (BD Biosciences)-coated plates in mTeSR medium (STEMCELL Technology). The ESC medium consisted of DMEM/F12 (Invitrogen) supplemented with 20% KnockOut Serum Replacement (Invitrogen), 0.1 mmol/L non-essential amino acids (NEAA, Invitrogen), 1% penicillin/streptomycin (Gibco), 2 mmol/L GlutaMAX (Invitrogen), 55 μmol/L β-mercaptoethanol (Invitrogen), and 10 ng/mL bFGF (Joint Protein Central).

Plasmid construction

Guide RNA (gRNA) was designed with http://crispr.mit.edu. The gRNAs were cloned into the pCAG-mCherry-gRNA vector (Addgene #87110). For the expression of Cas9 and GFP (Cas9-2A-GFP), the pCAG-1BPNLS-Cas9-1BPNLS-2AGFP plasmid (Addgene #87109) was used (Suzuki et al., 2016). The sequences for the gRNA target and ssODN used to repair mutant alleles are as follows: Exon 4-gRNA: GGATCACGCCAGTCTGGAGTAGG. ERCC6-ssODN, 5′-CTAAAGAGACACCCTCCACTGACTACAGGCATCAGGCATCAATTCAAGAACACAGAGAAACTGCTCCTAGCATCCTCACCTGCATCCTCtTCCAGACTGGCGTGATCTAGTTCAATTTTCACCTCTG-3′.

Targeted gene correction in CS-iPSCs via the CRISPR/Cas9 system

CRISPR/Cas9-mediated gene correction of ERCC6 mutation was performed as previously described with some modifications (Peters et al., 2008). Briefly, 5 × 106 iPSCs were resuspended in 100 μL of Opti-MEM (Gibco) supplemented with 8 μg of Cas9-2A-GFP, 4 μg of gRNA-mCherry, and 8 μg of ssODN. After electroporation, the cells were cultured on Matrigel-coated plates in mTeSR medium. At forty-eight hours after electroporation, mCherry+/GFP+ cells were collected by FACS and replated onto MEF feeder cells. Two weeks later, the iPSC clones were picked and identified by genomic DNA PCR and sequencing. The primers used are listed in Table S1.

MSC generation and characterization

The differentiation of CS-iPSCs and GC-iPSCs into MSCs was performed as previously described (Zhang et al., 2015; Pan et al., 2016; Wang et al., 2018b). Briefly, embryoid bodies were plated onto Matrigel-coated plates in differentiation medium (αMEM (Invitrogen) supplemented with 10% FBS (Gemcell), 10 ng/mL bFGF (Joint Protein Central, JPC), 5 ng/mL TGFβ (Human Zyme), 0.1 mmol/L NEAA (Gibco) and 1% penicillin/streptomycin (Gibco)). The differentiated cells were then subjected to FACS to purify the CD73/CD90/CD105 (MSC-specific surface markers) triple-positive MSCs. The purified MSCs were then cultured in αMEM medium supplemented with 10% FBS, 1 ng/mL bFGF, 1% penicillin/streptomycin, and 0.1 mmol/L NEAA.

Clinical-grade MSC differentiation and culture were performed in the cGMP level cell culture facility (Clinical-grade Stem Cell Research Center, Peking University Third Hospital) following the cGMP compliance guidelines. First, differentiation of iPSCs into MSCs was achieved using a process similar to that used for general MSCs, except that it was performed under a xeno-free and serum-free condition. Briefly, embryoid bodies were plated onto vitronectin (Gibco, A14700)-coated plates in differentiation medium (BM MSC medium (Dakewe, DKW34-BM20500) supplemented with 5% serum replacement (Helios, GMP grade, HPCFDCGL50), 5 ng/mL TGFβ (Human Zyme), 6 ng/mL bFGF (Joint Protein Central, JPC), 10 ng/mL EGF (Joint Protein Central, JPC), 10 ng/mL PDGF (Joint Protein Central, JPC) and 1% penicillin/streptomycin (Gibco)). Next, the differentiated cells were subjected to FACS to purify the CD73/CD90/CD105 triple-positive MSCs. The purified MSCs were then cultured in BM MSC medium supplemented with 5% serum replacement and 1% penicillin/streptomycin.

The differentiation potential of the MSCs towards chondrocytes, osteoblasts and adipocytes was evaluated by staining with Alcian blue (chondrogenesis), von Kossa (osteogenesis) and an oil red O (adipogenesis) kit (IHC World) after differentiation of the indicated lineage, as previously described (Zhang et al., 2015; Pan et al., 2016; Wang et al., 2018b).

Sterility and pathogen testing of MSCs generated under a cGMP-compliant condition

The conditioned medium of GC-MSCs was collected for the following tests. Cell debris in the conditioned medium was removed by centrifugation at 12,000 rpm and 4 °C for 5 min. In addition, the cell culture supernatant was immediately assayed. For CMV, HAV, HCV and HIV-1 ELISA detection, the optical density (O.D.) value for each sample was determined using a microplate reader set to 450 nm (OD450). The duplicate readings for each standard, control, and experimental sample were averaged, and the average zero standard O.D. was subtracted.

Mycoplasma detection

Mycoplasma in the supernatant of the conditioned medium was detected by PCR. The primer sequences are listed in Table S1.

Endotoxin detection

Endotoxin in the supernatant of the conditioned medium was detected with the ToxinSensor Gel Clot Endotoxin Assay Kit (GenScript, Cat. No. L00351) according to the manufacturer’s protocol. Briefly, 100 μL of the supernatants from the positive control (PC), negative control (NC) or experimental samples was transferred to the LAL reagent. The vials were capped and mixed thoroughly. All vials were placed in the incubation rack and incubated at 37 °C for 60 min. Then, the vials were inverted and checked to determine whether a gel was formed. The formation of the gel was considered endotoxin positive. The endotoxin level in the positive sample was higher than 0.25 EU/mL.

CMV detection

CMV IgM in the conditioned medium was detected by ELISA (MEDSON) according to the manufacturer’s instructions. Briefly, 100 μL of the supernatants from the PC, NC or experimental samples was pipetted onto the microplate. After incubation with antigen and conjugate solution, the absorbance of the samples was determined at 450 nm. The test results are interpreted as a ratio of the sample (S) OD450 nm and the cut-off (Co) value (S/Co) according to the following standard: S/Co < 1.0 was considered negative; S/Co > 1.2 was considered positive. Co = NC + 0.25.

HAV detection

HAV IgM and IgG in the conditioned medium were detected by ELISA (DIA. PRO) following the manufacturer’s protocol. Briefly, 100 μL of the supernatants from the PC, NC or experimental samples was pipetted onto the microplate. After incubation with antigen and conjugate solution, the absorbance of the samples was determined at 450 nm. The test results are interpreted as the ratio of the cut-off value to the sample OD450 (Co/S) according to the following standard: Co/S < 0.9 was considered negative; Co/S > 1.1 was considered positive. Co = (NC + PC) / 3.

HCV detection

HCV IgM and IgG in the conditioned medium were detected by ELISA (DIA. PRO) according to the manufacturer’s guidelines. First, 100 μL of the supernatants from the PC, NC or experimental samples was pipetted onto the microplate. After incubation with antigen and conjugate solution, the test results are interpreted as the ratio of OD450 of the sample to the cut-off value (S/Co) according to the following standard: S/Co < 0.9 was considered negative; S/Co > 1.1 was considered positive. Co = NC + 0.35.

HIV-1 detection

HIV-1 Gag p24 in the conditioned medium was detected by ELISA (R&D SYSTEMS) according to the manufacturer’s protocol. Briefly, 100 μL of the supernatants from the standard, control or experimental samples was pipetted onto the microplate. After incubation with conjugate solution, the concentration of each sample was calculated by OD450. The minimum detectable dose of HIV-1 Gag p24 ranged from 0.24–3.25 pg/mL.

Febrile pathogen detection

Pathogens in the conditioned medium were detected by the Febrile Antigens Kit (Rapid Labs). Briefly, 80 μL of the supernatants from the PC, NC or experimental samples was dispensed onto a 3 cm diameter circle. One drop of the antigen suspension was added to the sample. The reaction mixture was mixed well using a stirring stick, and the slide was rocked gently by hand for 1 min. The slides were immediately observed under suitable light for any degree of agglutination. Nonreactive: smooth suspension with no visible agglutination, as shown by the NC. Reactive: any degree of agglutination visible macroscopically.

NSC generation and characterization

NSC differentiation was conducted as previously described (Liu et al., 2012; Duan et al., 2015). In brief, iPSCs cultured on MEF feeder cells were differentiated with NIM-1 medium [50% Advanced DMEM/F12 (Invitrogen), 50% Neurobasal Medium (Invitrogen), 1× N2 Supplement (Invitrogen), 1× B27 Supplement (Invitrogen), 4 µmol/L CHIR99021 (Cellagentech), 3 µmol/L SB431542 (Cellagentech), 10 ng/mL human leukemia inhibitory factor (hLIF, Millipore), 2 µmol/L dorsomorphin (Sigma), 0.1 µmol/L Compound E (EMD Chemicals Inc.) and 2 mmol/L GlutaMAX (Invitrogen)]. Two days later, the medium was changed to NIM-2 medium (50% Advanced DMEM/F12, 50% Neurobasal Medium, 1× N2 Supplement, 1× B27 Supplement, 4 µmol/L CHIR99021, 3 µmol/L SB431542, 10 ng/mL hLIF, 0.1 µmol/L Compound E and 2 mmol/L GlutaMAX) for five more days. The NSCs were then generated and further cultured in NSC maintenance medium containing 50% Neurobasal Medium, 50% Advanced DMEM/F12, 1× N2 Supplement, 1× B27 Supplement, 2 mmol/L GlutaMAX, 3 μmol/L CHIR99021, 2 μmol/L SB431542 and 10 ng/mL hLIF.

Animal experiments

All animal experiments performed in this study were approved by the Chinese Academy of Science Institutional Animal Care and Use Committee. For the teratoma formation assay, 6-week-old male NOD-SCID mice were injected subcutaneously with 3 × 106 CS-iPSCs or GC-iPSCs in a Matrigel/mTeSR solution, as previously described (Zhang et al., 2015). Teratomas with a size of approximately 10 mm in diameter were collected and subjected to immunostaining. For the MSC in vivo imaging assay, 106 CS-MSCs or GC-MSCs expressing luciferase were transplanted into the TA muscle of 6-week-old male nude mice. The grafted cells were imaged with an IVIS spectrum imaging system (XENOGEN, Caliper) by detecting luciferase activity. To evaluate the potential tumorigenesis risk of GC-MSCs in vivo, a subcutaneous injection of GC-MSCs was performed in NSG mice. Human ESC (line H9) and U2-OS osteosarcoma cell lines were also implanted independently as positive controls. Fat pad transplantation was performed as previously described (Yu et al., 2016; Geng et al., 2018). CS-MSCs or GC-MSCs (1.5 × 105) were freshly collected and resuspended in Matrigel mixture containing 50% Matrigel, 20% FBS in PBS, and 0.01% Trypan Blue (Sigma). The mixture was then injected into the fat pads of 3-week-old female NOD-SCID mice. Four weeks later, the fat pads were harvested for measuring MSC-derived vessel regeneration by immunofluorescence staining.

Senescence-associated β-galactosidase (SA-β-Gal) staining assay

SA-β-Gal staining was performed according to a previously described method (Debacq-Chainiaux et al., 2009; Zhang et al., 2015; Pan et al., 2016; Geng et al., 2018; Wang et al., 2018b). Each experiment was performed in three independent replicates.

Clonal expansion assay

Approximately 2000 cells were seeded into each well of 12-well plates and cultured for 2 weeks. Then, the cells were stained with 0.2% crystal violet, and the intensity of the crystal violet staining was quantified by ImageJ software. Each experiment was performed in three independent replicates.

RT-qPCR

Total RNA was extracted with TRIzol reagent (Invitrogen), and 2 μg of total RNA was used for cDNA synthesis using a reverse transcription master mix (Promega). Quantitative real-time PCR was conducted with the iTaq Universal SYBR Green Super Mix (Bio-Rad) with the CFX384 Real-Time PCR system (Bio-Rad). All data were normalized to the 18S rRNA transcript and calculated using the ΔΔCq method. All RT-qPCR primer pairs are listed in Table S1.

Western blot

Western blot was performed as previously described (Wang et al., 2015, 2016). Briefly, protein quantification was conducted using a BCA Kit. Protein lysates were subjected to SDS-PAGE and subsequently electrotransferred to a polyvinylidene fluoride membrane (Millipore). The membrane was incubated with the indicated primary antibodies overnight at 4 °C and HRP-conjugated secondary antibodies for 1 h at room temperature (RT), followed by visualization using the ChemiDoc XRS system (Bio-Rad). Quantification was performed with ImageJ software.

Immunofluorescence

Immunofluorescence was conducted as previously described (Wang et al., 2016). Briefly, the cells were fixed with 4% paraformaldehyde for 25 min, permeabilized with Triton X-100 (0.3% in PBS) for 25 min, incubated with blocking buffer (10% donkey serum in PBS) for 1 h at RT, and stained with primary antibodies overnight at 4 °C. Then, the cells were incubated with secondary antibodies for 1 h at RT. Hoechst 33342 (Invitrogen) was used to stain nuclear DNA.

Analysis of apoptosis by flow cytometry

A FACS-based apoptosis analysis was performed as previously described (Fu et al., 2016; Pan et al., 2016). For ROS measurement, cells were collected and incubated with 1 μmol/L H2DCFDA for 30 min using ROS Detection Reagents (Molecular Probes, C6827). The cells were later analysed using the BD LSRFortessa cell analyser.

RNA sequencing library construction

Total RNA for each sample was extracted using the RNeasy Mini Kit (Qiagen) according to the manufacturer’s instructions. After quantification of the RNA by a fragment analyzer (Advanced Analytical), RNA sequencing libraries were constructed using the TruSeq RNA Sample Preparation Kit (Illumina) according to the manufacturer’s protocols. Paired-end sequencing was performed using the Illumina HiSeq X Ten platform.

RNA sequencing data processing

RNA-seq data processing was performed as previously described (Zhang et al., 2015, 2019; Geng et al., 2018; Wang et al., 2018a; Ling et al., 2019). In brief, sequencing reads were trimmed and mapped to the H. sapiens reference genome (hg19) with HISAT2 software (v2.0.4) (Kim et al., 2015). HTSeq (v0.10.0) was used to determine the transcriptional expression level of each gene (Anders et al., 2015). Differentially expressed genes (DEGs) were computed at a cut-off adjusted P value (Benjamini-Hochberg) less than 0.05 and |Log2(fold change)| more than 1 using DESeq2 (Love et al., 2014). Pearson’s correlation coefficient (R) and the Euclidian distance were calculated using R to evaluate the correlation between the replicates of each sample, which were based on Log2(FPKM + 1). PCA was also performed using R based on Log2(FPKM + 1). Gene ontology (GO) enrichment analysis was computed by Metascape (Tripathi et al., 2015). The enrichment networks were visualized using Cytoscape (Shannon et al., 2003). Protein-protein interaction networks of overlapping genes were drawn based on the search tool for the retrieval of interacting genes (STRING) database (Szklarczyk et al., 2017). The aging-associated genes were obtained from the human aging genomic resources (HAGR) database (Tacutu et al., 2013).

DNA extraction, library construction and sequencing

Genomic DNA was extracted from each sample using the QIAamp® DNA Mini Kit (Qiagen), according to the manufacturer’s protocol. DNA was randomly fragmented into ~300 bp lengths using a Covaris ultrasonic processor. DNA libraries were prepared with the NEBNext® UltraTM DNA library Prep Kit (Illumina) and quantified using a Qubit 2.0 Fluorometer (Life Technologies). The insert sizes of the fragments in the libraries were determined by the Agilent Bioanalyzer 2100. Paired-end sequencing was performed using the Illumina HiSeq X Ten platform.

Bioinformatics analyses of copy number variations, single-nucleotide variants and off-target sites

The pipeline of whole genome sequencing data processing used in this study has been described previously (Zhang et al., 2018). In brief, sequencing data were mapped to the H. sapiens reference genome (hg19) without repeat regions using the Burrows-Wheeler Aligner (BWA, version 0.7.17) (Li and Durbin, 2009). The genomic coverage for each 500 kb bin window was calculated and normalized by the average sequencing depth. The copy number variation (CNV) scatterplot was drawn by ggplot2. For the single-nucleotide variant (SNV) analysis, the read base sites with an incorrect base probability >0.001 were masked with N, and base distribution for each chromosomal location was calculated by pysamstats (version 1.0.1) (https://github.com/alimanfoo/pysamstats). The heterozygosity of each site was defined as the percentage of the second enriched base depth. SNV sites were defined by base heterozygosity (0%–30%). Potential indel sites were extracted with pysamstats (version 1.0.1) under default settings. Then indel sites were screened against sites existing in CS-iPSC genomic sequencing datasets, repeats and low-complexity regions annotated by RepeatMasker (db20170127), indel-type SNPs in humans and homopolymers. Simultaneously, 2034 off-target sites with no more than five mismatched sites were identified by Cas-OFFinder (Bae et al., 2014). None of these regions included indel sites identified by whole genome sequencing.

Statistical analysis

All results are presented as the mean ± SEM or mean ± SD. The data were statistically analysed using a two-tailed Student’s t-test to compare differences between treatments assuming equal variance with PRISM software (GraphPad 5 Software). P values <0.05, <0.01, and <0.001 were considered statistically significant (*, **, and ***, respectively).

Accession numbers

The sequencing data have been deposited in the NCBI Gene Expression Omnibus (GEO) under the accession number GSE124208 and in the NCBI Sequence Read Archive under accession number SRP174074.

Electronic supplementary material

Below is the link to the electronic supplementary material.

Supplementary material 1 (PDF 3822 kb)

Supplementary material 2 (XLSX 13 kb)

Si Wang, Zheying Min, and Qianzhao Ji have contributed equally.

Change history

1/15/2022

A Correction to this paper has been published: 10.1007/s13238-021-00901-3

Acknowledgements

The authors acknowledge L. Bai, R. Bai, Q. Chu, J. Lu, S. Ma and Y. Yang for administrative assistance and W. Li, J. Jia and X. Zhang for assistance with animal experiments. This work was supported by the National Key Research and Development Program of China (2018YFC2000100), the Strategic Priority Research Program of the Chinese Academy of Sciences (XDA16010100), the National Key Research and Development Program of China (2018YFA0107203, 2017YFA0103304, 2017YFA0102802, 2016YFC1000601, 2015CB964800, 2014CB910503, and 2018YFA0108500), the National Natural Science Foundation of China (Grant Nos. 81625009, 81330008, 91749202, 91749123, 31671429, 81671377, 81771515, 31601109, 31601158, 81701388, 81601233, 81822018, 81801399, 31801010, 81801370, 81861168034, 81571400, and 81771580), the Program of the Beijing Municipal Science and Technology Commission (Z151100003915072), the Key Research Program of the Chinese Academy of Sciences (KJZDEWTZ-L05), the Beijing Municipal Commission of Health and Family Planning (PXM2018_026283_000002) and the Advanced Innovation Center for Human Brain Protection (117212, 3500-1192012).

ReferencesAmaro-OrtizAYanBD’OrazioJAUltraviolet radiation, aging and the skin: prevention of damage by topical cAMP manipulationMolecules2014196202621924838074AndersSPylPTHuberWHTSeq: a Python framework to work with high-throughput sequencing dataBioinformatics20153116616925260700AndradeLNNathansonJLYeoGWMenckCFMuotriAREvidence for premature aging due to oxidative stress in iPSCs from Cockayne syndromeHum Mol Genet2012213825383422661500AndressooJOMitchellJRde WitJHoogstratenDVolkerMToussaintWSpeksnijderEBeemsRBvan SteegHJansJAn Xpd mouse model for the combined xeroderma pigmentosum/Cockayne syndrome exhibiting both cancer predisposition and segmental progeriaCancer Cell20061012113216904611BaeSParkJKimJSCas-OFFinder: a fast and versatile algorithm that searches for potential off-target sites of Cas9 RNA-guided endonucleasesBioinformatics2014301473147524463181CadetJSageEDoukiTUltraviolet radiation-mediated damage to cellular DNAMutat Res200557131715748634Castro-VinuelasRSanjurjo-RodriguezCPineiro-RamilMHermida-GomezTFuentes-BoqueteIMde Toro-SantosFJBlanco-GarciaFJDiaz-PradoSMInduced pluripotent stem cells for cartilage repair: current status and future perspectivesEur Cell Mater2018369610930204229CiaffardiniFNicolaiSCaputoMCanuGPaccosiECostantinoMFrontiniMBalajeeASProietti-De-SantisLThe cockayne syndrome B protein is essential for neuronal differentiation and neuritogenesisCell Death Dis20145e126824874740CleaverJELamETRevetIDisorders of nucleotide excision repair: the genetic and molecular basis of heterogeneityNat Rev Genet20091075676819809470Debacq-ChainiauxFErusalimskyJDCampisiJToussaintOProtocols to detect senescence-associated beta-galactosidase (SA-betagal) activity, a biomarker of senescent cells in culture and in vivoNat Protoc200941798180620010931DingZSuiLRenRLiuYXuXFuLBaiRYuanTHaoYZhangWA widely adaptable approach to generate integration-free iPSCs from non-invasively acquired human somatic cellsProtein 
Cell2015638638925412771DuanSYuanGLiuXRenRLiJZhangWWuJXuXFuLLiYPTEN deficiency reprogrammes human neural stem cells towards a glioblastoma stem cell-like phenotypeNat Commun201561006826632666FriedbergECHow nucleotide excision repair protects against cancerNat Rev Cancer20011223311900249FriedbergECDNA damage and repairNature200342143644012540918FuLNXuXLRenRTWuJZhangWQYangJPRenXQWangSZhaoYSunLModeling xeroderma pigmentosum associated neurological pathologies with patients-derived iPSCsProtein Cell2016721022126874523GengLLiuZZhangWLiWWuZWangWRenRSuYWangPSunLChemical screen identifies a geroprotective role of quercetin in premature agingProtein Cell201810.1007/s13238-018-0567-y30069858GolpanianSDiFedeDLPujolMVLoweryMHLevis-DusseauSGoldsteinBJSchulmanIHLongsomboonBWolfAKhanARationale and design of the allogeneiC human mesenchymal stem cells (hMSC) in patients with aging fRAilTy via intraveno US delivery (CRATUS) study: A phase I/II, randomized, blinded and placebo controlled trial to evaluate the safety and potential efficacy of allogeneic human mesenchymal stem cell infusion in patients with aging frailtyOncotarget20167118991191226933813GolpanianSDiFedeDLKhanASchulmanIHLandinAMTompkinsBAHeldmanAWMikiRGoldsteinBJMushtaqMAllogeneic human mesenchymal stem cell infusions for aging frailtyJ Gerontol A20177215051512GorgelsTGvan der PluijmIBrandtRMGarinisGAvan SteegHvan den AardwegGJansenGHRuijterJMBergenAAvan NorrenDRetinal degeneration and ionizing radiation hypersensitivity in a mouse model for Cockayne syndromeMol Cell Biol2007271433144117145777HishiyaAWatanabeKProgeroid syndrome as a model for impaired bone formation in senile osteoporosisJ Bone Miner Metab20042239940315316860JaarsmaDvan der PluijmIde WaardMCHaasdijkEDBrandtRVermeijMRijksenYMaasAvan SteegHHoeijmakersJHAge-related neuronal degeneration: complementary roles of nucleotide excision repair and transcription-coupled repair in preventing neuropathologyPLoS 
Genet20117e100240522174697KarikkinethACScheibye-KnudsenMFivensonECroteauDLBohrVACockayne syndrome: clinical features, model systems and pathwaysAgeing Res Rev20173331727507608KawamuraTSuzukiJWangYVMenendezSMoreraLBRayaAWahlGMIzpisua BelmonteJCLinking the p53 tumour suppressor pathway to somatic cell reprogrammingNature20094601140114419668186KempMGSpandauDFTraversJBImpact of age and insulin-like growth factor-1 on DNA damage responses in UV-irradiated human skinMolecules201722356KimDLangmeadBSalzbergSLHISAT: a fast spliced aligner with low memory requirementsNat Methods20151235736025751142KubbenNZhangWWangLVossTCYangJQuJLiuGHMisteliTRepression of the antioxidant NRF2 pathway in premature agingCell20161651361137427259148LaugelVCockayne syndrome: the expanding clinical and mutational spectrumMech Ageing Dev201313416117023428416LiHDurbinRFast and accurate short read alignment with Burrows–Wheeler transformBioinformatics2009251754176019451168LiYZhangWChangLHanYSunLGongXTangHLiuZDengHYeYVitamin C alleviates aging defects in a stem cell model for Werner syndromeProtein Cell2016747848827271327LingCLiuZSongMZhangWWangSLiuXMaSSunSFuLChuQModeling CADASIL vascular pathologies with patient-derived induced pluripotent stem cellsProtein Cell.20191024927130778920LiuGHBarkhoBZRuizSDiepDQuJYangSLPanopoulosADSuzukiKKurianLWalshCRecapitulation of premature ageing with iPSCs from Hutchinson–Gilford progeria syndromeNature201147222122521346760LiuGHSuzukiKQuJSancho-MartinezIYiFLiMKumarSNivetEKimJSoligallaRDTargeted gene correction of laminopathy-associated LMNA mutations in patient-specific iPSCsCell Stem Cell2011868869421596650LiuGHQuJSuzukiKNivetELiMMontserratNYiFXuXRuizSZhangWProgressive degeneration of human neural stem cells caused by pathogenic LRRK2Nature201249160360723075850LiuGHSuzukiKLiMQuJMontserratNTarantinoCGuYYiFXuXZhangWModelling Fanconi anemia pathogenesis and therapeutics using integration-free patient-derived iPSCsNat Commun20145433024999918LoveMIHuberWAndersSModerated 
estimation of fold change and dispersion for RNA-seq data with DESeq2Genome Biol20141555025516281McKayBCCabritaMAArresting transcription and sentencing the cell: the consequences of blocked transcriptionMech Ageing Dev201313424325223542592MullerLUMilsomMDHarrisCEVyasRBrummeKMParmarKMoreauLASchambachAParkIHLondonWBOvercoming reprogramming resistance of Fanconi anemia cellsBlood20121195449545722371882MuraiMEnokidoYInamuraNYoshinoMNakatsuYvan der HorstGTHoeijmakersJHTanakaKHatanakaHEarly postnatal ataxia and abnormal cerebellar development in mice lacking Xeroderma pigmentosum Group A and Cockayne syndrome Group B DNA repair genesProc Natl Acad Sci USA200198133791338411687625NataleVA comprehensive description of the severity groups in Cockayne syndromeAm J Med Genet A2011155A1081109521480477NewmanJCBaileyADWeinerAMCockayne syndrome group B protein (CSB) plays a general role in chromatin maintenance and remodelingProc Natl Acad Sci USA20061039613961816772382OkitaKMatsumuraYSatoYOkadaAMorizaneAOkamotoSHongHNakagawaMTanabeKTezukaKA more efficient method to generate integration-free human iPS cellsNat Methods2011840941221460823OrozcoLSolerRMoreraCAlbercaMSanchezAGarcia-SanchoJIntervertebral disc repair by autologous mesenchymal bone marrow cells: a pilot studyTransplantation20119282282821792091OrozcoLMunarASolerRAlbercaMSolerFHuguetMSentisJSanchezAGarcia-SanchoJTreatment of knee osteoarthritis with autologous mesenchymal stem cells: a pilot studyTransplantation2013951535154123680930OrozcoLMunarASolerRAlbercaMSolerFHuguetMSentisJSanchezAGarcia-SanchoJTreatment of knee osteoarthritis with autologous mesenchymal stem cells: two-year follow-up resultsTransplantation201497e66e6824887752PanHGuanDLiuXLiJWangLWuJZhouJZhangWRenRLiYSIRT6 safeguards human mesenchymal stem cells from oxidative stress by coactivating NRF2Cell Res20162619020526768768Peters DT, Cowan CA, Musunuru K (2008) Genome editing in human pluripotent stem cells. 
In: StemBook, CambridgeProietti-De-SantisLDranePEglyJMCockayne syndrome B protein regulates the transcriptional program after UV irradiationEMBO J2006251915192316601682RockxDAMasonRvan HoffenABartonMCCitterioEBregmanDBvan ZeelandAAVrielingHMullendersLHUV-induced inhibition of transcription involves repression of transcription initiation and phosphorylation of RNA polymerase IIProc Natl Acad Sci USA200097105031050810973477SaccoRTamblynLRajakulendranNBralhaFNTropepeVLaposaRRCockayne syndrome b maintains neural precursor functionDNA Repair20131211012023245699SetlowRBSetlowJKEvidence that ultraviolet-induced thymine dimers in DNA cause biological damageProc Natl Acad Sci USA1962481250125713910967ShannonPMarkielAOzierOBaligaNSWangJTRamageDAminNSchwikowskiBIdekerTCytoscape: a software environment for integrated models of biomolecular interaction networksGenome Res2003132498250414597658ShehataLSimeonovDRRaamsAWolfeLVanderverALiXHuangYGarnerSBoerkoelCFThurmAERCC6 dysfunction presenting as progressive neurological decline with brain hypomyelinationAm J Med Genet A2014164A2892290025251875ShimamotoAKagawaHZenshoKSeraYKazukiYOsakiMOshimuraMIshigakiYHamasakiKKodamaYReprogramming suppresses premature senescence phenotypes of Werner syndrome cells and maintains chromosomal stability over long-term culturePLoS ONE20149e11290025390333SoontararakSChowLJohnsonVCoyJWheatWReganDDowSMesenchymal stem cells (MSC) derived from induced pluripotent stem cells (iPSC) equivalent to adipose-derived MSC in promoting intestinal healing and microbiome normalization in mouse inflammatory bowel disease modelStem Cells Transl Med2018745646729635868SuzukiKTsunekawaYHernandez-BenitezRWuJZhuJKimEJHatanakaFYamamotoMAraokaTLiZIn vivo genome editing via CRISPR/Cas9 mediated homology-independent targeted integrationNature201654014414927851729SzklarczykDMorrisJHCookHKuhnMWyderSSimonovicMSantosADonchevaNTRothABorkPThe STRING database in 2017: quality-controlled protein-protein association networks, made 
broadly accessibleNucleic Acids Res201745D362D36827924014TacutuRCraigTBudovskyAWuttkeDLehmannGTaranukhaDCostaJFraifeldVEde MagalhaesJPHuman ageing genomic resources: integrated databases and tools for the biology and genetics of ageingNucleic Acids Res201341D1027D103323193293TompkinsBADiFedeDLKhanALandinAMSchulmanIHPujolMVHeldmanAWMikiRGoldschmidt-ClermontPJGoldsteinBJAllogeneic mesenchymal stem cells ameliorate aging frailty: a phase II randomized, double-blind, placebo-controlled clinical trialJ Gerontol A20177215131522TripathiSPohlMOZhouYRodriguez-FrandsenAWangGSteinDAMoultonHMDeJesusPCheJMulderLCMeta- and orthogonal integration of influenza “OMICs” data defines a role for UBR4 in virus buddingCell Host Microbe20151872373526651948van der HorstGTvan SteegHBergRJvan GoolAJde WitJWeedaGMorreauHBeemsRBvan KreijlCFde GruijlFRDefective transcription-coupled repair in Cockayne syndrome B mice is associated with skin cancer predispositionCell1997894254359150142van der HorstGTMeiraLGorgelsTGde WitJVelasco-MiguelSRichardsonJAKampYVreeswijkMPSmitBBootsmaDUVB radiation-induced cancer predisposition in Cockayne syndrome group A (Csa) mutant miceDNA Repair2002114315712509261van der PluijmIGarinisGABrandtRMGorgelsTGWijnhovenSWDiderichKEde WitJMitchellJRvan OostromCBeemsRImpaired genome maintenance suppresses the growth hormone–insulin-like growth factor 1 axis in mice with Cockayne syndromePLoS Biol20075e217326724Velez-CruzREglyJMCockayne syndrome group B (CSB) protein: at the crossroads of transcriptional networksMech Ageing Dev201313423424223562425Velez-CruzRZadorinASCoinFEglyJMSirt1 suppresses RNA synthesis after UV irradiation in combined xeroderma pigmentosum group D/Cockayne syndrome (XP-D/CS) cellsProc Natl Acad Sci USA2013110E212E22023267107VessoniATHeraiRHKarpiakJVLealAMTrujilloCAQuinetAAgnez LimaLFMenckCFMuotriARCockayne syndrome-derived neurons display reduced synapse density and altered neural network synchronyHum Mol 
Genet2016251271128026755826WangSWangXWuYHanCIGF-1R signaling is essential for the proliferation of cultured mouse spermatogonial stem cells by promoting the G2/M progression of the cell cycleStem Cells Dev20152447148325356638WangSWangXMaLLinXZhangDLiZWuYZhengCFengXLiaoSRetinoic acid is sufficient for the in vitro induction of mouse spermatocytesStem Cell Rep201678094WangLXYiFFuLNYangJPWangSWangZXSuzukiKSunLXuXLYuYCRISPR/Cas9-mediated targeted gene correction in amyotrophic lateral sclerosis patient iPSCsProtein Cell2017836537828401346WangPLiuZZhangXLiJSunLJuZLiJChanPLiuGHZhangWCRISPR/Cas9-mediated gene knockout reveals a guardian role of NF-kappaB/RelA in maintaining the homeostasis of human vascular cellsProtein Cell2018994596529968158WangSHuBDingZDangYWuJLiDLiuXXiaoBZhangWRenRATF6 safeguards organelle homeostasis and cellular aging in human mesenchymal stem cellsCell Discov20184229423270WangSLiuZYeYLiBLiuTZhangWLiuGHZhangYAQuJXuDEctopic hTERT expression facilitates reprograming of fibroblasts derived from patients with Werner syndrome as a WS cellular modelCell Death Dis2018992330206203WuZZhangWSongMWangWWeiGLiWLeiJHuangYSangYChanPDifferential stem cell aging kinetics in Hutchinson–Gilford progeria syndrome and Werner syndromeProtein Cell2018933335029476423YamadaAMasutaniCHanaokaFDetection of reduced RNA synthesis in UV-irradiated Cockayne syndrome group B cells using an isolated nuclear systemBiochim Biophys Acta2002159212913412379475YanPLiQWangLLuPSuzukiKLiuZLeiJLiWHeXWangSFOXO3-engineered human ESC-derived vascular cells Promote vascular protection and regenerationCell Stem Cell201910.1016/j.stem.2018.12.00231173712YangJLiJSuzukiKLiuXWuJZhangWRenRZhangWChanPIzpisua BelmonteJCGenetic enhancement in cultured human adult stem cells conferred by a single nucleotide recodingCell Res2017271178118128685772YuQCSongWWangDZengYAIdentification of blood vascular endothelial stem cells by the expression of protein C receptorCell 
Res2016261079109827364685ZhangWLiJSuzukiKQuJWangPZhouJLiuXRenRXuXOcampoAAging stem cells. A Werner syndrome stem cell model unveils heterochromatin alterations as a driver of human agingScience20153481160116325931448ZhangWWanHFengGQuJWangJJingYRenRLiuZZhangLChenZSIRT6 deficiency results in developmental retardation in cynomolgus monkeysNature201856066166530135584ZhangXLiuZLiuXWangSZhangYHeXSunSMaSShyh-ChangNLiuFTelomere-dependent and telomere-independent roles of RAP1 in regulating human stem cell homeostasisProtein Cell201910.1007/s13238-019-0610-731781970
diff --git a/jcore-pmc-db-reader/src/test/resources/testdocs/PMC7511315.xml b/jcore-pmc-db-reader/src/test/resources/testdocs/PMC7511315.xml new file mode 100644 index 000000000..b28626ba1 --- /dev/null +++ b/jcore-pmc-db-reader/src/test/resources/testdocs/PMC7511315.xml @@ -0,0 +1,28 @@ + +
pmcNat CommunNat CommunNature Communications2041-1723Nature Publishing Group UKLondon32968055PMC75113151839610.1038/s41467-020-18396-7ArticleTranscriptomic profiling of human cardiac cells predicts protein kinase inhibitor-associated cardiotoxicityhttp://orcid.org/0000-0002-1664-7314van HasseltJ. G. Coen12RahmanRayees1http://orcid.org/0000-0002-1362-6534HansenJens1SternAlan1ShimJaehee V.1XiongYuguang1PickardAmanda1JayaramanGomathi1HuBin1MahajanMilind3GalloJames M.14GoldfarbJoseph1SobieEric A.1http://orcid.org/0000-0002-0341-0705BirtwistleMarc R.15http://orcid.org/0000-0003-4007-7814SchlessingerAvner
avner.schlessinger@mssm.edu
1
http://orcid.org/0000-0001-6137-109XAzelogluEvren U.
evren.azeloglu@mssm.edu
16
http://orcid.org/0000-0002-7814-0180IyengarRavi
ravi.iyengar@mssm.edu
1
grid.59734.3c0000 0001 0670 2351Department of Pharmacological Sciences and Systems Biology Center New York, Icahn School of Medicine at Mount Sinai, New York, NY USA grid.5132.50000 0001 2312 1970Division of Systems Biomedicine and Pharmacology, Leiden Academic Centre for Drug Research, Leiden University, Leiden, Netherlands grid.59734.3c0000 0001 0670 2351Department of Genetics and Genomic Sciences, and Icahn Institute for Genomic Sciences and Multiscale Biology, Icahn School of Medicine at Mount Sinai, New York, NY USA grid.273335.30000 0004 1936 9887Department of Pharmaceutical Sciences, School of Pharmacy and Pharmaceutical Sciences, University at Buffalo, Buffalo, NY USA grid.26090.3d0000 0001 0665 0280Department of Chemical and Biomolecular Engineering, Clemson University, Clemson, SC USA grid.59734.3c0000 0001 0670 2351Deparment of Medicine, Division of Nephrology, Icahn School of Medicine at Mount Sinai, New York, NY USA
2392020239202020201148091220171882020© The Author(s) 2020https://creativecommons.org/licenses/by/4.0/Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons license, and indicate if changes were made. The images or other third party material in this article are included in the article’s Creative Commons license, unless indicated otherwise in a credit line to the material. If material is not included in the article’s Creative Commons license and your intended use is not permitted by statutory regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/.

Kinase inhibitors (KIs) represent an important class of anti-cancer drugs. Although cardiotoxicity is a serious adverse event associated with several KIs, the reasons remain poorly understood, and its prediction remains challenging. We obtain transcriptional profiles of human heart-derived primary cardiomyocyte like cell lines treated with a panel of 26 FDA-approved KIs and classify their effects on subcellular pathways and processes. Individual cardiotoxicity patient reports for these KIs, obtained from the FDA Adverse Event Reporting System, are used to compute relative risk scores. These are then combined with the cell line-derived transcriptomic datasets through elastic net regression analysis to identify a gene signature that can predict risk of cardiotoxicity. We also identify relationships between cardiotoxicity risk and structural/binding profiles of individual KIs. We conclude that acute transcriptomic changes in cell-based assays combined with drug substructures are predictive of KI-induced cardiotoxicity risk, and that they can be informative for future drug discovery.

Cardiotoxic adverse events associated with kinase inhibitors are a growing concern in clinical oncology. Here the authors use cellular transcriptomic responses of human cardiomyocytes treated with protein kinase inhibitors and the associated drug structural signatures to determine an integrated predictive signature of cardiotoxicity.

Subject termsToxicologyPredictive markersCardiologyhttps://doi.org/10.13039/100000051U.S. Department of Health & Human Services | NIH | National Human Genome Research Institute (NHGRI)U54HG008098IyengarRaviissue-copyright-statement© The Author(s) 2020
Introduction

Protein kinase inhibitors (KIs) are an important class of therapeutics used for the treatment of various forms of cancer1,2 and other diseases. There are currently more than 48 KIs approved for clinical use by the U.S. Food and Drug Administration (FDA) and other regulatory agencies3, and more than 250 KIs are undergoing clinical trials or are in development46. The clinical effectiveness of KIs as cancer drugs has led to a broad effort to develop drugs that are more efficacious and have reduced the propensity for adverse events. Cardiotoxicity (CT) is a clinically important adverse event associated with several KIs710. KI-associated CT manifests as loss of cardiomyocyte function, which can lead to heart failure11. Given the extensive therapeutic potential of KIs, approaches to identify and subsequently mitigate the risk for CT during early development of novel KIs and during clinical administration are urgently required.

We do not yet sufficiently understand the mechanisms underlying KI-associated CT. The human kinome consists of more than 500 protein kinases12. Given that many KIs exhibit multitarget pharmacology13, inhibition of multiple protein kinases in cardiomyocytes may lead to adverse drug effects such as CT14. For individual KIs, pathways involved in mitochondrial function8,15,16, endoplasmic reticulum stress response16, and AMPK inhibition17, have been shown to be associated with KI-induced CT18. Overall, however, the general mechanisms of KI-induced CT are still poorly understood18.

Obtaining quantitative clinical risk scores for KI-associated CT is also challenging, as the risk for KI-associated CT has not been systematically studied. The FDA adverse event report system (FAERS) database has been previously applied to quantify the risk of ADRs1921. The FAERS database contains over 9 million individual drug-associated adverse-event reports reported by industry and physicians. Through statistical analyses of the FAERS database, relatively unbiased estimates for the relative risk for specific ADRs can be computed. Such risk scores are clinically relevant as they are based on real-life patient population, and they are not solely based on selected patient cohorts. We previously used such analyses of the FAERS database in combination with systems’ pharmacology-based approaches to obtain mechanistic insights into adverse-event mechanisms21,22.

In the current study, generated as part of the NIH-funded Library of Integrated Network Based Cellular Signatures (LINCS) Drug Toxicity Signature Generation Center (DToxS), we take a top–down global approach to determine if a comprehensive profiling of gene expression changes in human cardiomyocytes can provide insight into pathways associated with KI-induced CT, and to potentially predict the risk of CT. The rationale for this approach is based on the central assumption that CT largely originates from cardiomyocytes where one or more protein kinases contribute to the pathophysiology. Since progression to heart failure takes several months to manifest, it is not immediately obvious if gene expression changes measured after drug treatment for a few days would have predictive value. Thus, a second important assumption is that early changes in gene expression upon drug treatment of cardiomyocytes are indicative of later physiological events. We test the validity of our assumptions by experimentally obtaining gene-expression patterns for the different KIs, and if these patterns could be selectively associated with the clinical risk of CT for each KI, thereby providing gene-expression signatures for KI-associated CT.

We report the generation of transcriptomic profiles from four human primary cardiomyocyte-like cell lines. These profiles are generated using 23 KIs that were FDA-approved and used extensively at the time of experimental design, such that an adequate number of clinical reports have been collected. Drugs are used at their imputed therapeutic concentrations. Through this pan-KI transcriptomic profiling, we obtained insights into the affected pathways that may be related to KI-associated CT. We show that selective patterns of gene expression can be associated with the FAERS-derived clinical risk for KI-associated CT, which may be highly relevant to identify KI drug candidates at risk for showing clinical CT. We also describe the relationships between KI CT risk and structural properties of KIs, highlighting the potential for re-engineering small molecules that exhibit a high risk for CT.

ResultsDifferences in CT risk of kinase inhibitors

In order to obtain unbiased estimates of clinical risk of KI-associated CT, we analyzed individual adverse-event reporting data from FAERS (Fig. 1a). Reporting odds ratios (RORs) were derived based on the relative frequencies of AE occurrence of each KI compared to all KIs. These risk scores provide a relative ranking of KI-associated toxicity. Kinase inhibitors were shown to have pronounced differences in the relative risk of CT (Fig. 1b). When comparing the ranking of risk scores derived from FAERS with adverse drug-reaction (ADR) reporting data from the World Health Organization (WHO) ADR reporting database, we find that the ranking from these databases largely agrees (Fig. 1c), indicating the general consistency of the clinical risk scores across databases.Cardiotoxicity of protein kinase inhibitors.

a Approach to quantify relative clinical cardiotoxicity risk scores for kinase inhibitors from the FDA Adverse Event Reporting System (FAERS) database. b Reporting odds ratio (mean and 95% confidence interval of computed odds ratio) for cardiotoxicity across kinase inhibitors from FAERS. c Comparison of ranking derived from FAERS and WHO Pharmacovigilance data shows agreement. d Literature-reported in vitro and in vivo preclinical assays to predict KI-associated cardiotoxicity poorly correlated with clinical FAERS-derived risk scores for cardiotoxicity at clinical drug concentrations. e In vitro dose–response experiments for selected KIs for viability and mitochondrial stress poorly correlate with clinical FAERS-derived risk scores for cardiotoxicity. Source data are provided in source data file.

Phenotypic assays poorly correlate with CT

We performed a literature review for in vitro and in vivo experimental datasets that aimed to predict CT risk based on phenotypic readouts, such as cell viability or beating rate from in vitro cardiomyocyte or animal models, to determine if such phenotypic experiments can predict the clinical risk scores for CT. Studies in which drugs at the clinical concentration induced more than a 20% change in various phenotypic readouts compared to control experiments were classified as predicting potential CT (Fig. 1d). Across these studies, it was apparent that there was no identifiable relationship between apparent experimental toxicity in comparison to the relative incidence of CT in patients as derived from FAERS.

We conducted dose–response experiments with selected KIs that had varying risks for CT using the cardiomyocyte cell lines that were used in the current study for transcriptomic profiling, quantifying cell viability, and mitochondrial stress after 48 h of exposure to the selected KIs. We again assessed if drugs caused more than a 20% change in cell viability and mitochondrial stress at the typical clinically used concentration (Supplementary Table 1). These studies showed a similar lack of correlation with clinical risk (Fig. 1e, Supplementary Fig. 1). These findings underscore the need for alternative approaches such as early molecular signatures for CT. This identified lack of the predictiveness of preclinical in vitro and in vivo phenotypic assays, as has been noted by others7.

Transcriptomic profiling of human primary cardiomyocyte-like cell lines

To study the transcriptomic response to KIs associated with CT, we obtained four primary cardiomyocyte lines that were isolated from ventricles of healthy adult human heart (two male and two female, PromoCell GmbH, Germany). Culture conditions, detailed phenotypic characterization of each cell line, including gene and protein expression, morphology, and functional assays, can be found on the DToxS Center website (www.dtoxs.org) under the “Cellular Metadata” section.

Confluent cardiomyocyte-like cells were treated with drugs for 48 h at concentrations similar to their clinical concentration (Supplementary Table 1) with 3–4 replicates and 3–4 cell lines (Supplementary Table 2), after which RNA was extracted and sequenced using the 3′ digital gene-expression method23 (Fig. 2a).Overview of pan-KI transcriptomic profiling in human primary cardiomyocyte-like cells.

a Overview of experimental approach to generate transcriptomic data. For each drug, genes were ranked by absolute mean fold-change gene-expression value across replicates (>3 biological replicates) and cell lines (a total of 1309 experiments), and the top 250 genes for each KI were kept. Information about the total number of replicates can be found in the source data file. b Jaccard similarity of gene-expression signature of PromoCell cardiomyocyte cell lines (102 samples) to gene-expression signatures of tissues available in the GTeX database (17,382 total samples). Boxplot whiskers refer to the upper and lower quartile of all pairwise Jaccard coefficients between each sample, within each tissue type. Information about each boxplot’s sample size, minima, maxima, and center is provided in the source data file. c Heatmap depicting the Jaccard index that indicates the magnitude of similarity in top-ranking differentially expressed genes for all KI pairs. d First three principal components (PCs) based on full mean fold-change gene-expression profiles across KIs. Source data are provided in source data file.

We investigated if transcriptomic profiles of PromoCell cardiomyocytes are related to human heart tissue and hence a good model to study CT. We compared the gene-expression similarity of untreated PromoCell cardiomyocytes against tissues available in the Genotype-Tissue Expression (GTEx) project, which contains gene-expression data from many human tissues, including the heart (Fig. 2b)24. Using the Jaccard distance for the top expressed 250 genes (based on transcript per million counts) for both untreated PromoCell and GTEx tissues, we observe that PromoCell cardiomyocytes’ expression exhibits a gene expression similar to blood (rank 2), muscle (rank 4), and heart (rank 10) tissue. Based on these results, we conclude that the PromoCell cardiomyocytes can offer comparable gene-expression changes to that of cardiomyocytes.

Limited overlap in differentially expressed genes across KIs

Differential gene-expression fold-change values were computed across the four cell lines. Initial analyses showed that the DEGs generally clustered more strongly by drugs than by cells. We calculated median fold-change values for each KI across cell lines, resulting in a single gene- expression profile for each KI. Ranked gene lists for each KI were generated by ranking by differential gene-expression p value and keeping the top 250 genes. To assess the similarity between genes present in the top 250 genes for each KI, the Jaccard index was calculated for each ranked list of KI-specific genes, which indicated a limited overlap (<0.25) between the top 250 genes across KIs (Fig. 2c). Principal component analysis showed variable gene-expression patterns for nine KIs, while for the remaining KIs, little variation in gene expression was seen (Fig. 2d), even though these remaining KIs included drugs for which CT is well established. We concluded that ranked differential gene-expression values would not be sufficient to provide clear insights into gene-expression profiles associated with CT.

Pathways correlated with KI-associated CT

To identify pathways and subcellular processes across KIs and their potential involvement with CT, we performed enrichment analysis for protein kinases and KEGG terms using the top 250 differentially expressed genes ranked by p value across cell lines and KIs. We then correlated p values of enriched terms with clinical FAERS-derived risk scores to identify potential kinases and pathways associated with CT risk (Fig. 3a). The protein kinase LIMK2, which is involved in actin cytoskeleton reorganization pathways, ranked the highest in its correlation specifically enriched for KIs with a higher risk score (Fig. 3b). Sucrose- and pyruvate-metabolism pathways were the most strongly enriched pathways correlating with high risk scores (Fig. 3c). However, since no directionality in pathways is considered in these enrichment analyses, both the positively and negatively correlated processes may play a role in the development of CT. When considering enriched protein kinases and KEGG processes across all KIs without considering correlation to CT risk, multiple pathways were identified (Supplementary Fig. 2). These findings indicate that there is likely substantial complexity underlying the action of KI in cardiomyocytes, although currently these analyses remain correlational and do not offer proof of causal relationships.Analysis of transcriptomic profiling data in relation to cardiotoxicity risk.

a Flowchart indicating ranked lists of top 250 differentially expressed genes ranked by p value for each kinase inhibitor across cell lines from the transcriptomic cardiomyocyte profiling, which were then enriched and subsequently related to clinical cardiotoxicity risk scores. Enriched kinases (b) and enriched KEGG pathways (c) (p < 0.05) that show a correlation coefficient > |0.25| with cardiotoxicity risk scores and the associated enrichment p values. Source data are provided in source data file.

Transcriptomic signature to predict CT risk

We tested if our KI-wide fold-change gene-expression profiles correlated with the KI-specific clinical risk scores for CT to identify a predictive transcriptomic signature for CT risk. Given the limited similarity between top-ranking gene-expression profiles across KIs, the entirety of the gene- expression profiles for different KIs were considered as potential predictors for KI-associated CT risk. KI-specific expression profiles of 10,749 genes were available as potential predictors for KI-specific CT risk scores. To identify genes most strongly associated with CT risk, we used an elastic net-penalized regression approach, which aims to select the most predictive variables while avoiding overfitting25.

A two-stage regression analysis was performed (Fig. 4a). From the available 23 KIs with the associated clinical CT risk scores, we randomly left out 2 KIs for external validation of the model (test set, 10% of data). The differential gene-expression profiles of 21 remaining KIs were then used to train the model. Given the limited number of available drugs, small changes in expression patterns for drug were expected to affect the identity of the overall set of predictor genes. Therefore, we generated bootstrap datasets by random resampling of KI risk and the associated gene-expression profiles. These bootstrapped datasets were then fit using elastic net models. This first step was performed to identify gene-based predictors that could consistently predict CT risk and contributed significantly to the prediction of this risk. The bootstrap analysis resulted in stable selection of potential predictors. Predictors to be included in the final elastic net regression model were selected based on their minimal root-mean-squared prediction error (RMSE) after cross- validation. Based on this cross-validation, the gene-expression-based predictors in the final elastic net models consisted of 26 genes with the associated variable importance values (Fig. 4b).Regression analysis for transcriptomic signatures to predict clinical risk.

a Overview of processing and elastic net regression analysis of transcriptomic data in combination with FAERS-derived clinical risk scores. b Transcriptomic signature genes selected to predict cardiotoxicity risk score indicating their variable importance. c Observed and predicted risk scores from the elastic net cross- validation analysis (mean and standard deviation). d External validation of the signature for six kinase inhibitors: regorafenib (REG), sunitinib (SUN), ibrutinib (IBR), lenvatinib (LEN), nintendinib (NIN), and ceritinib (CER).

Repeated cross-validation analyses indicated good predictive performance of the model for left-out KIs (Fig. 4c). We evaluated our 26-gene signature for predicting CT risk on an independent validation set of six KIs, of which three KIs were previously untested (Fig. 4d). We note that the independent validation set was performed 1 year after the original signatures were generated, using a different experimental protocol for the transcriptomic assay that was based on mRNA detection using random primers. We observed accurate predictive performance for five out of six KIs tested. The outlier, ibrutinib, had the lowest, albeit acceptable, predictive performance, with an error of 0.493 between the predicted and observed risk scores. Taken together, the developed signature can be of relevance to support risk prioritization of newly developed KIs. When we tested which of the 21 KIs drove the prediction strength of the model, we found that excluding any of four low-CT risk drugs (cabozantinib, tofacitinib, pazopanib, and erlotinib) increased the error substantially, indicating that these KIs contribute distinct information to the signature. In contrast, several of the high-ranking CT drugs could be excluded without sacrificing accuracy (Supplementary Fig. 3).

We then used the 26-gene signature to construct a protein–protein interaction network analysis to identify protein kinases and transcription factors associated with the signature (Supplementary Fig. 4). Several protein kinases were retrieved that are both known targets of the studied KIs, and which may be associated with the occurrence of KI-induced CT.

Chemical structures of KIs inform CT risk

Off-target binding or polypharmacology is commonly observed in KIs23. Since off-target binding is dependent on the structure of the drug, we investigated the relationship between kinase inhibitor chemical structure, binding target profile, and CT risk. To do this, we generated a structure–activity–similarity (SAS) map of the 26 tested inhibitors (in both the training and validation set) and their CT-risk score (Fig. 5A)26. SAS maps can be divided into four quadrants: the upper-left quadrant shows KI pairs with low chemical similarity and large changes in CT risk. The lower-left quadrant describes largely dissimilar KI pairs with small changes in CT risk. The lower-right quadrant describes KI pairs that exhibit a “smooth” structure–activity relationship, that is, small changes in chemical similarity are associated with small changes in CT risk. Finally, the upper-right quadrant indicates highly chemically similar compounds with large changes in CT risk.Structure–activity–similarity (SAS) maps of kinase inhibitor activity and cardiotoxicity.

a A SAS map relating pairwise chemical similarity measured by Tanimoto coefficient (Tc) derived from a weighted average of 4 chemical fingerprints (ECFP4, ECFP2, Daylight, and MACCS), between pairs of 26 kinase inhibitors (Table 1) and their difference in cardiotoxicity scores (DCS). The threshold for chemical similarity was the top 10% value in the distribution of Tc values: 0.38. The threshold value for DCS was half of the maximum DCS score: 0.82. b Highlighted chemical scaffolds for distinct kinase inhibitors observed in the upper- and lower-right regions. c Binding profile of kinase inhibitors based on data from Klaeger et al.5. Kinase inhibitors were hierarchically clustered based on chemical similarity, and kinase inhibitors are annotated by their binding mode (e.g., type I, type I1/2, type II, type III, type IV, or type VI)6. d Kinase inhibitor selectivity scores at 500 nM Kd. e Observed cardiotoxicity risk scores were normalized to zero and ordered based on hierarchical clustering of the kinase inhibitors. f Predicted cardiotoxicity risk scores were normalized to zero and ordered based on hierarchical clustering of the kinase inhibitors. g Absolute error from observed and predicted cardiotoxicity risk scores. Source data are provided in source data file.

KI pairs in the upper-right region represent activity cliffs, that is, that small changes in chemical structure are associated with large changes in CT risk. In this region, we find several KI pairs, in particular, we observe large activity cliffs between afatinib and bosutinib as well as bosutinib and erlotinib. Here, all four compounds have the same chemical core (Fig. 5b); however, both afatinib and erlotinib show respectively lower CT risk scores compared to bosutinib. We hypothesized that harmonization of drug substructure, similarity, and promiscuity in the context of kinase inhibitor type may inform on our ability to predict CT risk (Fig. 5c).

By investigating their KI target profiles, we observe that both afatinib and erlotinib are less promiscuous KIs compared to bosutinib (which is one of the most promiscuous KIs in this set, Fig. 5d), and they both inhibit EGFR at nanomolar concentrations. On the other hand, less promiscuous KIs, such as lapatinib and gefitinib, exhibit a comparably lower CT risk score (Fig. 5e). Indeed, we observe a correlation between kinase inhibitor promiscuity and the observed CT risk score (Supplementary Fig. 5). However, KI promiscuity may not be the sole determinant of CT risk. For example, KIs such as imatinib and nilotinib are not as promiscuous as bosutinib; however, both exhibit relatively high CT risk scores. In this case, both imatinib and nilotinib CT may be explained due to their similar chemical structure and high specificity for protein kinases such as DDR1 and ABL.

Finally, kinase inhibitors have distinct binding modes against their targets6,27,28. Kinase inhibitors that bind their kinase targets can be classified based on their binding mode, including the kinase conformation they bind and/or type of interactions they make with their kinase targets (e.g., covalent vs. noncovalent)6,27,29. For example, type I inhibitors bind an active kinase conformation, while Type I1/2, II–V bind distinct inactive states (Methods); type VI KI binds the kinase target covalently. We do not observe a clear relationship between kinase inhibitor-binding mode and CT. For example, the type II inhibitors imatinib and nilotinib are observed to have a high CT risk, while the type II inhibitors sorafenib and regorafenib have comparatively lower observed CT risk. However, both pairs of inhibitors are highly chemically similar and have similar binding targets. Taken together, the observed CT risk of a KI may be related to both a kinase inhibitor’s selectivity and its chemical structure. Furthermore, we observe a relationship between chemical structure and binding target similarity to the predictive performance of our signature (Fig. 5e–g).

Discussion

The occurrence of drug treatment-associated CT, leading to decreased cardiac function, follows the therapeutic effects of the drugs, and is only observed in a subset of the patients using the drug. This raises the question of whether it would be possible to obtain early cell-based signatures predictive for drug toxicity. Here we addressed this question by attempting to associate drug treatment-induced gene-expression patterns with the clinical risk for the adverse events of interest.

By estimating clinical risk from the FAERS database, our method utilizes a relevant and unbiased approach for the quantification of CT risk. As a result, our CT risk scores lack notable pitfalls such as selection bias associated with tightly controlled clinical trials, which underestimate adverse-event risks due to cohort size, trial duration, and selective inclusion criteria for subjects. Nevertheless, there are limitations to the FAERS database as well, which we have discussed and addressed in previous work22. Specifically, use of the FAERS resource may confound demographics information such as age and sex, which was observed not to vary across different KIs. Moreover, CT risk score does not reflect absolute risk for developing CT. Rather, it reflects the relative risk for a subset of patients for which drug-associated adverse events were reported. In addition, there may be some systematic biases based on the sampling frequency of drugs by institution.

It remains unclear if all KIs induce CT through similar mechanisms, and to what extent ultimate clinical pathologies are similar. While the FAERS database allows us to distinguish between different types of CT, the annotation is not uniform and may either refer to distinct pathophysiological descriptions or rather more general clinical presentations of heart failure. To this end, we chose to lump all forms of heart failure, while excluding cardiac AEs that have known and unrelated origin such as coronary artery disease and arrhythmias.

We compared KI-associated transcriptomic response profiles generated from cultured human primary cardiomyocyte-like cells with clinical CT risk scores to obtain a reduced set of genes that may predict the relative risk for KI-associated CT. Using the clinically weighted signatures and the associated regression coefficients identified in the elastic net model, the relative risk for CT can be predicted. The risks predicted by our signatures and the associated regression model can be used in drug development to rank the potential risk of novel KIs with respect to existing KIs with better characterized clinical risks for CT.

The signatures generally showed good prediction of CT risk during cross-validation as well as on an independent set of KIs (Fig. 4), while the only poorly performant KI, ibrutinib, inhibitor of Bruton nonreceptor protein-tyrosine kinase, represents a unique KI in terms of binding mode (i.e., type VI inhibitor) and high promiscuity (Fig. 5). Specifically, it is a member of an emerging class of kinase inhibitor drugs that bind their targets covalently (type VI KIs). These drugs are highly underrepresented in the databases used in this analysis, explaining the misclassification of ibrutinib30.

The four cell lines we studied are insufficient to fully capture such human variability to KIs. Therefore, in our analysis, we used median fold-change gene-expression profiles across multiple cell lines. The resulting averaged gene-expression profiles thus reflect relatively consistent changes in gene expression across cell lines, i.e., changes in gene expression that are less likely to be highly variable across cell lines, yet may also reflect a set of predictors that may be more consistent in the population. Given that the FAERS CT risk scores also reflect a population-level CT risk, the use of these median values in fold-change gene-expression values is a reasonable starting point for our analyses.

The experimental underpinning of the transcriptomic profiles generated in this study makes them likely to be of value in selecting drug candidates with a low risk for CT as an adverse event. Our analysis is based on primary human heart-derived cardiomyocyte-like cells. Although these cell lines do have phenotypic limitations due to dedifferentiation, the signatures obtained from the cells could be relevant for prediction of clinical drug effects. These cell lines may be reflective of human cardiac pharmacology, i.e., in comparison with animal-derived cardiomyocytes, even though further characterization and standardization are still needed. Detailed characterization of these cell lines is available as metadata to the RNAseq datasets at www.dtoxs.org. Our analyses used drug exposures similar to clinically reported maximum plasma concentrations of the individual KIs, rather than using the same concentrations for all KIs, even though we did not correct for protein binding. We expect that the duration of 48-h exposure may reflect transcriptomic changes that are likely related to early changes in subcellular processes associated with the adverse event of interest.

Unfortunately, in this study, it is not feasible or ethically possible, due to lack of prior informed consent, to compare cardiac gene-expression signatures with gene-expression profiles from patients who receive KI-therapy and/or who developed KI-associated CT. We considered whether we could compare our gene-expression signatures to cardiac gene-expression data from patients with heart failure who undergo surgery. Typically these are patients with advanced disease, and the gene expression in tissue from advanced disease is not likely to be of relevance to acute drug-induced CT.

By investigating the chemical structure and binding profile similarity of KIs, we are able to observe that chemical components and scaffolds that lead to promiscuous binding of KIs to multiple binding targets are correlated with higher CT values. This is consistent with the notion that a portion of CT risk of KIs can be attributed to higher levels of off-target interactions. Indeed, when we investigate the binding profile of three chemically similar KIs: afatinib, erlotinib, and gefitinib, we find that their binding profiles are fairly specific compared to other KIs, and they have a lower normalized CT risk score. One limitation we have observed with our approach is that chemically distinct KIs (e.g., in terms of binding profile, substructural similarity, and type), such as the type VI inhibitor ibrutinib, exhibit diminished predictive performance. However, we think that using the guidelines we provide herein, this signature could still assist in the development and prioritization of KIs with lower toxicity risks.

We cautiously anticipate that clinically weighted transcriptomic signatures such as those developed in this study may be of relevance to guide safety assessment in early drug development. Unlike the relatively well-established assessment of electrophysiological safety issues such as QT prolongation, the assessment of non-QT type of CT associated with KI16 and other novel drugs31, lacks reliable biomarkers. The transcriptomic signature for CT identified in this study may help fill this gap, especially if its structure and binding profiles are closely represented within the inhibitors in this study. One could anticipate that after initial selection of promising KIs with apparent efficacy in preclinical screens, transcriptomic profiling using the signatures developed here may possibly be used to rank drugs for the expected CT risk and exclude those with high CT risk scores (Supplementary Fig. 6).

While beyond the scope of this study, future extension of our studies could explore the idea of studying individualized risk scores for CT. That is, do baseline gene-expression profiles of larger libraries of patient-derived cardiomyocyte cell lines predict the difference in risk for CT between individual patients? Ideally, such an analysis would be conducted using induced pluripotent stem cell-derived cardiomyocytes from patients, who have received KIs and experienced different levels of CT, such as was recently described for anthracycline chemotherapeutics32. This would then further enable the development of precision medicine approaches to KI therapy that could minimize the risk for CT.

MethodsCell culture and drug treatment

Adult human cardiomyocytes (Cat #: C12810) were purchased from PromoCell GmbH (Heidelberg, Germany) and grown in culture as per the manufacturer’s instructions. Four different cell line lots (Lot #: 3042901.2, 4031101.3, 2082801.2, and 2120301.2) isolated from two male and two female subjects were cultured under serum-free differentiation conditions for at least 28 days prior to drug treatment. Details regarding metadata information, including cell line metadata and the quality control and assurance metrics, can be found on www.dtoxs.org.

Dose–response experiments

For two of the four cell lines, dose–response experiments were conducted treating cells for 48 h with eight increasing perturbagen concentrations (5 nM, 50 nM, 100 nM, 500 nM, 1 µM, 5 µM, 10 µM, and 100 µM) and vehicle-treated control, in quadruplicates. We assayed for viability through image-based analysis of nuclear counts with Hoechst 33342 (Thermo Fisher, Cat #: H3570) and MitoTracker Red (Thermo Fisher, Cat #: M22426) for mitochondrial toxicity. Details of the experimental protocols for cell culture, drug treatment, and transcriptomics have been described as step-by-step standard operating procedures for the various experiments available on www.dtoxs.org.

Transcriptomics

Cells were treated for 48 h with a single perturbagen concentration around the maximal concentration (Supplementary Table 1). After drug treatment, the cells were lysed, RNA was collected using TRIzol, and gene-expression profiles were measured using the 3′ digital gene-expression method33,34.

Sequence alignment and processing of gene-expression data

The raw sequences were demultiplexed. Combined standard RNAseq files were aligned to the reference human genome hg38 using the STAR software suite35. The resulting alignment files were parsed to identify the fragments with acceptable alignment quality, to remove duplicate fragments, and to assign accepted fragments to the corresponding genes. The resulting read-count (i.e., transcript count) table was then subjected to correlation analysis at each treatment condition, to identify and remove outlier samples, determined by predefined thresholds. The gene read-count tables were then subjected to differential gene-expression analysis using the R package EdgeR36. Details of these computational procedures are described elsewhere23, and step-by-step protocols are available on www.dtoxs.org. The resulting normalized and log-transformed fold-change gene-expression values for each sample are also deposited for public access to the DToxS data repository (www.dtoxs.org).

Processing and exploratory analysis of gene-expression data

The median log-transformed gene-expression fold-change value was calculated across all cell lines for each individual KI. The resulting matrix of gene fold-change values by KIs was used for the regression analysis. To obtain insight into the general patterns present in this KI-perturbed transcriptomics dataset, we generated rankings of the top 500 genes for each drug, by their absolute mean fold-change value, i.e., whether positive or negative. For each of these KI-associated rankings, we determined the frequency of these changes being also present in the ranking of other drugs, e.g., the similarity in genes present in the top 250 gene lists for each KI. This was visualized using the Jaccard index, and by plotting the most highly drug-connected genes against the associated drugs. Principal component analysis for the first three principal components on the absolute mean fold-change values for each drug was performed to further assess similarity between drugs in their gene-expression values.

Calculation of tissue cell line expression similarity

Pairwise expression similarity scores were computed based on the Jaccard coefficient of a binary matrix based on RNA sequencing data from PromoCell cardiomyocyte exposures to kinase inhibitors. The top 500 genes for a KI were set as 1, while genes that were not in the top 500 were set as 0.

Calculation of clinical risk RORs

Adverse-event frequencies from the FDA Adverse Event Reporting System (FAERS) were obtained from the AERSmine resource37, which contains a curated version of the FAERS database. ADRs in the FAERS database are organized according to MedDRA38, which is a hierarchical ontology to classify ADRs from high-level organs associated with the pathology to reported low-level specific pathological conditions. We downloaded the frequencies of the occurrence of ADRs for all protein KIs available in FAERS, together with all other frequencies of ADRs reported for these KIs. A time-stamped record of this download to reproduce this analysis was retained. RORs were then computed for each KI using the frequency $f_{dt}$ of the ADR of interest, the frequency $f_{dn}$ of any other ADR occurring, the frequency $f_{nt}$ of occurrence of the ADR of interest for any other protein kinase inhibitor, and the frequency $f_{nn}$ for all other ADRs and KIs. The ROR was calculated using Eq. (1): $$\mathrm{ROR} = \frac{f_{dt}/f_{dn}}{f_{nt}/f_{nn}},$$ whereas the standard error (SE) of the log ROR was calculated using Eq. (2): $$\mathrm{SE}_{\log\mathrm{ROR}} = \sqrt{\frac{1}{f_{dt}} + \frac{1}{f_{dn}} + \frac{1}{f_{nt}} + \frac{1}{f_{nn}}},$$ with the log-transformed confidence interval (CI) being calculated as follows: $\mathrm{CI} = \log(\mathrm{ROR}) \pm 1.96 \cdot \mathrm{SE}_{\log\mathrm{ROR}}$.

Adverse events in FAERS are mapped to the MEDDRA dictionary38. CT events related to heart failures and cardiomyopathies, excluding arrhythmogenic ADRs and coronary artery disorders, were selected from the main MEDDRA cardiac ADR group. The selected ADRs primarily reflected different stages of heart failure, which were grouped together.

Elastic net regression analysis

The FAERS-derived risk RORs for CT were regressed against the KI-associated vectors of mean fold-change values across the four cell lines. A two-step regression procedure was then used to select predictor genes reducing the sensitivity to changes in dataset composition. For this, we first generated 1000 bootstrap datasets with replacements for gene expression–KI risk score pairs. Each of these bootstrap datasets was fit using an elastic net regression model (R version 3.4.3, package glmnet, version 2.0-16). The genes that were selected as predictors (i.e., nonzero regression coefficient) and the scaled values of the gene-associated coefficients were saved for each bootstrap dataset. Across all bootstrap datasets, the relative frequency of the selection of gene-based predictors, and the mean-scaled coefficient value was computed. We then calculated the product of the mean frequency and scaled coefficient value, to rank predictors by their importance with respect to robustness (selection frequency). A large number of percentiles of these rankings were evaluated using leave-one-out cross-validation. The selection percentile (99.755%) resulting in optimal prediction errors (RMSE) was then used to select a subset of gene-based predictors, and the model that generated the final gene-expression signatures. The selected predictor genes were then ranked by their relative importance, and by their median fold-change values, and displayed as clustered heatmaps. We finally evaluated the predictive value of the resulting regression model to predict CT risk scores for the two left-out KIs.

When using this approach to analyze similar datasets of cardiomyocyte transcriptomes together with risk scores, it is possible that potentially different genes are identified than those described in the current report. This difference is associated with the intrinsic property of penalized regression approaches that select predictors from potentially highly correlated sets of predictor candidates. Hence, small changes in either risk scores or gene-expression datasets may affect correlation structures of the data and thereby the list of genes for a signature.

Enrichment and network analyses

Enrichment analysis was performed based on a one-tailed Fisher’s exact test using R (package stats), in order to identify enrichment of specific genes in predefined gene lists. For enrichment of pathways and biological processes, we used the KEGG database (2016), and for enrichment of protein kinases, we used the KEA database (2015). Diseases were excluded from the KEGG list of processes (e.g., diabetes, depression, and cancer), in order to only evaluate general biological processes or pathways. We used the top 250 DEGs ranked by p value for each KI to perform enrichment analysis. Subsequently, enriched-term p values were correlated with CT risk scores to identify kinases and pathways associated with CT risk.

The gene part of the signature for CT identified in the regression analysis was used as seed node to perform a protein–protein interaction network (PPI) analysis, conducted using the web application X2K39, which aims to identify associated kinases and transcription factors based on multiple PPI databases.

Calculation of chemical similarity

RDkit (www.rdkit.org)40 was used to generate chemical fingerprints and compute pairwise Tanimoto coefficients (Tc) between the 26 tested kinase inhibitors. For each pair of inhibitors, we first calculated the Tc using four chemical fingerprints, including Morgan_2 2,048-bit (ECFP4)41, Morgan_1 2,048-bit (ECFP2)41, Daylight-like42, and MACCS43. Because each of these fingerprints captures distinct chemical properties, we computed a weighted Tc average of the four fingerprints: 30% ECFP4, 30% ECFP2, 30% Daylight-like, and 10% MACCS, which exhibited the most optimal spread of the distribution of the pairwise distances. To generate the SAS maps (Fig. 5a), we plotted the pairwise-weighted Tc values with their difference in CT scores (DCS). Finally, 0.35 was set as the threshold for chemical similarity, while half of the maximum difference was set as the threshold for DCS. Chemical structures were drawn using Marvin (www.chemaxon.com)44 based on SMILES strings obtained from PubChem.

Calculation of KI-binding target similarity

Kinome-wide kinase inhibitor-binding (Kd) profiling data were obtained from Klaeger et al.5, which consisted of kinome-binding (Kd) profiling data for all of the tested kinase inhibitors across 242 kinases. A heatmap was generated for selected kinase inhibitors based on the negative log of the Kd values from Klaeger et al. (Fig. 5c)5. Notably, the Kd values were scaled by 100,000 to avoid negative log values.

Overview of KIs included in this analysis.

DrugThree-letter codeApproval yearaTherapeutic targetsConcentration (µM)b
AfatinibAFA2013ErbB2 and EGFR0.05
AxitinibAXI2012VEGFR1/VEGFR2/VEGFR3/PDGFRB/c-KIT0.2
BosutinibBOS2012Bcr-Abl and SRC0.1
CabozantinibCAB2012c-Met and VEGFR22
CeritinibCER2014ALK1
CrizotinibCRI2011ALK and HGFR0.25
DabrafenibDAB2013BRAF2.5
DasatinibDAS2006ABL, ARG, KIT, PDGFRα/β, and SRC0.1
ErlotinibERL2004ErbB13
GefitinibGEF2003ErbB11
ImatinibIMA2001Bcr-Abl5
LapatinibLAP2007ErbB12
NilotinibNIL2007Bcr-Abl3
PazopanibPAZ2009VEGFR2, PDGFRα/β, and KIT10
PonatinibPON2012Bcr-Abl, VEGFR, PDGFR, FGFR, EPH, SRC, c-KIT, RET, TIE2, and FLT30.1
RegorafenibREG2012RET, VEGFR, and PDGFR1
RuxolitinibRUX2011JAK1
SorafenibSOR2005BRAF, VEGFRs, PDGFRα/β, FLT3, and KIT0.5
SunitinibSUN2006VEGFR, PDGFR, CSF1R, FLT3, and KIT1
TrametinibTRA2013MEK1 and MEK20.1
TofacitinibTOF2012JAK1
VandetanibVAN2011RET, VEGFR, and EGFR0.33
VemurafenibVEM2011BRAF2

aUS approval date, first indication.

bDerived from maximum total (bound  +  free) plasma concentrations in humans as reported in the literature.

Table S3 lists the purity and literature references to clinical concentrations.

Reporting summary

Further information on research design is available in the Nature Research Reporting Summary linked to this article.

Supplementary information

+

Supplementary Information

+

Peer Review File

+

Reporting Summary

+

Source data

+

Source Data

+

Peer review information +Nature Communications thanks the anonymous reviewers for their contribution to the peer review of this work. Peer reviewer reports are available.

Publisher’s note Springer Nature remains neutral with regard to jurisdictional claims in published maps and institutional affiliations.

These authors contributed equally: J. G. Coen van Hasselt, Rayees Rahman.

These authors jointly supervised this work: Avner Schlessinger, Evren U. Azeloglu, Ravi Iyengar.

Supplementary information

Supplementary information is available for this paper at 10.1038/s41467-020-18396-7.

Acknowledgements

This project was supported in part by the NIH LINCS center grant (U54 HG008098) and the Systems Biology Center grant (P50 GM071558). J.G.C.H. received funding from the European Union MSCA program (Project ID 661588). This work was partially carried out using the Dutch national e-infrastructure with the support of SURF Foundation.

Author contributions

J.G.C.H. and R.R. performed the data analysis; J.G.C.H., R.R., J.H., M.R.B., E.S., A. Sc., E.U.A., and R.I. wrote the paper; Y.X. performed RNAseq data processing; A.P. and J.M.G. performed the mass spectrometry drug purity analyses; A.St., B.H., G.J., and J.V.S. performed the cell culture, drug perturbation, and the RNA isolation; E.U.A. supervised the experimental efforts; E.U.A. and J.M.G. determined the experimental drug concentrations and purity; M.M. supervised the RNA sequencing; J.G. supervised the quality assurance and assay reproducibility; A.Sc. supervised the cheminformatics analysis; R.I. conceived the project; all authors reviewed the paper.

Data availability

All processed RNAseq data and the curated version-controlled standard operating procedures featured in this study can be downloaded freely at (www.dtoxs.org)22 or the LINCS Data Portal (http://lincsportal.ccs.miami.edu/dcic-portal/). Raw transcriptomics data can be accessed through the Gene Expression Omnibus (GEO) repository with accession numbers GSE146096 and GSE146097. Source data for each figure are provided with this paper. All remaining data will be available from the corresponding author upon reasonable request. Source data are provided with this paper.

Code availability

All scripts are open-source and available from the DToxS GitHub repository (https://github.com/dtoxs).

Competing interests

R.R. and A.S. are co-founders of Aichemy Inc. The remaining authors declare no competing interests.

ReferencesCohenPThe role of protein phosphorylation in human health and disease: delivered on June 30th 2001 at the FEBS meeting in LisbonEur. J. Biochem.20012685001501010.1046/j.0014-2956.2001.02473.x11589691GiamasGKinases as targets in the treatment of solid tumorsCell. Signal.201022984100210.1016/j.cellsig.2010.01.01120096351KnappSSundströmMRecently targeted kinases and their inhibitors-the path to clinical trialsCurr. Opin. Pharmacol.201417C586310.1016/j.coph.2014.07.015FabbroDCowan-JacobSWMöbitzHMartiny-BaronGTargeting cancer with small-molecular-weight kinase inhibitorsMethods Mol. Biol.201279513410.1007/978-1-61779-337-0_121960212KlaegerSThe target landscape of clinical kinase drugsScience2017358eaan436810.1126/science.aan436829191878Roskoski, R. Properties of FDA-approved small molecule protein kinase inhibitors. Pharmacol. Res.10.1016/j.phrs.2019.03.006 (2019).ForceTKolajaKLCardiotoxicity of kinase inhibitors: the prediction and translation of preclinical models to clinical outcomesNat. Rev. Drug Discov.2011101112610.1038/nrd325221283106ChuTFCardiotoxicity associated with tyrosine kinase inhibitor sunitinibLancet20073702011201910.1016/S0140-6736(07)61865-018083403OrphanosGSIoannidisGNArdavanisAGCardiotoxicity induced by tyrosine kinase inhibitorsActa Oncol.20094896497010.1080/0284186090322912419734999MoslehiJJCardiovascular toxic effects of targeted cancer therapiesN. Engl. J. Med.20163751457146710.1056/NEJMra110026527732808ForceTKerkeläRCardiotoxicity of the new cancer therapeutics—mechanisms of, and approaches to, the problemDrug Discov. Today2008137788410.1016/j.drudis.2008.05.01118617014DavisMIComprehensive analysis of kinase inhibitor selectivityNat. Biotechnol.20112910465110.1038/nbt.199022037378ElkinsJMComprehensive characterization of the Published Kinase Inhibitor SetNat. 
Biotechnol.2016349510310.1038/nbt.337426501955HasinoffBBPatelDThe lack of target specificity of small molecule anticancer kinase inhibitors is correlated with their ability to damage myocytes in vitroToxicol. Appl. Pharmacol.201024913213910.1016/j.taap.2010.08.02620832415WillYEffect of the multitargeted tyrosine kinase inhibitors imatinib, dasatinib, sunitinib, and sorafenib on mitochondrial function in isolated rat heart mitochondria and H9c2 cellsToxicol. Sci.200810615316110.1093/toxsci/kfn15718664550KerkeläRCardiotoxicity of the cancer therapeutic agent imatinib mesylateNat. Med.20061290891610.1038/nm144616862153DohertyKRMulti-parameter in vitro toxicity testing of crizotinib, sunitinib, erlotinib, and nilotinib in human cardiomyocytesToxicol. Appl. Pharmacol.20132722455510.1016/j.taap.2013.04.02723707608ForceTKrauseDSVan EttenRAMolecular mechanisms of cardiotoxicity of tyrosine kinase inhibitionNat. Rev. Cancer2007733234410.1038/nrc210617457301BaiJPFAbernethyDRSystems pharmacology to predict drug toxicity: integration across levels of biological organizationAnnu. Rev. Pharmacol. Toxicol.2013534517310.1146/annurev-pharmtox-011112-14024823140241BergerSIIyengarRRole of systems pharmacology in understanding drug adverse eventsWiley Interdiscip. Rev.20113129135BergerSIMa’ayanAIyengarRSystems pharmacology of arrhythmiasSci. Signal.20103ra3020407125ZhaoSSystems pharmacology of adverse event mitigation by drug combinationsSci. Transl. Med.20135206ra14010.1126/scitranslmed.3006548XiongYA comparison of mRNA sequencing with random primed and 3′-directed librariesSci. Rep.201771462610.1038/s41598-017-14892-x29116112LonsdaleJThe genotype-tissue expression (GTEx) projectNat. 
Genet.20134558058510.1038/ng.265323715323ZouHHastieTRegularization and variable selection via the elastic net.Journal of the Royal Statistical Society20056730132010.1111/j.1467-9868.2005.00503.xGiulianottiMAWelmakerGSHoughtenRAShifting from the single to the multitarget paradigm in drug discoveryDrug Discov. Today20131849550110.1016/j.drudis.2013.01.00823340113UngP. M.-U.RahmanRSchlessingerARedefining the protein kinase conformational space with machine learningCell Chem. Biol.201825916924.e210.1016/j.chembiol.2018.05.00229861272RahmanRUngPM-USchlessingerAKinaMetrix: a web resource to investigate kinase conformations and inhibitor spaceNucleic Acids Res.201947D361D36610.1093/nar/gky91630321373DarACShokatKMThe evolution of protein kinase inhibitors from antagonists to agonists of cellular signalingAnnu. Rev. Biochem.20118076979510.1146/annurev-biochem-090308-17365621548788ZhangTHatcherJMTengMGrayNSKosticMRecent advances in selective and irreversible covalent ligand development and validationCell Chem. Biol.2019261486150010.1016/j.chembiol.2019.09.01231631011SchnellDPharmacokinetics of afatinib in subjects with mild or moderate hepatic impairmentCancer Chemother. Pharm.20147426727510.1007/s00280-014-2484-yBurridgePWHuman induced pluripotent stem cell-derived cardiomyocytes recapitulate the predilection of breast cancer patients to doxorubicin-induced cardiotoxicityNat. Med.2016225475610.1038/nm.408727089514Soumillon, M., Cacchiarelli, D., Semrau, S., van Oudenaarden, A. & Mikkelsen, T. S. Characterization of directed differentiation by high-throughput single-cell RNA-Seq. Preprint at https://www.biorxiv.org/content/10.1101/003236v1 (2014).KiviojaTCounting absolute numbers of molecules using unique molecular identifiersNat. 
Methods20119727410.1038/nmeth.177822101854DobinASTAR: ultrafast universal RNA-seq alignerBioinformatics201329152110.1093/bioinformatics/bts63523104886RobinsonMDMcCarthyDJSmythGKedgeR: a Bioconductor package for differential expression analysis of digital gene expression dataBioinformatics2010261394010.1093/bioinformatics/btp61619910308SarangdharMData mining differential clinical outcomes associated with drug regimens using adverse event reporting dataNat. Biotechnol.20163469770010.1038/nbt.362327404875BrownEGWoodLWoodSThe Medical Dictionary for Regulatory Activities (MedDRA)Drug Saf.19992010911710.2165/00002018-199920020-0000210082069ClarkeDJBEXpression2Kinases (X2K) Web: linking expression signatures to upstream cell signaling networksNucleic Acids Res.201846W171W17910.1093/nar/gky45829800326RDKit. http://www.rdkit.org/.RogersDHahnMExtended-connectivity fingerprintsJ. Chem. Inf. Model.20105074275410.1021/ci100050t20426451Daylight. https://www.daylight.com/.DurantJLLelandBAHenryDRNourseJGReoptimization of MDL keys for use in drug discoveryJ. Chem. Inf. Comput. Sci.2002421273128010.1021/ci010132r12444722ChemAxon - Software Solutions and Services for Chemistry & Biology. https://chemaxon.com/.
diff --git a/jcore-pmc-reader/LICENSE b/jcore-pmc-reader/LICENSE index fbbd41e05..d0f946a29 100644 --- a/jcore-pmc-reader/LICENSE +++ b/jcore-pmc-reader/LICENSE @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2017, JULIE Lab +Copyright (c) 2022, JULIE Lab All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/jcore-pmc-reader/README.md b/jcore-pmc-reader/README.md index 4fb82a46a..f42e43e76 100644 --- a/jcore-pmc-reader/README.md +++ b/jcore-pmc-reader/README.md @@ -102,7 +102,9 @@ The following properties are currently supported: | paths | list of objects | Allows to specify a relative or absolute XPath like sequence of element names in the form `abstract/sec/title` and properties that should be applied to elements matching this path. | | type | string | The UIMA type that should be used to annotate the text contents of the element | -The `attribute` and `path` properties define criteria where the base properties are overwritten by the properties specified in association with the given attribute-value combination or path. For example, it is possible to include a certain element for document text but omit it if has a specific element as parent or some attribute value. +The `attribute` and `path` properties define criteria where the base properties are overwritten by the properties specified in association with the given attribute-value combination or path. Attributes are addressed by specifying `name` and `value` keys. The `name` is the name of the attribute to test and `value` is the value the attribute must have for the property override to take effect. Paths require the `path` key followed by a slash-separated sequence of element names that ends with the name of the XML element for which the rule should hold. The path does not need to start from the root, it should just be long enough to identify the element distinctively. 
+ +For example, it is possible to include a certain element for document text but omit it if it has a specific element as parent or some attribute value. Here is an example taken directly from the `elementproperties.yml` file: ```yml diff --git a/jcore-pmc-reader/component.meta b/jcore-pmc-reader/component.meta index b71a1930d..9f322da43 100644 --- a/jcore-pmc-reader/component.meta +++ b/jcore-pmc-reader/component.meta @@ -23,7 +23,7 @@ "maven-artifact": { "artifactId": "jcore-pmc-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe PubMed Central Reader" } diff --git a/jcore-pmc-reader/pom.xml b/jcore-pmc-reader/pom.xml index 0d6a0dfa9..a8369ce37 100644 --- a/jcore-pmc-reader/pom.xml +++ b/jcore-pmc-reader/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -154,8 +154,8 @@ jcore-descriptor-creator - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java index b723f6215..e4b80fac7 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java @@ -1,8 +1,11 @@ package de.julielab.jcore.multiplier.pmc; import de.julielab.jcore.reader.pmc.CasPopulator; +import de.julielab.jcore.reader.pmc.NoDataAvailableException; +import de.julielab.jcore.reader.pmc.PMCMultiplierReader; import de.julielab.jcore.reader.pmc.parser.ElementParsingException; import de.julielab.jcore.types.casmultiplier.JCoReURI; +import de.julielab.jcore.types.casmultiplier.MultiplierConfigParameters; import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.AbstractCas; @@ -11,6 +14,7 @@ import 
org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,13 +24,13 @@ import java.util.Iterator; @ResourceMetaData(name = "JCoRe Pubmed Central NXML Multiplier", description = "This multiplier expect to receive URIs to NXML documents in the form of JCoReURI feature structures. All JCoReURI FS in the annotation indexes are read and output as new CASes.") -@OperationalProperties(outputsNewCases = true, multipleDeploymentAllowed = true, modifiesCas = false) +@OperationalProperties(outputsNewCases = true, modifiesCas = false) @TypeCapability(outputs = {"de.julielab.jcore.types.TitleType", "de.julielab.jcore.types.Title", "de.julielab.jcore.types.TextObject", "de.julielab.jcore.types.Table", "de.julielab.jcore.types.SectionTitle", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.PubType", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.OtherPub", "de.julielab.jcore.types.pubmed.OtherID", "de.julielab.jcore.types.pubmed.ManualDescriptor", "de.julielab.jcore.types.Keyword", "de.julielab.jcore.types.Journal", "de.julielab.jcore.types.pubmed.Header", "de.julielab.jcore.types.Footnote", "de.julielab.jcore.types.Figure", "uima.tcas.DocumentAnnotation", "de.julielab.jcore.types.Date", "de.julielab.jcore.types.CaptionType", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.AutoDescriptor", "de.julielab.jcore.types.AuthorInfo", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection"}) public class PMCMultiplier extends JCasMultiplier_ImplBase { private final static Logger log = LoggerFactory.getLogger(PMCMultiplier.class); private Iterator currentUriBatch; private CasPopulator casPopulator; - + private Boolean omitBibReferences = null; @Override public void process(JCas aJCas) throws 
AnalysisEngineProcessException { @@ -34,14 +38,36 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { if (log.isDebugEnabled()) log.debug("Received batch of {} NXML URIs", jcoreUris.size()); currentUriBatch = jcoreUris.stream().map(JCoReURI::getUri).map(URI::create).iterator(); + determineOmitBibReferences(aJCas); try { - casPopulator = new CasPopulator(currentUriBatch); + casPopulator = new CasPopulator(currentUriBatch, omitBibReferences); } catch (IOException e) { - log.error("Exception occurred when trying to inizialize the NXML parser", e); + log.error("Exception occurred when trying to initialize the NXML parser", e); throw new AnalysisEngineProcessException(e); } } + private void determineOmitBibReferences(JCas aJCas) throws AnalysisEngineProcessException { + try { + MultiplierConfigParameters multiplierConfigParameters = JCasUtil.selectSingle(aJCas, MultiplierConfigParameters.class); + StringArray parameters = multiplierConfigParameters.getParameters(); + for (int i = 0; i < parameters.size(); ++i) { + String[] paramPair = parameters.get(i).split("\\s*=\\s*"); + if (paramPair.length != 2) { + String msg = "Error while parsing multiplier configuration parameters passed from the multiplier reader. The parameter array contains the entry \"" + parameters.get(i) + "\". 
The expected format is <name>=<value>."; + log.error(msg); + throw new AnalysisEngineProcessException(new IllegalArgumentException(msg)); + } + if (paramPair[0].equals(PMCMultiplierReader.PARAM_OMIT_BIB_REFERENCES)) { + omitBibReferences = Boolean.parseBoolean(paramPair[1]); + } + } + } catch (IllegalArgumentException e) { + omitBibReferences = false; + // nothing further; there were no parameters given + } + } + + @Override public boolean hasNext() { @@ -60,6 +86,8 @@ public AbstractCas next() throws AnalysisEngineProcessException { return cas; } catch (ElementParsingException e) { log.error("Exception occurred why trying to parse {}", next, e); + } catch (NoDataAvailableException e) { + log.error("Could not populate the CAS due to preceding error. Returning null."); + } } return null; diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java index 481e4db4c..5841694e8 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java @@ -1,27 +1,49 @@ package de.julielab.jcore.reader.pmc; import de.julielab.jcore.reader.pmc.parser.*; +import de.julielab.jcore.types.Header; +import org.apache.commons.lang3.StringUtils; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.io.InputStream; import java.net.URI; +import java.util.ArrayList; import java.util.Iterator; +import java.util.List; public class CasPopulator { private final static Logger log = LoggerFactory.getLogger(CasPopulator.class); private NxmlDocumentParser nxmlDocumentParser; private Iterator nxmlIterator; + private int truncationSize; + private static final String LINESEP =System.getProperty("line.separator"); - public CasPopulator(Iterator nxmlIterator) throws IOException { + + 
public CasPopulator(Iterator nxmlIterator, Boolean omitBibReferences, int truncationSize) throws IOException { this.nxmlIterator = nxmlIterator; + this.truncationSize = truncationSize; nxmlDocumentParser = new NxmlDocumentParser(); - nxmlDocumentParser.loadElementPropertyFile("/de/julielab/jcore/reader/pmc/resources/elementproperties.yml"); + String settings = omitBibReferences ? "/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml" : "/de/julielab/jcore/reader/pmc/resources/elementproperties.yml"; + nxmlDocumentParser.loadElementPropertyFile(settings); + } + + public CasPopulator(Boolean omitBibReferences, int truncationSize) throws IOException { + this(null, omitBibReferences, truncationSize); + } + + public CasPopulator(Boolean omitBibReferences) throws IOException { + this(null, omitBibReferences, Integer.MAX_VALUE); } - public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException { + public CasPopulator(Iterator pmcFiles, boolean omitBibReferences) throws IOException { + this(pmcFiles, omitBibReferences, Integer.MAX_VALUE); + } + + public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException, NoDataAvailableException { ElementParsingResult result = null; URI currentUri = nxmlUri; while (currentUri != null && result == null) { @@ -29,13 +51,60 @@ public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException { nxmlDocumentParser.reset(currentUri, cas); result = nxmlDocumentParser.parse(); } catch (DocumentParsingException e) { - log.warn("Error occurred: {}. Skipping document.", e.getMessage()); - if (nxmlIterator.hasNext()) + log.warn("Error occurred when trying to read from URI {} (ASCII string: {}): {}. Skipping document.", currentUri, currentUri.toASCIIString(), e.getMessage()); + if (nxmlIterator.hasNext()) { currentUri = nxmlIterator.next(); + } else { + String msg = "Cannot just skip the errored document because there is no next document currently available. 
Returning without adding any data to the CAS."; + log.warn(msg); + throw new NoDataAvailableException(msg); + } } } StringBuilder sb = populateCas(result, new StringBuilder()); - cas.setDocumentText(sb.toString()); + truncateTextAndAnnotations(sb.toString(), cas); + } + + private void truncateTextAndAnnotations(String documentText, JCas cas) { + String text = documentText.length() > truncationSize ? documentText.substring(0, truncationSize) : documentText; + cas.setDocumentText(text); + // if truncation happened, we need to remove annotations exceeding the valid text span + List toRemove = new ArrayList<>(); + if (text.length() < documentText.length()) { + for (Annotation a : cas.getAnnotationIndex()) { + if (a.getEnd() > text.length()) { + if (a instanceof Header) { + // We don't want to remove the header. It is not really a text-anchored annotation anyway, + // just shrink its span. + a.removeFromIndexes(); + if (a.getBegin() > text.length()) + a.setBegin(0); + a.setEnd(text.length()); + a.addToIndexes(); + } else { + toRemove.add(a); + } + } + } + } + toRemove.forEach(Annotation::removeFromIndexes); + } + + private String truncateText(String documentText) { + // Truncate the document text to the given length + return documentText.length() > truncationSize ? 
documentText.substring(0, truncationSize) : documentText; + } + + public void populateCas(InputStream is, JCas cas) throws ElementParsingException, NoDataAvailableException { + ElementParsingResult result; + try { + nxmlDocumentParser.reset(is, cas); + result = nxmlDocumentParser.parse(); + } catch (DocumentParsingException e) { + throw new NoDataAvailableException(e); + } + String documentText = populateCas(result, new StringBuilder()).toString(); + truncateTextAndAnnotations(documentText, cas); } /** @@ -87,7 +156,16 @@ private StringBuilder populateCas(ParsingResult result, StringBuilder sb) { break; case TEXT: TextParsingResult textParsingResult = (TextParsingResult) result; - sb.append(textParsingResult.getText()); + final String text = textParsingResult.getText(); + // some special handling for documents that contain formatting tabs, newlines or no-break-spaces in the text + boolean textBeginsWithWhitespace = text.isEmpty() ? false : Character.isWhitespace(text.charAt(0)) && !text.startsWith(LINESEP); + boolean textEndsWithWhitespace = text.isEmpty() ? false : Character.isWhitespace(text.charAt(text.length() - 1)) && !text.endsWith(LINESEP); + boolean sbEndsWithWhitespace = sb.length() == 0 ? 
false : Character.isWhitespace(sb.charAt(sb.length() - 1)); + if (textBeginsWithWhitespace && !sbEndsWithWhitespace) + sb.append(" "); + sb.append(StringUtils.normalizeSpace(text)); + if (textEndsWithWhitespace) + sb.append(" "); break; case NONE: // do nothing diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java index 02b5d7feb..1a4010576 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java @@ -9,15 +9,24 @@ import java.net.URI; import java.net.URISyntaxException; import java.net.URL; +import java.net.URLEncoder; import java.nio.file.Path; import java.util.*; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Searches over directories and, optionally, the contents of ZIP archives for files with an (n)xml extension. + * Returns URIs that either point to single files or to entries into ZIP archives. Both can equally be accessed via + * "uri.toURL().openStream()" which is done in the NxmlDocumentParser. + */ public class NXMLURIIterator implements Iterator { private final static Logger log = LoggerFactory.getLogger(NXMLURIIterator.class); private final static Logger logFileSearch = LoggerFactory.getLogger(NXMLURIIterator.class.getCanonicalName() + ".FileSearch"); @@ -44,7 +53,7 @@ public boolean hasNext() { // The beginning: The currentDirectory is null and we start at // the given path (which actually might be a single file to // read). 
- log.debug("Starting background thread to search for PMC (.nxml) files at {}", basePath); + log.debug("Starting background thread to search for PMC (.xml) files at {}", basePath); CompletableFuture.runAsync(() -> setFilesAndSubDirectories(basePath, false)); fileSearchRunning = true; } @@ -74,7 +83,7 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { if ((searchRecursively || directory.equals(basePath)) && !isZipFile(directory)) { logFileSearch.debug("Identified {} as a directory, reading files and subdirectories", directory); // set the files in the directory - for (File file : directory.listFiles(f -> f.isFile() && f.getName().contains(".nxml") && !isZipFile(f) && isInWhitelist(f))) { + for (File file : directory.listFiles(f -> f.isFile() && (f.getName().contains(".xml") || f.getName().contains(".nxml")) && !isZipFile(f) && isInWhitelist(f))) { URI toURI = file.toURI(); try { uris.put(toURI); @@ -83,35 +92,45 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { throw new UncheckedPmcReaderException(e); } } + // Save the subdirectories and potentially ZIP files for a recursive reading call further below Stream.of(directory.listFiles(f -> f.isDirectory())).forEach(pendingSubdirs::push); if (searchZip) Stream.of(directory.listFiles(f -> f.isFile() && isZipFile(f))).forEach(pendingSubdirs::push); + logFileSearch.trace("Added subdirectories and/or ZIP files to the list of pending directories and archives. 
There are now {} pending.", pendingSubdirs.size()); } else if (searchZip && isZipFile(directory)) { logFileSearch.debug("Identified {} as a ZIP archive, retrieving its inventory", directory); logFileSearch.debug("Searching ZIP archive {} for eligible documents", directory); try (ZipFile zf = new ZipFile(directory)) { final Enumeration entries = zf.entries(); + int numEntries = 0; while (entries.hasMoreElements()) { final ZipEntry e = entries.nextElement(); - if (!e.isDirectory() && e.getName().contains(".nxml") && isInWhitelist(new File(e.getName()))) { - final String urlStr = "jar:" + directory.toURI().toString() + "!/" + e.getName(); - URL url = new URL(urlStr); + if (!e.isDirectory() && (e.getName().contains(".xml") || e.getName().contains(".nxml")) && isInWhitelist(new File(e.getName()))) { + final String urlStr = "jar:" + directory.toURI() + "!/" + e.getName(); + int exclamationIndex = urlStr.indexOf('!'); + final String urlEncodedStr = urlStr.substring(0, exclamationIndex + 2) + Stream.of(urlStr.substring(exclamationIndex + 2).split("/")).map(x -> URLEncoder.encode(x, UTF_8)).collect(Collectors.joining("/")); + URL url = new URL(urlEncodedStr); try { final URI uri = url.toURI(); logFileSearch.trace("Waiting to put URI {} into queue", uri); uris.put(uri); - logFileSearch.trace("Successfully put URI {} into queue", uri); + ++numEntries; + logFileSearch.trace("Successfully put URI {} into queue. Queue size: {}", uri, uris.size()); } catch (InterruptedException e1) { logFileSearch.error("Putting URI for URL {} into the queue was interrupted", url); throw new UncheckedPmcReaderException(e1); } catch (URISyntaxException e1) { logFileSearch.error("Could not convert URL {} to URI.", url, e); + throw new UncheckedPmcReaderException(e1); } } } + logFileSearch.trace("Finished retrieving files from ZIP archive {}. 
{} eligible documents were read.", directory, numEntries); } catch (IOException e) { logFileSearch.error("Could not read from {}", directory); throw new UncheckedPmcReaderException(e); + } catch (Throwable t) { + logFileSearch.error("Unexpected error:", t); } } else { logFileSearch.debug("Recursive search is deactivated, skipping subdirectory {}", directory); @@ -164,7 +183,7 @@ private boolean isInWhitelist(File file) { private boolean isInWhitelist(String name) { boolean inWhitelist = whitelist.contains(name) || (whitelist.size() == 1 && whitelist.contains("all")); if (!inWhitelist) - log.trace("Skipping document with name/id {} because it is not contained in the white list.", name); + logFileSearch.trace("Skipping document with name/id {} because it is not contained in the white list.", name); return inWhitelist; } diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NoDataAvailableException.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NoDataAvailableException.java new file mode 100644 index 000000000..41a611d26 --- /dev/null +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NoDataAvailableException.java @@ -0,0 +1,23 @@ +package de.julielab.jcore.reader.pmc; + +public class NoDataAvailableException extends Exception { + + public NoDataAvailableException() { + } + + public NoDataAvailableException(String message) { + super(message); + } + + public NoDataAvailableException(String message, Throwable cause) { + super(message, cause); + } + + public NoDataAvailableException(Throwable cause) { + super(cause); + } + + public NoDataAvailableException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCMultiplierReader.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCMultiplierReader.java index 
5527a249c..4c349098c 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCMultiplierReader.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCMultiplierReader.java @@ -1,6 +1,7 @@ package de.julielab.jcore.reader.pmc; import de.julielab.jcore.types.casmultiplier.JCoReURI; +import de.julielab.jcore.types.casmultiplier.MultiplierConfigParameters; import org.apache.uima.UimaContext; import org.apache.uima.collection.CollectionException; import org.apache.uima.ducc.Workitem; @@ -8,6 +9,7 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,6 +26,7 @@ public class PMCMultiplierReader extends PMCReaderBase { public static final String PARAM_WHITELIST = PMCReaderBase.PARAM_WHITELIST; public static final String PARAM_SEND_CAS_TO_LAST = "SendCasToLast"; public static final String PARAM_BATCH_SIZE = "BatchSize"; + public static final String PARAM_OMIT_BIB_REFERENCES = PMCReaderBase.PARAM_OMIT_BIB_REFERENCES; private final static Logger log = LoggerFactory.getLogger(PMCMultiplierReader.class); @ConfigurationParameter(name = PARAM_SEND_CAS_TO_LAST, mandatory = false, defaultValue = "false", description = "UIMA DUCC relevant parameter when using a CAS multiplier. When set to true, the worker CAS from the collection reader is forwarded to the last component in the pipeline. This can be used to send information about the progress to the CAS consumer in order to have it perform batch operations. For this purpose, a feature structure of type WorkItem from the DUCC library is added to the worker CAS. 
This feature structure has information about the current progress.") private boolean sendCasToLast; @@ -51,9 +54,16 @@ public void getNext(JCas jCas) throws CollectionException { log.error("Exception with URI: " + uri.toString(), e); throw new CollectionException(e); } - completed++; } + // Send configuration parameters to the multiplier if necessary + if (omitBibReferences) { + MultiplierConfigParameters parameters = new MultiplierConfigParameters(jCas); + StringArray paramArray = new StringArray(jCas, 1); + paramArray.set(0, PMCReaderBase.PARAM_OMIT_BIB_REFERENCES+"="+omitBibReferences); + parameters.setParameters(paramArray); + parameters.addToIndexes(); + } if (sendCasToLast) { Workitem workitem = new Workitem(jCas); // Send the work item CAS also to the consumer. Normally, only the CASes emitted by the CAS multiplier diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java index d58f3f939..86a5fac26 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java @@ -33,6 +33,7 @@ public class PMCReader extends PMCReaderBase { public static final String PARAM_SEARCH_ZIP = PMCReaderBase.PARAM_SEARCH_ZIP; public static final String PARAM_WHITELIST = PMCReaderBase.PARAM_WHITELIST; public static final String PARAM_EXTRACT_ID_FROM_FILENAME = PMCReaderBase.PARAM_EXTRACT_ID_FROM_FILENAME; + public static final String PARAM_OMIT_BIB_REFERENCES = PMCReaderBase.PARAM_OMIT_BIB_REFERENCES; private static final Logger log = LoggerFactory.getLogger(PMCReader.class); private CasPopulator casPopulator; @@ -40,7 +41,7 @@ public class PMCReader extends PMCReaderBase { public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); try { - casPopulator = new CasPopulator(pmcFiles); + casPopulator = new 
CasPopulator(pmcFiles, omitBibReferences); } catch (IOException e) { log.error("Exception occurred when trying to initialize NXML parser", e); throw new ResourceInitializationException(e); @@ -54,10 +55,12 @@ public void getNext(JCas cas) throws CollectionException { next = pmcFiles.next(); casPopulator.populateCas(next, cas); if (extractIdFromFilename) - ((Header)cas.getAnnotationIndex(Header.type).iterator().next()).setDocId(getIdFromFilename(next)); + ((Header) cas.getAnnotationIndex(Header.type).iterator().next()).setDocId(getIdFromFilename(next)); } catch (ElementParsingException e) { log.error("Exception occurred when trying to parse {}", next, e); throw new CollectionException(e); + } catch (NoDataAvailableException e) { + log.error("Could not populate CAS due to preceding error."); } completed++; } diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReaderBase.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReaderBase.java index a9fdd3890..73e16a0a0 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReaderBase.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReaderBase.java @@ -28,6 +28,7 @@ public abstract class PMCReaderBase extends JCasCollectionReader_ImplBase { public static final String PARAM_SEARCH_ZIP = "SearchInZipFiles"; public static final String PARAM_WHITELIST = "WhitelistFile"; public static final String PARAM_EXTRACT_ID_FROM_FILENAME = "ExtractIdFromFilename"; + public static final String PARAM_OMIT_BIB_REFERENCES = "OmitBibliographyReferences"; private final static Logger log = LoggerFactory.getLogger(PMCReaderBase.class); @ConfigurationParameter(name = PARAM_INPUT, description = "The path to an NXML file or a directory with NXML files and possibly subdirectories holding more NXML files.") protected File input; @@ -44,6 +45,9 @@ public abstract class PMCReaderBase extends JCasCollectionReader_ImplBase { @ConfigurationParameter(name = 
PARAM_EXTRACT_ID_FROM_FILENAME, mandatory = false, description = "Used for NXML documents that carry their ID in the file name but not in the document itself. Extracts the string after the last path separator and the first dot after the separator and sets it to the docId feature of the Header annotation.") protected boolean extractIdFromFilename; + @ConfigurationParameter(name = PARAM_OMIT_BIB_REFERENCES, mandatory = false, defaultValue = "false", description = "If set to true, references to the bibliography are omitted from the CAS text.") + protected boolean omitBibReferences; + protected Iterator pmcFiles; protected int completed; @@ -60,6 +64,7 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti searchRecursively = Optional.ofNullable((Boolean) getConfigParameterValue(PARAM_RECURSIVELY)).orElse(false); searchZip = Optional.ofNullable((Boolean) getConfigParameterValue(PARAM_SEARCH_ZIP)).orElse(false); whitelistFile = Optional.ofNullable((String) getConfigParameterValue(PARAM_WHITELIST)).map(File::new).orElse(null); + omitBibReferences = Optional.ofNullable((Boolean) getConfigParameterValue(PARAM_OMIT_BIB_REFERENCES)).orElse(false); log.info("Reading PubmedCentral NXML file(s) from {}", input); try { Set whitelist = readWhitelist(whitelistFile); diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/DefaultElementParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/DefaultElementParser.java index ac2f3cd23..42e1dc5a6 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/DefaultElementParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/DefaultElementParser.java @@ -54,7 +54,7 @@ public DefaultElementParser(NxmlDocumentParser nxmlDocumentParser) { @Override protected void beforeParseElement() throws ElementParsingException { - // since this parser does not know the element is is used upon, set + // since this parser does 
not know the element it is used upon, set // it first for the parsing result creation try { elementName = vn.toString(vn.getCurrentIndex()); @@ -138,12 +138,6 @@ protected void editResult(ElementParsingResult result) throws NavException { if (typeName.equals(ElementProperties.TYPE_NONE)) return; - // @SuppressWarnings("unchecked") - // Map defaultFeatureValues = (Map) - // nxmlDocumentParser - // .getTagProperties(elementName) - // .getOrDefault(ElementProperties.DEFAULT_FEATURE_VALUES, - // Collections.emptyMap()); @SuppressWarnings("unchecked") Map defaultFeatureValues = (Map) getApplicableProperties() .orElse(Collections.emptyMap()) @@ -276,8 +270,6 @@ private Optional> getApplicableProperties() throws NavExcept String attributeValue = attributesOfElement.get(attribute.get(ElementProperties.NAME)); if (attributeValue != null && attributeValue.equals(attribute.get(ElementProperties.VALUE)) && attribute.containsKey(ElementProperties.OMIT_ELEMENT)) { - // omitElement = (boolean) - // attribute.get(ElementProperties.OMIT_ELEMENT); applicableProperties = Optional.of(attribute); } } diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FigParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FigParser.java index 9149d8af9..428903fbb 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FigParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FigParser.java @@ -30,7 +30,7 @@ public FigParser(NxmlDocumentParser nxmlDocumentParser) { @Override protected void parseElement(ElementParsingResult figResult) throws ElementParsingException { try { - Optional tableWrapId = getXPathValue("@id"); + Optional figureId = getXPathValue("@id"); Optional labelResult = parseXPath("label"); Optional labelString = getXPathValue("label"); Optional captionResult = parseXPath("caption"); @@ -38,7 +38,7 @@ protected void parseElement(ElementParsingResult figResult) throws ElementParsin 
captionResult.ifPresent(r -> { ElementParsingResult result = (ElementParsingResult) r; Caption caption = (Caption) result.getAnnotation(); - caption.setCaptionType("table"); + caption.setCaptionType("figure"); figResult.addSubResult(r); }); labelResult.ifPresent(figResult::addSubResult); @@ -52,7 +52,7 @@ protected void parseElement(ElementParsingResult figResult) throws ElementParsin labelString.ifPresent(figure::setObjectLabel); captionResult.map(r -> (Caption) ((ElementParsingResult) r).getAnnotation()) .ifPresent(figure::setObjectCaption); - tableWrapId.ifPresent(figure::setObjectId); + figureId.ifPresent(figure::setObjectId); figResult.setAnnotation(figure); } catch (NavException | XPathParseException | XPathEvalException e) { diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java index 4823fed54..19a848902 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java @@ -1,11 +1,10 @@ -/** - * +/** * Copyright (c) 2017, JULIE Lab. - * All rights reserved. This program and the accompanying materials + * All rights reserved. This program and the accompanying materials * are made available under the terms of the BSD-2-Clause License - * - * Author: - * + *

+ * Author: + *

* Description: **/ package de.julielab.jcore.reader.pmc.parser; @@ -21,6 +20,7 @@ import de.julielab.jcore.types.pubmed.OtherID; import org.apache.uima.jcas.cas.FSArray; +import java.io.File; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -28,162 +28,172 @@ public class FrontParser extends NxmlElementParser { - public FrontParser(NxmlDocumentParser nxmlDocumentParser) { - super(nxmlDocumentParser); - elementName = "front"; - } - - @Override - protected void parseElement(ElementParsingResult frontResult) throws ElementParsingException { - try { - // Only handle the front matter of the actual article, not sub-articles - final String elementPath = getElementPath(); - if (!elementPath.endsWith("/article/front")) { - int firstIndexAfterElement = skipElement(); - frontResult.setLastTokenIndex(firstIndexAfterElement); - frontResult.setResultType(ParsingResult.ResultType.NONE); - return; - } - - // title and abstract - parseXPath("/article/front/article-meta/title-group/article-title").ifPresent(r -> { - ElementParsingResult er = (ElementParsingResult) r; - Title articleTitle = (Title) er.getAnnotation(); - articleTitle.setTitleType("document"); - frontResult.addSubResult(r); - }); - parseXPath("/article/front/article-meta/abstract").ifPresent(r -> { - ElementParsingResult er = (ElementParsingResult) r; - AbstractText abstractText = (AbstractText) er.getAnnotation(); - List abstractSections = er.getSubResultAnnotations(AbstractSection.class); - FSArray fsArray = new FSArray(nxmlDocumentParser.cas, abstractSections.size()); - IntStream.range(0, abstractSections.size()).forEach(i -> fsArray.set(i, abstractSections.get(i))); - abstractText.setStructuredAbstractParts(fsArray); - frontResult.addSubResult(r); - }); - - // article IDs - Optional pmid = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='pmid']"); - Optional pmcid = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='pmc']"); - Optional doi 
= getXPathValue("/article/front/article-meta/article-id[@pub-id-type='doi']"); - - // publication details - String pubType = ""; - String pubDateFmt = "/article/front/article-meta/pub-date[@pub-type='%s']"; - if (xPathExists(String.format(pubDateFmt, "epub"))) - pubType = "epub"; - else if (xPathExists(String.format(pubDateFmt, "ppub"))) - pubType = "ppub"; - else if (xPathExists(String.format(pubDateFmt, "pmc-release"))) - pubType = "pmc-release"; - Optional year = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/year", pubType)); - Optional month = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/month", pubType)); - Optional day = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/day", pubType)); - Optional journalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 - ? getXPathValue("/article/front/journal-meta/journal-title") - : getXPathValue("/article/front/journal-meta/journal-title-group/journal-title"); - // there actually might be several abbreviated titles but here, we - // only use the first; our type system currently cannot represent - // more anyway. One could try decide for an preferred one since the - // abbrev-type attribute disposes the source of the abbreviated - // title (e.g. publisher or nlm-ta). - Optional abbrevJournalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 - ? 
getXPathValue("/article/front/journal-meta/abbrev-journal-title") - : getXPathValue("/article/front/journal-meta/journal-title-group/abbrev-journal-title"); - Optional volume = getXPathValue("/article/front/article-meta/volume"); - Optional issue = getXPathValue("/article/front/article-meta/issue"); - Optional firstPage = getXPathValue("/article/front/article-meta/fpage"); - Optional lastPage = getXPathValue("/article/front/article-meta/lpage"); - Optional elocation = getXPathValue("/article/front/article-meta/elocation-id"); - Optional issn = getXPathValue("/article/front/journal-meta/issn[@pub-type='ppub']"); - - // copyright statement - Optional copyrightStatement = getXPathValue( - "/article/front/article-meta/permissions/copyright-statement"); - - // keywords - Optional> keywords = getXPathValues("/article/front/article-meta/kwd-group/kwd"); - - assert volume.isPresent(); - - Header header = new Header(nxmlDocumentParser.cas); - header.setComponentId(PMCReader.class.getName()); - - pmcid.ifPresent(header::setDocId); - pmid.ifPresent(p -> { - OtherID otherID = new OtherID(nxmlDocumentParser.cas); - otherID.setComponentId(PMCReader.class.getName()); - otherID.setId(p); - otherID.setSource("PubMed"); - FSArray otherIDs = new FSArray(nxmlDocumentParser.cas, 1); - otherIDs.set(0, otherID); - header.setOtherIDs(otherIDs); - }); - doi.ifPresent(header::setDoi); - - copyrightStatement.ifPresent(header::setCopyright); - - Journal journal = new Journal(nxmlDocumentParser.cas); - journal.setComponentId(PMCReader.class.getName()); - journalTitle.ifPresent(journal::setTitle); - abbrevJournalTitle.ifPresent(journal::setShortTitle); - volume.ifPresent(journal::setVolume); - issue.ifPresent(journal::setIssue); - issn.ifPresent(journal::setISSN); - String pages = null; - if (firstPage.isPresent() && lastPage.isPresent()) - pages = firstPage.get() + "--" + lastPage.get(); - else if (firstPage.isPresent()) - pages = firstPage.get(); - else if (elocation.isPresent()) - pages = 
elocation.get(); - journal.setPages(pages); - FSArray pubTypes = new FSArray(nxmlDocumentParser.cas, 1); - pubTypes.set(0, journal); - Date pubDate = new Date(nxmlDocumentParser.cas); - pubDate.setComponentId(PMCReader.class.getName()); - day.map(Integer::parseInt).ifPresent(pubDate::setDay); - month.map(Integer::parseInt).ifPresent(pubDate::setMonth); - year.map(Integer::parseInt).ifPresent(pubDate::setYear); - journal.setPubDate(pubDate); - header.setPubTypeList(pubTypes); - - // authors (more general: contributors; but for the moment we - // restrict ourselves to authors) - parseXPath("/article/front/article-meta/contrib-group").map(ElementParsingResult.class::cast) - .ifPresent(r -> { - // currently only authors - List authors = r.getSubResults().stream().map(ElementParsingResult.class::cast) - .map(e -> e.getAnnotation()).filter(AuthorInfo.class::isInstance) - .map(AuthorInfo.class::cast).collect(Collectors.toList()); - FSArray aiArray = new FSArray(nxmlDocumentParser.cas, authors.size()); - IntStream.range(0, authors.size()).forEach(i -> { - aiArray.set(i, authors.get(i)); - }); - if (aiArray.size() > 0) - header.setAuthors(aiArray); - }); - - frontResult.setAnnotation(header); - - if (keywords.isPresent()) { - List keywordList = keywords.get(); - FSArray fsArray = new FSArray(nxmlDocumentParser.cas, keywordList.size()); - IntStream.range(0, keywordList.size()).forEach(i -> { - Keyword keyword = new Keyword(nxmlDocumentParser.cas); - keyword.setComponentId(PMCReader.class.getName()); - keyword.setName(keywordList.get(i)); - fsArray.set(i, keyword); - }); - ManualDescriptor manualDescriptor = new ManualDescriptor(nxmlDocumentParser.cas); - manualDescriptor.setComponentId(PMCReader.class.getName()); - manualDescriptor.setKeywordList(fsArray); - manualDescriptor.addToIndexes(); - } - - } catch (XPathParseException | XPathEvalException | NavException e) { - throw new ElementParsingException(e); - } - } + public FrontParser(NxmlDocumentParser nxmlDocumentParser) { 
+ super(nxmlDocumentParser); + elementName = "front"; + } + + @Override + protected void parseElement(ElementParsingResult frontResult) throws ElementParsingException { + try { + // Only handle the front matter of the actual article, not sub-articles + final String elementPath = getElementPath(); + if (!elementPath.endsWith("/article/front")) { + int firstIndexAfterElement = skipElement(); + frontResult.setLastTokenIndex(firstIndexAfterElement); + frontResult.setResultType(ParsingResult.ResultType.NONE); + return; + } + + // title and abstract + parseXPath("/article/front/article-meta/title-group/article-title").ifPresent(r -> { + ElementParsingResult er = (ElementParsingResult) r; + Title articleTitle = (Title) er.getAnnotation(); + articleTitle.setTitleType("document"); + frontResult.addSubResult(r); + }); + parseXPath("/article/front/article-meta/abstract").ifPresent(r -> { + ElementParsingResult er = (ElementParsingResult) r; + AbstractText abstractText = (AbstractText) er.getAnnotation(); + List abstractSections = er.getSubResultAnnotations(AbstractSection.class); + FSArray fsArray = new FSArray(nxmlDocumentParser.cas, abstractSections.size()); + IntStream.range(0, abstractSections.size()).forEach(i -> fsArray.set(i, abstractSections.get(i))); + abstractText.setStructuredAbstractParts(fsArray); + frontResult.addSubResult(r); + }); + + // article IDs + Optional pmid = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='pmid']"); + Optional pmcid = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='pmc']"); + Optional doi = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='doi']"); + + // publication details + String pubType = ""; + String pubDateFmt = "/article/front/article-meta/pub-date[@pub-type='%s']"; + if (xPathExists(String.format(pubDateFmt, "epub"))) + pubType = "epub"; + else if (xPathExists(String.format(pubDateFmt, "ppub"))) + pubType = "ppub"; + else if (xPathExists(String.format(pubDateFmt, 
"pmc-release"))) + pubType = "pmc-release"; + Optional year = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/year", pubType)); + Optional month = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/month", pubType)); + Optional day = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/day", pubType)); + Optional journalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 || nxmlDocumentParser.getTagset() == Tagset.NLM_3_0 + ? getXPathValue("/article/front/journal-meta/journal-title") + : getXPathValue("/article/front/journal-meta/journal-title-group/journal-title"); + // there actually might be several abbreviated titles but here, we + // only use the first; our type system currently cannot represent + // more anyway. One could try to decide for a preferred one since the + // abbrev-type attribute disposes the source of the abbreviated + // title (e.g. publisher or nlm-ta). + Optional abbrevJournalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 || nxmlDocumentParser.getTagset() == Tagset.NLM_3_0 + ? 
getXPathValue("/article/front/journal-meta/abbrev-journal-title") + : getXPathValue("/article/front/journal-meta/journal-title-group/abbrev-journal-title"); + Optional volume = getXPathValue("/article/front/article-meta/volume"); + Optional issue = getXPathValue("/article/front/article-meta/issue"); + Optional firstPage = getXPathValue("/article/front/article-meta/fpage"); + Optional lastPage = getXPathValue("/article/front/article-meta/lpage"); + Optional elocation = getXPathValue("/article/front/article-meta/elocation-id"); + Optional issn = getXPathValue("/article/front/journal-meta/issn[@pub-type='ppub']"); + + // copyright statement + Optional copyrightStatement = getXPathValue( + "/article/front/article-meta/permissions/copyright-statement"); + + // keywords + Optional> keywords = getXPathValues("/article/front/article-meta/kwd-group/kwd"); + + assert volume.isPresent(); + + Header header = new Header(nxmlDocumentParser.cas); + header.setSource("PubMed Central"); + header.setComponentId(PMCReader.class.getName()); + + pmcid.ifPresentOrElse(id -> header.setDocId(id.startsWith("PMC") ? id : "PMC" + id), () -> { + // try to extract the PMCID from the file name + // For now, let the dot indicate that this is, indeed, a file name; the source also be an InputStream, + // then we don't have access to the file name + int dotIndex = nxmlDocumentParser.getCurrentSource().toString().lastIndexOf('.'); + if (dotIndex > 0) { + String filenameId = nxmlDocumentParser.getCurrentSource().toString().substring(nxmlDocumentParser.getCurrentSource().toString().lastIndexOf(File.separatorChar) + 1, dotIndex); + header.setDocId(filenameId.startsWith("PMC") ? 
filenameId : "PMC" + filenameId); + } + }); + pmid.ifPresent(p -> { + OtherID otherID = new OtherID(nxmlDocumentParser.cas); + otherID.setComponentId(PMCReader.class.getName()); + otherID.setId(p); + otherID.setSource("PubMed"); + FSArray otherIDs = new FSArray(nxmlDocumentParser.cas, 1); + otherIDs.set(0, otherID); + header.setOtherIDs(otherIDs); + }); + doi.ifPresent(header::setDoi); + + copyrightStatement.ifPresent(header::setCopyright); + + Journal journal = new Journal(nxmlDocumentParser.cas); + journal.setComponentId(PMCReader.class.getName()); + journalTitle.ifPresent(journal::setTitle); + abbrevJournalTitle.ifPresent(journal::setShortTitle); + volume.ifPresent(journal::setVolume); + issue.ifPresent(journal::setIssue); + issn.ifPresent(journal::setISSN); + String pages = null; + if (firstPage.isPresent() && lastPage.isPresent()) + pages = firstPage.get() + "--" + lastPage.get(); + else if (firstPage.isPresent()) + pages = firstPage.get(); + else if (elocation.isPresent()) + pages = elocation.get(); + journal.setPages(pages); + FSArray pubTypes = new FSArray(nxmlDocumentParser.cas, 1); + pubTypes.set(0, journal); + Date pubDate = new Date(nxmlDocumentParser.cas); + pubDate.setComponentId(PMCReader.class.getName()); + day.map(Integer::parseInt).ifPresent(pubDate::setDay); + month.map(Integer::parseInt).ifPresent(pubDate::setMonth); + year.map(Integer::parseInt).ifPresent(pubDate::setYear); + journal.setPubDate(pubDate); + header.setPubTypeList(pubTypes); + + // authors (more general: contributors; but for the moment we + // restrict ourselves to authors) + parseXPath("/article/front/article-meta/contrib-group").map(ElementParsingResult.class::cast) + .ifPresent(r -> { + // currently only authors + List authors = r.getSubResults().stream().map(ElementParsingResult.class::cast) + .map(e -> e.getAnnotation()).filter(AuthorInfo.class::isInstance) + .map(AuthorInfo.class::cast).collect(Collectors.toList()); + FSArray aiArray = new FSArray(nxmlDocumentParser.cas, 
authors.size()); + IntStream.range(0, authors.size()).forEach(i -> { + aiArray.set(i, authors.get(i)); + }); + if (aiArray.size() > 0) + header.setAuthors(aiArray); + }); + + frontResult.setAnnotation(header); + + if (keywords.isPresent()) { + List keywordList = keywords.get(); + FSArray fsArray = new FSArray(nxmlDocumentParser.cas, keywordList.size()); + IntStream.range(0, keywordList.size()).forEach(i -> { + Keyword keyword = new Keyword(nxmlDocumentParser.cas); + keyword.setComponentId(PMCReader.class.getName()); + keyword.setName(keywordList.get(i)); + fsArray.set(i, keyword); + }); + ManualDescriptor manualDescriptor = new ManualDescriptor(nxmlDocumentParser.cas); + manualDescriptor.setComponentId(PMCReader.class.getName()); + manualDescriptor.setKeywordList(fsArray); + manualDescriptor.addToIndexes(); + } + + } catch (XPathParseException | XPathEvalException | NavException e) { + throw new ElementParsingException(e); + } + } } diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java index d85e133c2..7bafb1a39 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java @@ -40,20 +40,26 @@ public class NxmlDocumentParser extends NxmlParser { private DefaultElementParser defaultElementParser; private Map> tagProperties; private Tagset tagset; - private URI uri; + private Object currentSource; public void reset(File nxmlFile, JCas cas) throws DocumentParsingException { reset(nxmlFile.toURI(), cas); + currentSource = nxmlFile; + } + + public Object getCurrentSource() { + return currentSource; } public void reset(URI uri, JCas cas) throws DocumentParsingException { - this.uri = uri; - boolean gzipped = uri.toString().endsWith(".gz") || this.uri.toString().endsWith(".gzip"); + boolean 
gzipped = uri.toString().endsWith(".gz") || uri.toString().endsWith(".gzip"); try { + log.debug("Reading from URL {}", uri.toURL()); InputStream is = uri.toURL().openStream(); if (gzipped) is = new GZIPInputStream(is); reset(is, cas); + currentSource = uri; } catch (IOException e) { throw new DocumentParsingException(e); } @@ -74,6 +80,7 @@ public void reset(InputStream is, JCas cas) throws DocumentParsingException { vn = vg.getNav(); setTagset(); setupParserRegistry(); + currentSource = ""; } catch (IOException | VTDException e) { throw new DocumentParsingException(e); } @@ -87,7 +94,7 @@ public void reset(InputStream is, JCas cas) throws DocumentParsingException { * @throws NavException * @throws DocTypeNotFoundException */ - private void setTagset() throws NavException, DocTypeNotFoundException, DocTypeNotSupportedException { + private void setTagset() throws NavException, DocTypeNotFoundException { for (int i = 0; i < vn.getTokenCount(); i++) { if (vn.getTokenType(i) == VTDNav.TOKEN_DTD_VAL) { String docType = StringUtils.normalizeSpace(vn.toString(i)).replaceAll("'", "\""); @@ -95,16 +102,23 @@ private void setTagset() throws NavException, DocTypeNotFoundException, DocTypeN tagset = Tagset.JATS_1_0; else if (docType.contains("JATS-archivearticle1-mathml3.dtd")) tagset = Tagset.JATS_1_2_MATH_ML_3; + else if (docType.contains("JATS-archivearticle1-3-mathml3.dtd")) + tagset = Tagset.JATS_1_3; else if (docType.contains("journalpublishing.dtd") || docType.contains("archivearticle.dtd")) tagset = Tagset.NLM_2_3; else if (docType.contains("journalpublishing3.dtd") || docType.contains("archivearticle3.dtd")) tagset = Tagset.NLM_3_0; - else - throw new DocTypeNotSupportedException("Unsupported document type: " + docType); + else if (docType.contains("JATS")) { + log.warn("Unknown document type: {}. 
Assigning the latest JATS tagset in assumption of backward compatibility.", docType); + tagset = Tagset.JATS_1_3; + } else if (docType.contains("journalpublishing") || docType.contains("archivearticle")) { + log.warn("Unknown document type: {}. Assigning the latest NLM tagset in assumption of backward compatibility.", docType); + tagset = Tagset.NLM_3_0; + } return; } } - throw new DocTypeNotFoundException("Could not find a doctype."); + throw new DocTypeNotFoundException("Could not find a known doctype."); } private void setupParserRegistry() { @@ -146,9 +160,14 @@ public Map getParserRegistry() { } public ElementParsingResult parse() throws ElementParsingException, DocumentParsingException { - String startingElement = moveToNextStartingTag(); - assert startingElement.equals("article") : "Did not encounter an article element as first start element"; - return getParser(startingElement).parse(); + try { + String startingElement = moveToNextStartingTag(); + assert startingElement.equals("article") : "Did not encounter an article element as first start element"; + return getParser(startingElement).parse(); + } catch (Exception e) { + log.error("Exception while parsing document from source {}", currentSource); + throw e; + } } public NxmlElementParser getParser(String tagName) { @@ -211,6 +230,12 @@ public enum Tagset { * @see https://jats.nlm.nih.gov/publishing/tag-library/1.2/index.html */ JATS_1_2_MATH_ML_3, + /** + * NISO JATS Version 1.3 (ANSI/NISO Z39.96-2021) + * + * @see https://jats.nlm.nih.gov/publishing/tag-library/1.3/index.html + */ + JATS_1_3, /** * NLM Journal Publishing DTD v. 
2.3 * diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/SectionParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/SectionParser.java index 6283db703..787e0e8aa 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/SectionParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/SectionParser.java @@ -66,15 +66,17 @@ protected void parseElement(ElementParsingResult parsingResult) throws ElementPa if (!secTitleAnnotations.isEmpty()) sectionHeading = secTitleAnnotations.get(0); Section section = (Section) parsingResult.getAnnotation(); - section.setComponentId(PMCReader.class.getName()); - section.setSectionHeading(sectionHeading); - section.setDepth(depth); - section.setSectionId(sectionId); - List label = parsingResult.getSubResults("label"); - if (!label.isEmpty()) { - // there is only one label element - ElementParsingResult labelParsingResult = (ElementParsingResult) label.get(0); - section.setLabel(labelParsingResult.getResultText()); + if (section != null) { + section.setComponentId(PMCReader.class.getName()); + section.setSectionHeading(sectionHeading); + section.setDepth(depth); + section.setSectionId(sectionId); + List label = parsingResult.getSubResults("label"); + if (!label.isEmpty()) { + // there is only one label element + ElementParsingResult labelParsingResult = (ElementParsingResult) label.get(0); + section.setLabel(labelParsingResult.getResultText()); + } } } } catch (NavException e) { diff --git a/jcore-pmc-reader/src/main/resources/LICENSE.txt b/jcore-pmc-reader/src/main/resources/LICENSE.txt index fbbd41e05..d0f946a29 100644 --- a/jcore-pmc-reader/src/main/resources/LICENSE.txt +++ b/jcore-pmc-reader/src/main/resources/LICENSE.txt @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2017, JULIE Lab +Copyright (c) 2022, JULIE Lab All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-multiplier.xml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-multiplier.xml index dba9b5af0..c8dbc0610 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-multiplier.xml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-multiplier.xml @@ -8,7 +8,7 @@ This multiplier expect to receive URIs to NXML documents in the form of JCoReURI feature structures. All JCoReURI FS in the annotation indexes are read and output as new CASes. - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml index 88d0d6c73..81a3f80d4 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml @@ -5,7 +5,7 @@ JCoRe Pubmed Central Multiplier Reader Reads a directory of NXML files, possibly assembled into ZIP archives. Requires the Pubmed Central Multiplier to follow in the pipeline. This reader only sends URIs referencing the NXML files to the multiplier that then does the parsing. - 2.5.1-SNAPSHOT + 2.6.0 SendCasToLast @@ -49,6 +49,13 @@ false false + + OmitBibliographyReferences + If set to true, references to the bibliography are omitted from the CAS text. 
+ Boolean + false + false + diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml index 74eee3a1a..6df46fc6d 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml @@ -5,7 +5,7 @@ JCoRe Pubmed Central Reader Reads Pubmed Central documents from the NXML format - 2.5.1-SNAPSHOT + 2.6.0 Input @@ -42,6 +42,13 @@ false false + + OmitBibliographyReferences + If set to true, references to the bibliography are omitted from the CAS text. + Boolean + false + false + diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml new file mode 100644 index 000000000..a5cd1a93c --- /dev/null +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml @@ -0,0 +1,73 @@ +article-title: + block-element: true + type: de.julielab.jcore.types.Title +title: + block-element: true + type: de.julielab.jcore.types.Title + default-feature-values: + titleType: other + paths: + - path: sec/title + type: de.julielab.jcore.types.SectionTitle + default-feature-values: + titleType: section + - path: abstract/sec/title + type: de.julielab.jcore.types.AbstractSectionHeading + default-feature-values: + titleType: abstractSection +abstract: + block-element: true + type: de.julielab.jcore.types.AbstractText +label: + block-element: true + type: de.julielab.jcore.types.Title + default-feature-values: + titleType: other + paths: + - path: list-item/label + omit-element: true +sec: + block-element: true + type: de.julielab.jcore.types.Section + paths: + - path: abstract/sec + type: 
de.julielab.jcore.types.AbstractSection + attributes: + - name: sec-type + value: supplementary-material + omit-element: true +p: + block-element: true + type: de.julielab.jcore.types.Paragraph +list: + block-element: true + type: de.julielab.jcore.types.List +list-item: + block-element: true + type: de.julielab.jcore.types.ListItem +caption: + block-element: true + type: de.julielab.jcore.types.Caption + default-feature-values: + captionType: other +fn-group: + block-element: true +front: + omit-element: true +back: + omit-element: true +fig-group: + omit-element: true +floats-group: + omit-element: true +array: + omit-element: true +inline-formula: + omit-element: true +disp-formula: + omit-element: true +xref: + attributes: + - name: ref-type + value: bibr + omit-element: true \ No newline at end of file diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml index f8b5d3429..f1f7dc832 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml @@ -7,6 +7,9 @@ title: default-feature-values: titleType: other paths: + - path: abstract/title + default-feature-values: + titleType: abstract - path: sec/title type: de.julielab.jcore.types.SectionTitle default-feature-values: @@ -21,6 +24,8 @@ abstract: label: block-element: true type: de.julielab.jcore.types.Title + default-feature-values: + titleType: other paths: - path: list-item/label omit-element: true @@ -30,6 +35,10 @@ sec: paths: - path: abstract/sec type: de.julielab.jcore.types.AbstractSection + attributes: + - name: sec-type + value: supplementary-material + omit-element: true p: block-element: true type: de.julielab.jcore.types.Paragraph @@ -56,4 +65,7 @@ floats-group: omit-element: true array: 
omit-element: true - +inline-formula: + omit-element: true +disp-formula: + omit-element: true \ No newline at end of file diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java index 14faf27df..f1e6bd11c 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java @@ -1,17 +1,21 @@ package de.julielab.jcore.reader.pmc; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileNotFoundException; -import java.net.URI; +import java.net.*; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertTrue; +import static org.assertj.core.api.Assertions.assertThatCode; +import static org.junit.jupiter.api.Assertions.assertTrue; public class NXMLURIIteratorTest { @Test @@ -53,4 +57,16 @@ public void testGetPmcFiles() throws Exception { assertThat(expectedFiles).containsExactlyInAnyOrder("PMC2847692.nxml.gz", "PMC2758189.nxml.gz", "PMC2970367.nxml.gz", "PMC3201365.nxml.gz", "PMC4257438.nxml.gz"); } + + @Test + public void testXmlEntities() throws MalformedURLException, URISyntaxException { + String inputPath = "jar:file:/data/data_corpora/PMC/non_comm_use.O-Z.xml.zip!/Pädiatrische_Gastroenterologie,_Hepatologie_und_Ernährung/PMC7498810.nxml"; + int exclamationIndex = inputPath.indexOf('!'); + String encoded = inputPath.substring(0, exclamationIndex + 2) + Stream.of(inputPath.substring(exclamationIndex+2).split("/")).map(x -> URLEncoder.encode(x, UTF_8)).collect(Collectors.joining("/")); + URL url = new URL(encoded); + 
assertThat(url).isNotNull(); + assertThatCode(() -> url.toURI().toASCIIString()).doesNotThrowAnyException(); + String outputPath = Stream.of(url.toURI().toASCIIString().split("/")).map(x -> URLDecoder.decode(x, UTF_8)).collect(Collectors.joining("/")); + assertThat(inputPath).isEqualTo(outputPath); + } } diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java index b411afc46..27339365b 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java @@ -11,7 +11,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.HashSet; @@ -47,8 +47,8 @@ public void testMultiplier() throws UIMAException, IOException { } ++numBatches; } - assertThat(receivedDocIds).containsExactlyInAnyOrder("2847692", "2758189", - "2970367", "3201365", "4257438"); + assertThat(receivedDocIds).containsExactlyInAnyOrder("PMC2847692", "PMC2758189", + "PMC2970367", "PMC3201365", "PMC4257438"); assertThat(numBatches).isEqualTo(3); } @@ -78,8 +78,8 @@ public void testMultiplierFromDescriptors() throws UIMAException, IOException { } ++numBatches; } - assertThat(receivedDocIds).containsExactlyInAnyOrder("2847692", "2758189", - "2970367", "3201365", "4257438"); + assertThat(receivedDocIds).containsExactlyInAnyOrder("PMC2847692", "PMC2758189", + "PMC2970367", "PMC3201365", "PMC4257438"); assertThat(numBatches).isEqualTo(3); } } diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java index 308f950d2..c8b3bc40b 100644 --- 
a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java @@ -19,18 +19,21 @@ import org.apache.uima.fit.util.CasUtil; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.List; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatCode; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class PMCReaderTest { + @Test public void testPmcReader1() throws Exception { // read a single file, parse it and right it to XMI for manual review @@ -83,7 +86,7 @@ public void testPmcReader2() throws Exception { cas.reset(); } - assertThat(foundDocuments).containsExactlyInAnyOrder("2847692", "3201365", "4257438", "2758189", "2970367"); + assertThat(foundDocuments).containsExactlyInAnyOrder("PMC2847692", "PMC3201365", "PMC4257438", "PMC2758189", "PMC2970367"); } @Test @@ -122,7 +125,7 @@ public void testPmcReaderRecursiveZip() throws Exception { cas.reset(); } - assertThat(foundDocuments).containsExactlyInAnyOrder("2847692", "3201365", "4257438", "2758189", "2970367"); + assertThat(foundDocuments).containsExactlyInAnyOrder("PMC2847692", "PMC3201365", "PMC4257438", "PMC2758189", "PMC2970367"); } @Test @@ -146,7 +149,7 @@ public void testPmcReaderWhitelist() throws Exception { foundDocuments.add(header.getDocId()); cas.reset(); } - assertThat(foundDocuments).containsExactlyInAnyOrder("2847692", "2758189"); + assertThat(foundDocuments).containsExactlyInAnyOrder("PMC2847692", "PMC2758189"); } @Test @@ -176,7 +179,7 @@ public void testHeader() throws Exception { Header header = (Header) CasUtil.selectSingle(cas.getCas(), 
CasUtil.getAnnotationType(cas.getCas(), Header.class)); assertNotNull(header); - assertEquals("2847692", header.getDocId()); + assertEquals("PMC2847692", header.getDocId()); assertNotNull(header.getPubTypeList()); assertTrue(header.getPubTypeList().size() > 0); assertEquals("Ambio", ((Journal) header.getPubTypeList(0)).getTitle()); @@ -229,7 +232,7 @@ public void testTables() throws Exception { assertNotNull(table.getObjectTitle()); Title tabelTitle = table.getObjectTitle(); if (tablenum == 0) { - assertEquals("Table 1", tabelTitle.getCoveredText()); + assertEquals("Table 1", tabelTitle.getCoveredText()); // the whitespace is actually a no-break space; note that the // last '1' is actually the digit 1 and not a part of the // codepoint @@ -258,7 +261,7 @@ public void testFigures() throws Exception { assertNotNull(figure.getObjectTitle()); Title tabelTitle = figure.getObjectTitle(); if (tablenum == 0) { - assertEquals("Fig. 1", tabelTitle.getCoveredText()); + assertEquals("Fig. 1", tabelTitle.getCoveredText()); // the whitespace is actually a no-break space; note that the // last '1' is actually the digit 1 and not a part of the // codepoint @@ -288,8 +291,8 @@ public void testKeywords() throws Exception { Set expectedKeywords = new HashSet<>(Arrays.asList("Baltic Sea Action Plan (BSAP)", "Costs", "Review", "Eutrophication", "Hazardous substances")); IntStream.range(0, md.getKeywordList().size()) - .forEach(i -> assertTrue("The keyword \"" + md.getKeywordList(i).getName() + "\" was not expected", - expectedKeywords.remove(md.getKeywordList(i).getName()))); + .forEach(i -> assertTrue(expectedKeywords.remove(md.getKeywordList(i).getName()), + "The keyword \"" + md.getKeywordList(i).getName() + "\" was not expected")); assertTrue(expectedKeywords.isEmpty()); } @@ -359,6 +362,53 @@ public void testFigureReferencesAnnotated() throws Exception { assertThat(figRefs).extracting("refid").containsExactly("Fig1", "Fig2"); } + @Test + public void 
testBibliographyReferencesAnnotated() throws Exception { + JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + CollectionReader reader = CollectionReaderFactory.createReader(PMCReader.class, PMCReader.PARAM_INPUT, + "src/test/resources/documents-recursive/PMC2847692.nxml.gz"); + reader.getNext(cas.getCas()); + Collection refs = JCasUtil.select(cas, InternalReference.class); + // Without a filter on bibliographic references, there should 76 references to bibliography + List bibliography = refs.stream().filter(r -> r.getReftype().equalsIgnoreCase("bibliography")).collect(Collectors.toList()); + assertThat(bibliography).hasSize(76); + + // RegEx for something like "2004a" + Matcher yearReferenceMatcher = Pattern.compile("[0-9]{4}[ab]?").matcher(cas.getDocumentText()); + int numReferencePatternsInText = 0; + while (yearReferenceMatcher.find()) { + ++numReferencePatternsInText; + } + // Some found patterns are no references, thus the number is higher than that of the references. 
+ assertThat(numReferencePatternsInText).isEqualTo(84); + } + + @Test + public void testBibliographyReferencesOmitted() throws Exception { + JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + CollectionReader reader = CollectionReaderFactory.createReader(PMCReader.class, PMCReader.PARAM_INPUT, + "src/test/resources/documents-recursive/PMC2847692.nxml.gz", + PMCMultiplierReader.PARAM_OMIT_BIB_REFERENCES, true); + reader.getNext(cas.getCas()); + Collection refs = JCasUtil.select(cas, InternalReference.class); + // Since we set the omission parameter to true, there should be no bibliographic references + List bibliography = refs.stream().filter(r -> r.getReftype().equalsIgnoreCase("bibliography")).collect(Collectors.toList()); + assertThat(bibliography).isEmpty(); + + // RegEx for something like "2004a" + Matcher yearReferenceMatcher = Pattern.compile("[0-9]{4}[ab]?").matcher(cas.getDocumentText()); + int numReferencePatternsInText = 0; + while (yearReferenceMatcher.find()) { + ++numReferencePatternsInText; + } + // In the test above, where we have the same document but with bib. references, there were 84 occurrences + // of the pattern. 76 of those were actual references. Thus, after removing the references, 8 pattern + // occurrences should remain. 
+ assertThat(numReferencePatternsInText).isEqualTo(8); + } + @Test public void testPmcReaderDescriptor() throws Exception { // read a whole directory with subdirectories @@ -378,7 +428,7 @@ public void testPmcReaderDescriptor() throws Exception { cas.reset(); } - assertThat(foundDocuments).containsExactlyInAnyOrder("2847692", "3201365", "4257438", "2758189", "2970367"); + assertThat(foundDocuments).containsExactlyInAnyOrder("PMC2847692", "PMC3201365", "PMC4257438", "PMC2758189", "PMC2970367"); } @Test @@ -418,4 +468,21 @@ public void testExtractIdFromFilename() throws Exception { String docId = ((Header)cas.getAnnotationIndex(Header.type).iterator().next()).getDocId(); assertEquals(docId, "PMC2847692"); } + + @Test + public void testInlineXmlSpaceIssues() throws Exception { + // read a single file, parse it and right it to XMI for manual review + JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + CollectionReader reader = CollectionReaderFactory.createReader(PMCReader.class, PMCReader.PARAM_INPUT, + "src/test/resources/documents-errorcauses/PMC2674676.xml.gz"); + while (reader.hasNext()) { + reader.getNext(cas.getCas()); + // looks like this in the XML: + // This preprocessing is performed on both s and r + // Thus, there should be whitespaces around s and r. 
In the error case, the text looked like this: + // This preprocessing is performed on boths andr + assertThat(cas.getDocumentText()).contains("This preprocessing is performed on both s and r"); + } + } } diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribGroupParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribGroupParserTest.java index 911500480..324a653dc 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribGroupParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribGroupParserTest.java @@ -12,12 +12,12 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class ContribGroupParserTest { @Test diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribParserTest.java index 667e85812..fc3f81489 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribParserTest.java @@ -13,11 +13,11 @@ import de.julielab.jcore.types.AuthorInfo; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class ContribParserTest { @Test diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java 
b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java index c09fc6313..a3ba75ae7 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java @@ -16,11 +16,11 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class FrontParserTest { @Test @@ -38,7 +38,7 @@ public void testParser() throws Exception { Annotation annotation = frontResult.getAnnotation(); assertTrue(annotation instanceof Header); Header header = (Header) annotation; - assertEquals("2847692", header.getDocId()); + assertEquals("PMC2847692", header.getDocId()); assertEquals("10.1007/s13280-009-0005-8", header.getDoi()); assertNotNull(header.getOtherIDs()); assertTrue(header.getOtherIDs().size() > 0); diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/NxmlElementParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/NxmlElementParserTest.java index 256ac33a0..de3fca292 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/NxmlElementParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/NxmlElementParserTest.java @@ -12,11 +12,11 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class NxmlElementParserTest { @Test diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/SectionParserTest.java 
b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/SectionParserTest.java index 8d2baf7fb..136420616 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/SectionParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/SectionParserTest.java @@ -15,7 +15,7 @@ import org.apache.commons.io.IOUtils; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -23,7 +23,7 @@ import java.io.FileInputStream; import java.util.zip.GZIPInputStream; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class SectionParserTest { diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/XRefParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/XRefParserTest.java index 46c79e5fb..72d94b03b 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/XRefParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/XRefParserTest.java @@ -6,12 +6,12 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class XRefParserTest { @Test diff --git a/jcore-pmc-reader/src/test/resources/documents-errorcauses/PMC2674676.xml.gz b/jcore-pmc-reader/src/test/resources/documents-errorcauses/PMC2674676.xml.gz new file mode 100644 index 000000000..10791559b Binary files /dev/null and 
b/jcore-pmc-reader/src/test/resources/documents-errorcauses/PMC2674676.xml.gz differ diff --git a/jcore-ppd-writer/component.meta b/jcore-ppd-writer/component.meta index d10916db5..457dfc7a3 100644 --- a/jcore-ppd-writer/component.meta +++ b/jcore-ppd-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-ppd-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Piped Format Writer" } diff --git a/jcore-ppd-writer/pom.xml b/jcore-ppd-writer/pom.xml index 6009a4286..e5649259e 100644 --- a/jcore-ppd-writer/pom.xml +++ b/jcore-ppd-writer/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -42,8 +42,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Piped Format Writer diff --git a/jcore-ppd-writer/src/main/resources/de/julielab/jcore/consumer/ppd/desc/jcore-ppd-writer.xml b/jcore-ppd-writer/src/main/resources/de/julielab/jcore/consumer/ppd/desc/jcore-ppd-writer.xml index de2470406..11c5b3e39 100644 --- a/jcore-ppd-writer/src/main/resources/de/julielab/jcore/consumer/ppd/desc/jcore-ppd-writer.xml +++ b/jcore-ppd-writer/src/main/resources/de/julielab/jcore/consumer/ppd/desc/jcore-ppd-writer.xml @@ -6,7 +6,7 @@ JCoRe PPD Writer This component writes CAS annotation data to the pipe-separated format. For example, writing tokens with their PoS would result in text like 'The|DET tree|NN is|VBZ green|ADJ'. The component can be configured for an arbitrary number of annotations to be added to each token. 
- 2.5.1-SNAPSHOT + 2.6.0 TypeToLabelMappings diff --git a/jcore-ppd-writer/src/test/java/de/julielab/jcore/consumer/ppd/PPDWriterTest.java b/jcore-ppd-writer/src/test/java/de/julielab/jcore/consumer/ppd/PPDWriterTest.java index 0327f1b26..0603851fc 100644 --- a/jcore-ppd-writer/src/test/java/de/julielab/jcore/consumer/ppd/PPDWriterTest.java +++ b/jcore-ppd-writer/src/test/java/de/julielab/jcore/consumer/ppd/PPDWriterTest.java @@ -7,13 +7,13 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; /** diff --git a/jcore-pubtator-reader/component.meta b/jcore-pubtator-reader/component.meta index 591a4acb5..26181da79 100644 --- a/jcore-pubtator-reader/component.meta +++ b/jcore-pubtator-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-pubtator-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe PubTator Reader" } diff --git a/jcore-pubtator-reader/pom.xml b/jcore-pubtator-reader/pom.xml index bd57f680f..c311f8fac 100644 --- a/jcore-pubtator-reader/pom.xml +++ b/jcore-pubtator-reader/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 .. 
@@ -41,8 +41,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-pubtator-reader/src/main/resources/de/julielab/jcore/reader/pubtator/desc/jcore-pubtator-reader.xml b/jcore-pubtator-reader/src/main/resources/de/julielab/jcore/reader/pubtator/desc/jcore-pubtator-reader.xml index 82cd90174..58cd8067a 100644 --- a/jcore-pubtator-reader/src/main/resources/de/julielab/jcore/reader/pubtator/desc/jcore-pubtator-reader.xml +++ b/jcore-pubtator-reader/src/main/resources/de/julielab/jcore/reader/pubtator/desc/jcore-pubtator-reader.xml @@ -5,7 +5,7 @@ jcore-pubtator-reader - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-pubtator-reader/src/test/java/de/julielab/jcore/reader/pubtator/PubtatorReaderTest.java b/jcore-pubtator-reader/src/test/java/de/julielab/jcore/reader/pubtator/PubtatorReaderTest.java index afece0a59..9ad0c4efc 100644 --- a/jcore-pubtator-reader/src/test/java/de/julielab/jcore/reader/pubtator/PubtatorReaderTest.java +++ b/jcore-pubtator-reader/src/test/java/de/julielab/jcore/reader/pubtator/PubtatorReaderTest.java @@ -16,14 +16,14 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.Set; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class PubtatorReaderTest { @Test @@ -73,6 +73,6 @@ public void testDocumentDirectory() throws Exception { } jcas.reset(); } - assertTrue("The following IDs have not been read: " + expectedDocIds, expectedDocIds.isEmpty()); + assertTrue(expectedDocIds.isEmpty(), "The following IDs have not been read: " + expectedDocIds); } } diff --git a/jcore-stanford-lemmatizer-ae/component.meta b/jcore-stanford-lemmatizer-ae/component.meta index 872ac0bdd..10ad8a71e 100644 --- a/jcore-stanford-lemmatizer-ae/component.meta +++ 
b/jcore-stanford-lemmatizer-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-stanford-lemmatizer-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Stanford Lemmatizer" } diff --git a/jcore-stanford-lemmatizer-ae/pom.xml b/jcore-stanford-lemmatizer-ae/pom.xml index 84fa15b4f..01f130a8c 100644 --- a/jcore-stanford-lemmatizer-ae/pom.xml +++ b/jcore-stanford-lemmatizer-ae/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-stanford-lemmatizer-ae JCoRe Stanford Lemmatizer @@ -37,8 +37,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-stanford-lemmatizer-ae/src/main/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer.xml b/jcore-stanford-lemmatizer-ae/src/main/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer.xml index 47dec0beb..b1ff669c4 100644 --- a/jcore-stanford-lemmatizer-ae/src/main/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer.xml +++ b/jcore-stanford-lemmatizer-ae/src/main/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer.xml @@ -5,7 +5,7 @@ JCoRe Stanford Lemmatizer This is the UIMA Wrapper for the Stanford CoreNLP Lemmatizer component. 
- 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-stanford-lemmatizer-ae/src/test/java/de/julielab/jcore/ae/stanford/lemma/StanfordLemmatizerTest.java b/jcore-stanford-lemmatizer-ae/src/test/java/de/julielab/jcore/ae/stanford/lemma/StanfordLemmatizerTest.java index 3e8b94fc2..ca0e0138b 100644 --- a/jcore-stanford-lemmatizer-ae/src/test/java/de/julielab/jcore/ae/stanford/lemma/StanfordLemmatizerTest.java +++ b/jcore-stanford-lemmatizer-ae/src/test/java/de/julielab/jcore/ae/stanford/lemma/StanfordLemmatizerTest.java @@ -13,7 +13,6 @@ import de.julielab.jcore.types.PennBioIEPOSTag; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -22,12 +21,15 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Iterator; -public class StanfordLemmatizerTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class StanfordLemmatizerTest { private static final Logger LOGGER = LoggerFactory .getLogger(StanfordLemmatizerTest.class); @@ -66,6 +68,7 @@ public void initCas(JCas aJCas) { } @SuppressWarnings("rawtypes") + @Test public void testProcess() { XMLInputSource lemmaXML = null; diff --git a/jcore-stanford-lemmatizer-ae/src/test/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer-ae.xml b/jcore-stanford-lemmatizer-ae/src/test/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer-ae.xml index ca8ce0703..867658d58 100644 --- a/jcore-stanford-lemmatizer-ae/src/test/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer-ae.xml +++ 
b/jcore-stanford-lemmatizer-ae/src/test/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer-ae.xml @@ -7,7 +7,7 @@ jcore-stanford-lemmatizer-ae This is the UIMA Wrapper for the Stanford CoreNLP Lemmatizer component. - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-topic-indexing-ae/component.meta b/jcore-topic-indexing-ae/component.meta index 03a0d63b9..a43efef61 100644 --- a/jcore-topic-indexing-ae/component.meta +++ b/jcore-topic-indexing-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-topic-indexing-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe-Topic-Labeling-AE" } diff --git a/jcore-topic-indexing-ae/pom.xml b/jcore-topic-indexing-ae/pom.xml index 010ad84e5..e399f4c39 100644 --- a/jcore-topic-indexing-ae/pom.xml +++ b/jcore-topic-indexing-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -53,7 +53,7 @@ de.julielab jcore-xmi-reader - 2.5.1-SNAPSHOT + 2.6.0 test @@ -96,8 +96,8 @@ - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe-Topic-Labeling-AE diff --git a/jcore-topic-indexing-ae/src/main/resources/de/julielab/jcore/ae/topicindexing/desc/jcore-topic-indexing-ae.xml b/jcore-topic-indexing-ae/src/main/resources/de/julielab/jcore/ae/topicindexing/desc/jcore-topic-indexing-ae.xml index 95d04054e..078ceb6b4 100644 --- a/jcore-topic-indexing-ae/src/main/resources/de/julielab/jcore/ae/topicindexing/desc/jcore-topic-indexing-ae.xml +++ b/jcore-topic-indexing-ae/src/main/resources/de/julielab/jcore/ae/topicindexing/desc/jcore-topic-indexing-ae.xml @@ -6,7 +6,7 @@ JCoRe Topic Indexer This component assigns topics relative to a given topic model to the encoutered documents. The topic model is one trained by the julielab-topic-modeling project. 
- 2.5.1-SNAPSHOT + 2.6.0 TopicModelConfig diff --git a/jcore-topic-indexing-ae/src/test/java/de/julielab/jcore/ae/TopicIndexingTest.java b/jcore-topic-indexing-ae/src/test/java/de/julielab/jcore/ae/TopicIndexingTest.java index f42a8368b..756bad437 100644 --- a/jcore-topic-indexing-ae/src/test/java/de/julielab/jcore/ae/TopicIndexingTest.java +++ b/jcore-topic-indexing-ae/src/test/java/de/julielab/jcore/ae/TopicIndexingTest.java @@ -21,7 +21,7 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -30,8 +30,8 @@ import java.util.List; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Unit tests for jcore-topic-labeling-ae. 
diff --git a/jcore-topics-writer/component.meta b/jcore-topics-writer/component.meta index c98a40a2e..f501752f1 100644 --- a/jcore-topics-writer/component.meta +++ b/jcore-topics-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-topics-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe Topics Writer" } diff --git a/jcore-topics-writer/pom.xml b/jcore-topics-writer/pom.xml index ad9569a47..d570cd667 100644 --- a/jcore-topics-writer/pom.xml +++ b/jcore-topics-writer/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -42,8 +42,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Topics Writer diff --git a/jcore-topics-writer/src/main/resources/de/julielab/jcore/consumer/topics/desc/jcore-topics-writer.xml b/jcore-topics-writer/src/main/resources/de/julielab/jcore/consumer/topics/desc/jcore-topics-writer.xml index 47a3c1bb6..a1ed20a19 100644 --- a/jcore-topics-writer/src/main/resources/de/julielab/jcore/consumer/topics/desc/jcore-topics-writer.xml +++ b/jcore-topics-writer/src/main/resources/de/julielab/jcore/consumer/topics/desc/jcore-topics-writer.xml @@ -6,7 +6,7 @@ JCoRe Topics Writer Writes the topic weights, given the jcore-topic-indexing-ae running before, into a simple text file. Thus, the output consists of a sequency of double numbers encodes as strings, separated by tab characters. The topic ID is just the 0-based index of each number, from left to right in the written file. The first entry of each file is the document ID. 
- 2.5.1-SNAPSHOT + 2.6.0 OutputDirectory diff --git a/jcore-topics-writer/src/test/java/de/julielab/jcore/consumer/topics/TopicsWriterTest.java b/jcore-topics-writer/src/test/java/de/julielab/jcore/consumer/topics/TopicsWriterTest.java index 4db31fa61..693ce2c58 100644 --- a/jcore-topics-writer/src/test/java/de/julielab/jcore/consumer/topics/TopicsWriterTest.java +++ b/jcore-topics-writer/src/test/java/de/julielab/jcore/consumer/topics/TopicsWriterTest.java @@ -9,9 +9,9 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.DoubleArray; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -26,8 +26,8 @@ */ public class TopicsWriterTest { - @BeforeClass - @AfterClass + @BeforeAll + @AfterAll public static void setup() { FileUtils.deleteQuietly(new File("src/test/resources/output")); } diff --git a/jcore-txt-consumer/component.meta b/jcore-txt-consumer/component.meta index 6cf58e0d6..818930805 100644 --- a/jcore-txt-consumer/component.meta +++ b/jcore-txt-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-txt-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe TXT Consumer" } diff --git a/jcore-txt-consumer/pom.xml b/jcore-txt-consumer/pom.xml index bf6de2d14..6c2e384cf 100644 --- a/jcore-txt-consumer/pom.xml +++ b/jcore-txt-consumer/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-txt-consumer JCoRe TXT Consumer @@ -44,8 +44,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-txt-consumer/src/main/resources/de/julielab/jcore/consumer/txt/desc/jcore-txt-consumer.xml 
b/jcore-txt-consumer/src/main/resources/de/julielab/jcore/consumer/txt/desc/jcore-txt-consumer.xml index 556fbbc5e..7146fd7f7 100644 --- a/jcore-txt-consumer/src/main/resources/de/julielab/jcore/consumer/txt/desc/jcore-txt-consumer.xml +++ b/jcore-txt-consumer/src/main/resources/de/julielab/jcore/consumer/txt/desc/jcore-txt-consumer.xml @@ -6,7 +6,7 @@ JCoRe Text Consumer Stores the CAS document text in files. Either in tokenized sentences plus optional PoS tags or just the original document text. The text files can also be stored in GZIP format or batch-wise in ZIP archives. - 2.5.1-SNAPSHOT + 2.6.0 de.julielab.jcore.consumer.txt diff --git a/jcore-txt-consumer/src/test/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumerTest.java b/jcore-txt-consumer/src/test/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumerTest.java index 82b76eef9..29197eac6 100644 --- a/jcore-txt-consumer/src/test/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumerTest.java +++ b/jcore-txt-consumer/src/test/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumerTest.java @@ -20,7 +20,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.*; import java.nio.charset.StandardCharsets; @@ -32,7 +32,7 @@ import static de.julielab.jcore.consumer.txt.SentenceTokenConsumer.*; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class SentenceTokenConsumerTest { /** * just tests if there is an error with an empty CAS diff --git a/jcore-types/pom.xml b/jcore-types/pom.xml index 6abd932d0..b2a52a8f8 100644 --- a/jcore-types/pom.xml +++ b/jcore-types/pom.xml @@ -8,7 +8,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -36,6 +36,9 @@ src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml + + 
src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml + diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml index 57770ed2a..fd003f5c2 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml @@ -10,7 +10,7 @@ base document and annotations that have been previously created and stored in separate tables. This is part of the Jena Document Information System, JeDIS. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml index 7b49b49c9..bb457ab06 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml @@ -4,7 +4,7 @@ This is a type system for usage with a CAS multiplier. It should not be included into the jcore-all-types type system. This particular type system holds a single URI that points to the resource that should be split into CASes by the multiplier. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany @@ -19,5 +19,18 @@ + + de.julielab.jcore.types.casmultiplier.MultiplierConfigParameters + A list of strings in properties format to specify parameters and their values. The format is <key>=<value>. May be used to transfer configuration properties from the multiplier reader to the multiplier. + Normally in UIMA, the multiplier would just have the configuration parameters itself. 
However, it can be confusing that the basic reader - without a successive multiplier - has some parameters that the multiplier reader does not exhibit because they must be set on the multiplier. Using this annotation, parameter settings can be sent to the multiplier which then does not need further configuration on its own. + uima.tcas.Annotation + + + parameters + An array of string holding key - value pairs in the format <key>=<value>. + uima.cas.StringArray + + + \ No newline at end of file diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-ace-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-ace-types.xml index e3ba78bce..39357e325 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-ace-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-ace-types.xml @@ -2,7 +2,7 @@ JCoRe ACE Types The jcore-ace-types TS represents the complete ACE Annotation in CAS format. -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml index a95f22bfa..dc727cdff 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml @@ -2,7 +2,7 @@ JCoRe Document Meta Types The types of this type system reflect meta data about documents for rather specific use cases. - 2.5.1-SNAPSHOT + 2.6.0 @@ -25,6 +25,11 @@ This feature is used by the DBCheckpointAE. It allows components in the pipeline to prevent a document to be marked as being finished with processing. This can be used to indicate issues with specific documents which will require reprocessing. 
uima.cas.Boolean + + isDocumentHashUnchanged + To be set by the XML DB reader/multiplier and to be used (among others) by the XMIDBWriter. Used to prohibit that mirror subsets reset to 'not processed' for this document when there was no change in the document text. That allows to update the base document without indicating that a reprocessing is required. This is useful when the document is updated by the distributor (e.g. PubMed) but the text contents have not changed. + uima.cas.Boolean + diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-dta-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-dta-types.xml index 387aafda0..5abb1cc26 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-dta-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-dta-types.xml @@ -2,7 +2,7 @@ jcore-dta-types - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-evaluation-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-evaluation-types.xml index bb860b3ec..285c5600b 100755 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-evaluation-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-evaluation-types.xml @@ -2,7 +2,7 @@ JCoRe Evaluation Types This type system is an extension of the JCoRe type system to cover evaluation Annotations like missing or additional annotations for evaluation purposes. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mantra-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mantra-types.xml index 9d5ffa276..fa4afe671 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mantra-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mantra-types.xml @@ -2,7 +2,7 @@ JCoRe Mantra Types The type system contains types for working with documents in the context of the MANTRA project. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-medical-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-medical-types.xml index 3fca73bdb..6f181906a 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-medical-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-medical-types.xml @@ -2,7 +2,7 @@ jcore-medical-types - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mmax-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mmax-types.xml index 192bf4dc6..06ef422dd 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mmax-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mmax-types.xml @@ -2,7 +2,7 @@ JCoRe MMAX Types The type system contains types for the import of MMAX2 annotations. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-muc7-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-muc7-types.xml index 39575dacb..2bed1a349 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-muc7-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-muc7-types.xml @@ -2,7 +2,7 @@ JCoRe MUC7 Type System This type system contains types covering annotations for the MUC7 data. -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-ace-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-ace-types.xml index f2db24b6f..2d834a6df 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-ace-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-ace-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics ACE Types The type system contains ACE types of the ACE taxonomy. -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-bootstrep-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-bootstrep-types.xml index 2ae6f1df3..37a80cc11 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-bootstrep-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-bootstrep-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics BOOTStrep Types The type system is an extension of the JCoRe core type system for types required in the context of the BOOTStrep project. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-mention-extension-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-mention-extension-types.xml index 1b67565c2..377618374 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-mention-extension-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-mention-extension-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Mention Types Extension JCoRe type extensions to the JCoRe Semantics Mention types. Required for some processing or representation, these types do not extend the actual semantics of the core type system. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-stemnet-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-stemnet-types.xml index c213f3f08..6238a9640 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-stemnet-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-stemnet-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics StemNet Typs The type system contains types of the StemNet project. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-wikipedia-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-wikipedia-types.xml index 48c8e3b9e..1bea4abdf 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-wikipedia-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-wikipedia-types.xml @@ -2,7 +2,7 @@ JCoRe Wikipedia Types The type system contains types for the annotation of meta information of Wikipedia pages. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-affect-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-affect-types.xml index 10d3a8bb7..92a6adc0e 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-affect-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-affect-types.xml @@ -2,7 +2,7 @@ jcore-affect-types - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-all-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-all-types.xml index fedf2eec7..6922c15db 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-all-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-all-types.xml @@ -2,7 +2,7 @@ JCoRe All Types This is just a convenience file, assembling all JCoRe types - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-basic-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-basic-types.xml index 00003147c..8cfc25831 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-basic-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-basic-types.xml @@ -2,7 +2,7 @@ JCoRe Basic Types The 
type system contains the basic annotation types. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml new file mode 100644 index 000000000..bc335c293 --- /dev/null +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml @@ -0,0 +1,28 @@ + + + JCoRe CAS Flow Types + This is a type system to facilitate the routing of CASes through AggregateAnalysisEngines via + FlowControllers. The types herein serve to indicate which components should be visited for the CAS + carrying annotations of this type. + + 2.6.0 + JULIE Lab Jena, Germany + + + de.julielab.jcore.types.casflow.ToVisit + Contains a list of delegate analysis engine names that the CAS, having this annotation, should + visit. Other components will be skipped. The names must the delegate keys specified in the aggregate + descriptor. + + uima.tcas.Annotation + + + delegateKeys + The keys of the delegates to visit. The keys are the names given to the delegate analysis engines in the aggregate. An empty or null array indicates that no component should be visited. + uima.cas.StringArray + uima.cas.String + + + + + \ No newline at end of file diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml index d3190b9e5..e8897be62 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml @@ -2,7 +2,7 @@ JCoRe Discourse Types Discourse types such as coreference relations. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany @@ -34,7 +34,7 @@ de.julielab.jcore.types.CorefExpression - A 'coreference expression' might by any span of text that is part of a set of text expressions refering to the same entity. Speaking in the anaphora framework, coreference expressions are either anaphors - mostly pronouns and definite noun phrases - or their antecedents - the original, first mention of an entity or already an anaphoric expression referring itself to a reference to the original entity mention. + A 'coreference expression' might be any span of text that is part of a set of text expressions referring to the same entity. Speaking in the anaphora framework, co-reference expressions are either anaphors - mostly pronouns and definite noun phrases - or their antecedents - the original, first mention of an entity or already an anaphoric expression referring itself to a reference to the original entity mention. de.julielab.jcore.types.Annotation diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-clinicaltrial-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-clinicaltrial-types.xml index ce908039f..24abc85ec 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-clinicaltrial-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-clinicaltrial-types.xml @@ -5,7 +5,7 @@ document meta information (bibliographical and content information), especially for PubMed abstracts. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-pubmed-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-pubmed-types.xml index 6363ece45..64bbd5e32 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-pubmed-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-pubmed-types.xml @@ -5,7 +5,7 @@ document meta information (bibliographical and content information), especially for PubMed abstracts. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-types.xml index 5a23252bd..f0324e628 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-types.xml @@ -2,7 +2,7 @@ JCoRe Document Meta Types The type system contains types for the annotation of document meta information (bibliographical and content information). - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-clinicaltrial-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-clinicaltrial-types.xml index 082c8e775..4422696fe 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-clinicaltrial-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-clinicaltrial-types.xml @@ -4,7 +4,7 @@ This type system contains document structure types specific to the clinical trails XML format as retrieved from https://clinicaltrials.gov/. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-pubmed-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-pubmed-types.xml index 57ea9b281..d13edaf0d 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-pubmed-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-pubmed-types.xml @@ -4,7 +4,7 @@ This type system contains document structure types specific to PubMed or MEDLINE, e.g. detailed descriptions of structured abstracts. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-types.xml index 633edd187..4e8fcf501 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-types.xml @@ -2,7 +2,7 @@ JCoRe Document Structure Types The type system contains the types for the annotation of document sutructure, e.g. titles, abstract text, captions etc. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-morpho-syntax-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-morpho-syntax-types.xml index 1c65aef6f..72adcfed0 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-morpho-syntax-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-morpho-syntax-types.xml @@ -4,7 +4,7 @@ The type system contains types for the annotation of morpho-syntactic and syntactic analysis (constituncy-based and dependecy-based parsing) results. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-biology-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-biology-types.xml index 92f89b23b..e7bdf766c 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-biology-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-biology-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Biology Types The type system contains types of the biomedical domain. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-concept-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-concept-types.xml index edea7e8ee..7bfb0c3ad 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-concept-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-concept-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Concept Types The type system contains core semantic types definitions such as entity, relation and event. -2.5.1-SNAPSHOT +2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-mention-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-mention-types.xml index e2ecd9dd4..64bc7b357 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-mention-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-mention-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Mention Types The type system contains core semantic types definitions such as entity, relation and event. The types in this type system refer to actual text occurrences. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/priorities/jcore-type-priorities.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/priorities/jcore-type-priorities.xml index 1373c4eac..b8ee7c4b1 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/priorities/jcore-type-priorities.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/priorities/jcore-type-priorities.xml @@ -2,7 +2,7 @@ jcore-type-priorities - 2.5.1-SNAPSHOT + 2.6.0 de.julielab.jcore.types.Title diff --git a/jcore-utilities/pom.xml b/jcore-utilities/pom.xml index 6395a3b73..77fa5f3b5 100644 --- a/jcore-utilities/pom.xml +++ b/jcore-utilities/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -19,8 +19,8 @@ slf4j-api - junit - junit + org.junit.jupiter + junit-jupiter-engine org.apache.commons diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReAnnotationIndexMerger.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReAnnotationIndexMerger.java index 2923b7bd7..d016b5eb7 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReAnnotationIndexMerger.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReAnnotationIndexMerger.java @@ -120,7 +120,6 @@ public boolean incrementAnnotation() { } protected boolean moveIterator(boolean initialize) { - int minBegin = Integer.MAX_VALUE; if (!initialize) { annotationIterators.get(currentIndex).moveToNext(); firstToken = false; @@ -135,6 +134,9 @@ protected boolean moveIterator(boolean initialize) { return true; } } + + // find the iterator with the lowest-begin-offset annotation and set currentIndex accordingly + int minBegin = Integer.MAX_VALUE; for (int i = 0; i < annotationIterators.size(); i++) { FSIterator it = annotationIterators.get(i); if (initialize) diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReAnnotationTools.java 
b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReAnnotationTools.java index 04eb42a78..a580bcd94 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReAnnotationTools.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReAnnotationTools.java @@ -18,6 +18,7 @@ **/ package de.julielab.jcore.utility; +import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex; import org.apache.commons.lang3.Range; import org.apache.uima.cas.FSIterator; import org.apache.uima.jcas.JCas; @@ -26,10 +27,9 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; // import de.julielab.jcore.types.Annotation; @@ -588,4 +588,35 @@ public static T getLastOverlappingAnnotation(JCas aJCas, return null; } + /** + *

Determines and returns the annotations in an annotation sequence (e.g. tokens) whose offsets are between the two input annotations (e.g. some entities we want to get their token distance of).

+ * @param a1 An annotation. + * @param a2 Another annotation. + * @param underlyingAnnotationIndex An overlap index to efficiently find the border the underlying annotations of the sequence to return. + * @return The annotations from underlyingAnnotationSequence between a1 and a2, excluding annotations overlapping either. + */ + public static List getAnnotationsBetween(Annotation a1, Annotation a2, JCoReOverlapAnnotationIndex underlyingAnnotationIndex) { + List tokensInBetween = Collections.emptyList(); + List underlyingAnnotationSequence = underlyingAnnotationIndex.getBeginIndex(); + Annotation firstAnnotation = a1.getBegin() <= a2.getBegin() ? a1 : a2; + Annotation secondAnnotation = a1.getBegin() <= a2.getBegin() ? a2 : a1; + final Optional firstSequenceAnnotation = underlyingAnnotationIndex.search(firstAnnotation).stream().findFirst(); + final Optional secondSequenceAnnotation = underlyingAnnotationIndex.search(secondAnnotation).stream().findFirst(); + if (firstSequenceAnnotation.isPresent() && secondSequenceAnnotation.isPresent()) { + int firstSequenceAnnotationIndex = Collections.binarySearch(underlyingAnnotationSequence, firstSequenceAnnotation.get(), Comparator.comparingInt(Annotation::getBegin)); + int secondSequenceAnnotationIndex = Collections.binarySearch(underlyingAnnotationSequence, secondSequenceAnnotation.get(), Comparator.comparingInt(Annotation::getBegin)); + if (firstSequenceAnnotationIndex != -1 && secondSequenceAnnotationIndex != -1) { + // move the first and second token outside of the spans of the input annotations + while(firstSequenceAnnotationIndex < secondSequenceAnnotationIndex && underlyingAnnotationSequence.get(firstSequenceAnnotationIndex).getBegin() < firstAnnotation.getEnd()) + ++firstSequenceAnnotationIndex; + while(secondSequenceAnnotationIndex > firstSequenceAnnotationIndex && underlyingAnnotationSequence.get(secondSequenceAnnotationIndex).getEnd() > secondAnnotation.getBegin()) + --secondSequenceAnnotationIndex; + if 
(firstSequenceAnnotationIndex != secondSequenceAnnotationIndex) { + tokensInBetween = IntStream.rangeClosed(firstSequenceAnnotationIndex, secondSequenceAnnotationIndex).mapToObj(underlyingAnnotationSequence::get).collect(Collectors.toList()); + } + } + } + return tokensInBetween; + } + } diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java index a3e4bd532..9ceb5f84e 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java @@ -1,11 +1,15 @@ package de.julielab.jcore.utility; +import de.julielab.jcore.types.InternalReference; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; import java.util.Map.Entry; import java.util.NavigableMap; import java.util.Set; import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * This class is helpful when some parts of the CAS document text should be cut @@ -13,144 +17,259 @@ * document text that results from cutting out said text passages. It offers a * method to return the actual text string and a method to map the character * offsets of the compacted string to the original CAS document text. - * - * @author faessler * + * @author faessler */ public class JCoReCondensedDocumentText { - private NavigableMap condensedPos2SumCutMap; - private NavigableMap originalPos2SumCutMap; - private String condensedText; - private JCas cas; - - public JCas getCas() { - return cas; - } - - /** - *

- * Cuts away the covered text of annotations of a type in cutAwayTypes - * from the cas document text. If cutAwayTypes is null or - * empty, this class' methods will return the original CAS data. - *

- * - * @param cas - * The CAS for which the document text should be cut. - * @param cutAwayTypes - * The types for cutting. May be null. - * @throws ClassNotFoundException - * If cutAwayTypes contains non-existing type names. - */ - public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { - this.cas = cas; - buildMap(cas, cutAwayTypes); - } - - /** - *

- * Creates a map that maps those positions of the small-cut text that correspond - * to an intermediate next position after a cut-away annotation in the original - * text to the sum of ranges covered by cut-away annotations up to the original - * offset. - *

- *

- * If cutAwayTypes is empty, no work will be done and the methods of - * this class we return the original text and offets of the CAS. - *

- * - * @param cas - * The CAS for create a cut-away document text for. - * @param cutAwayTypes - * The qualified type names of the annotations whose covered text - * should be cut away. - * @throws ClassNotFoundException - * If cutAwayTypes contains type identifiers to - * non-existing types. - */ - public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { - if (cutAwayTypes == null || cutAwayTypes.isEmpty()) - return; - StringBuilder sb = new StringBuilder(); - condensedPos2SumCutMap = new TreeMap<>(); - condensedPos2SumCutMap.put(0, 0); - originalPos2SumCutMap = new TreeMap<>(); - originalPos2SumCutMap.put(0, 0); - JCoReAnnotationIndexMerger merger = new JCoReAnnotationIndexMerger(cutAwayTypes, true, null, cas); - int cutSum = 0; - int lastBegin = 0; - int lastEnd = -1; - // For each ignored annotation, there could be following annotations overlapping - // with the first, effectively enlargeing the ignored span. Thus, we iterate - // until we find an ignored annotation the has a positive (not 0) distance to a - // previous one. Then, we store the length of the span of cut-away annotations - // for the largest end of the previous annotations. - while (merger.incrementAnnotation()) { - int end = merger.getCurrentEnd(); - int begin = merger.getCurrentBegin(); - - if (lastEnd > 0 && begin > lastEnd) { - cutSum += lastEnd - lastBegin; - int condensedPosition = lastEnd - cutSum + 1; - condensedPos2SumCutMap.put(condensedPosition, cutSum); - originalPos2SumCutMap.put(lastEnd, cutSum); - lastBegin = begin; - sb.append(cas.getDocumentText().substring(lastEnd, begin)); - } else if (lastEnd < 0) { - lastBegin = begin; - sb.append(cas.getDocumentText().substring(0, begin)); - } - lastEnd = end; - } - // Since we iterate one annotation further than the annotation we store the span - // for, we need to take care of the very last ignored annotation after the loop - // - it has never been handled itself. 
- if (lastEnd > 0) { - cutSum += lastEnd - lastBegin; - int condensedPosition = lastEnd - cutSum + 1; - condensedPos2SumCutMap.put(condensedPosition, cutSum); - originalPos2SumCutMap.put(lastEnd, cutSum); - } - // If lastEnd is still -1 one, we just did not find any of the cut away annotations. Thus, we just copy the whole text. - if (lastEnd == -1) - lastEnd = 0; - if (lastEnd < cas.getDocumentText().length()) - sb.append(cas.getDocumentText().substring(lastEnd, cas.getDocumentText().length())); - condensedText = sb.toString(); - } - - /** - * Given a character offset relative to the condensed document text, this method - * returns the corresponding offset in the original CAS document text. - * - * @param condensedOffset - * The character offset in the condensed document text string. - * @return The character offset relative to the original CAS document text - * associated with condensedOffset. - */ - public int getOriginalOffsetForCondensedOffset(int condensedOffset) { - if (condensedPos2SumCutMap == null) - return condensedOffset; - Entry floorEntry = condensedPos2SumCutMap.floorEntry(condensedOffset); - return condensedOffset + floorEntry.getValue(); - } - - /** - * Given a character offset relative to the original CAS document text, this method - * returns the corresponding offset in the condensed document text. - * - * @param originalOffset - * The character offset in the originalOffset document CAS text string. - * @return The character offset relative to the condensed document text - * associated with originalOffset. - */ - public int getCondensedOffsetForOriginalOffset(int originalOffset) { - if (originalPos2SumCutMap == null) - return originalOffset; - Entry floorEntry = originalPos2SumCutMap.floorEntry(originalOffset); - return originalOffset - floorEntry.getValue(); - } - - public String getCodensedText() { - return condensedText != null ? 
condensedText : cas.getDocumentText(); - } + private NavigableMap condensedPos2SumCutMap; + private NavigableMap originalPos2SumCutMap; + private String condensedText; + private JCas cas; + private Set cutAwayFillCharacters; + private boolean skipInternalReferencesWithLetters; + + public boolean isSkipInternalReferencesWithLetters() { + return skipInternalReferencesWithLetters; + } + + /** + *

+ * Cuts away the covered text of annotations of a type in cutAwayTypes + * from the cas document text. If cutAwayTypes is null or + * empty, this class' methods will return the original CAS data. + *

+ * + * @param cas The CAS for which the document text should be cut. + * @param cutAwayTypes The types for cutting. May be null. + * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. + */ + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { + this(cas, cutAwayTypes, false); + } + + /** + *

+ * Cuts away the covered text of annotations of a type in cutAwayTypes + * from the cas document text. If cutAwayTypes is null or + * empty, this class' methods will return the original CAS data. + *

+ * + * @param cas The CAS for which the document text should be cut. + * @param cutAwayTypes The types for cutting. May be null. + * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. + */ + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, boolean skipInternalReferencesWithLetters) throws ClassNotFoundException { + this(cas, cutAwayTypes, null, skipInternalReferencesWithLetters); + } + + /** + *

+ * Cuts away the covered text of annotations of a type in cutAwayTypes + * from the cas document text. If cutAwayTypes is null or + * empty, this class' methods will return the original CAS data. + *

+ *

The cutAwayFillCharacters set may provide characters that, when being the only character between + * to cut-away annotations, will add to the span of text being cut away. This way, enumerations of references + * (e.g. "4,6,8") can be completely removed, for example.

+ * + * @param cas The CAS for which the document text should be cut. + * @param cutAwayTypes The types for cutting. May be null. + * @param cutAwayFillCharacters Characters that, when being the only separator between two cut away annotations, are also cut away. + * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. + */ + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, Set cutAwayFillCharacters) throws ClassNotFoundException{ + this(cas, cutAwayTypes, cutAwayFillCharacters, false); + } + + /** + *

+ * Cuts away the covered text of annotations of a type in cutAwayTypes + * from the cas document text. If cutAwayTypes is null or + * empty, this class' methods will return the original CAS data. + *

+ *

The cutAwayFillCharacters set may provide characters that, when being the only character between + * to cut-away annotations, will add to the span of text being cut away. This way, enumerations of references + * (e.g. "4,6,8") can be completely removed, for example.

+ * + * @param cas The CAS for which the document text should be cut. + * @param cutAwayTypes The types for cutting. May be null. + * @param cutAwayFillCharacters Characters that, when being the only separator between two cut away annotations, are also cut away. + * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. + */ + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, Set cutAwayFillCharacters, boolean skipInternalReferencesWithLetters) throws ClassNotFoundException { + this.cas = cas; + this.cutAwayFillCharacters = cutAwayFillCharacters; + this.skipInternalReferencesWithLetters = skipInternalReferencesWithLetters; + buildMap(cas, cutAwayTypes); + } + + public JCas getCas() { + return cas; + } + + /** + *

+ * Creates a map that maps those positions of the small-cut text that correspond + * to an intermediate next position after a cut-away annotation in the original + * text to the sum of ranges covered by cut-away annotations up to the original + * offset. + *

+ *

+ * If cutAwayTypes is empty, no work will be done and the methods of + * this class we return the original text and offets of the CAS. + *

+ * + * @param cas The CAS for create a cut-away document text for. + * @param cutAwayTypes The qualified type names of the annotations whose covered text + * should be cut away. + * @throws ClassNotFoundException If cutAwayTypes contains type identifiers to + * non-existing types. + */ + public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { + if (cutAwayTypes == null || cutAwayTypes.isEmpty()) + return; + Pattern letterP = Pattern.compile("[a-zA-Z]"); + StringBuilder sb = new StringBuilder(); + condensedPos2SumCutMap = new TreeMap<>(); + condensedPos2SumCutMap.put(0, 0); + originalPos2SumCutMap = new TreeMap<>(); + originalPos2SumCutMap.put(0, 0); + JCoReAnnotationIndexMerger merger = new JCoReAnnotationIndexMerger(cutAwayTypes, true, null, cas); + int cutSum = 0; + int lastBegin = 0; + int lastEnd = -1; + int lastCutSum = 0; + // For each ignored annotation, there could be following annotations overlapping + // with the first, effectively enlarging the ignored span. Thus, we iterate + // until we find an ignored annotation that has a positive (not 0) distance to a + // previous one. Then, we store the length of the span of cut-away annotations + // for the largest end of the previous annotations. + while (merger.incrementAnnotation()) { + int begin = merger.getCurrentBegin(); + int end = merger.getCurrentEnd(); + + // Only remove InternalReferences without letters. Those are just numbers in + // PMC and often lead to errors because they are not really part of the sentence. Table and figure + // references, on the other hand, are embedded in the text. Rule of thumb: Remove references + // that don't have a letter. 
+ if (skipInternalReferencesWithLetters && (merger.getAnnotation() instanceof InternalReference || merger.getAnnotation() instanceof de.julielab.jcore.types.pubmed.InternalReference)) { + String coveredText = ((Annotation)merger.getAnnotation()).getCoveredText(); + Matcher letterM = letterP.matcher(coveredText); + if (letterM.find()) + continue; + } + + boolean moreThanOneCharacterDistance = begin - lastEnd > 2; + boolean previousCharacterIsCutAwayDelimiter = cutAwayFillCharacters == null || cutAwayFillCharacters.isEmpty() || (begin - lastEnd == 2 && cutAwayFillCharacters.contains(cas.getDocumentText().charAt(begin - 1))); + if (lastEnd > 0 && begin > lastEnd && (previousCharacterIsCutAwayDelimiter || moreThanOneCharacterDistance)) { + // Adapt offsets to remove superfluous white spaces from the condensed text + boolean precedingCharacterIsWS = lastBegin == 0 || Character.isWhitespace(cas.getDocumentText().charAt(lastBegin - 1)); + boolean succeedingCharacterIsWS = lastEnd < cas.getDocumentText().length() && Character.isWhitespace(cas.getDocumentText().charAt(lastEnd)); + boolean extendLastEnd = precedingCharacterIsWS && succeedingCharacterIsWS; + if (extendLastEnd) + ++lastEnd; + if (precedingCharacterIsWS && end >= cas.getDocumentText().length()) + --begin; + // The current cut away annotation begins after the previous cut away annotation, thus there is no + // overlap and we can add the current state to the maps. + cutSum += lastEnd - lastBegin; + int condensedPosition = lastEnd - cutSum; + if (condensedPosition == lastBegin && !extendLastEnd) + ++condensedPosition; + // For original offsets we need to be able to know where the begin and the end of + // the cut away annotation was. 
This is exploited in getCondensedOffsetForOriginalOffset() + originalPos2SumCutMap.put(lastBegin, lastCutSum); + originalPos2SumCutMap.put(lastEnd, cutSum); + lastBegin = begin; + lastCutSum = cutSum; + if (condensedPosition + cutSum >= cas.getDocumentText().length()) + cutSum = cas.getDocumentText().length() -1 - condensedPosition; + condensedPos2SumCutMap.put(condensedPosition, cutSum); + sb.append(cas.getDocumentText(), lastEnd, begin); + } else if (lastEnd < 0) { + // This is the first annotation + if (begin > 0 && end >= cas.getDocumentText().length() && Character.isWhitespace(cas.getDocumentText().charAt(begin - 1))) + // Case: A single cut away annotation right at the end of the document text. + // Then we want to extend the cut away area to the leading whitespace to remove that as well. + --begin; + lastBegin = begin; + sb.append(cas.getDocumentText(), 0, begin); + } + lastEnd = end; + } + // Since we iterate one annotation further than the annotation we store the span + // for, we need to take care of the very last ignored annotation after the loop + // - it has never been handled itself. + if (lastEnd > 0) { + // Adapt offsets to avoid unnecessary white spaces regarding the tail of the document text. + boolean precedingCharacterIsWS = lastBegin < 1 || Character.isWhitespace(cas.getDocumentText().charAt(lastBegin - 1)); + boolean succeedingCharacterIsWS = lastEnd < cas.getDocumentText().length() && Character.isWhitespace(cas.getDocumentText().charAt(lastEnd)); + if (precedingCharacterIsWS && (succeedingCharacterIsWS || lastEnd >= cas.getDocumentText().length())) + ++lastEnd; + cutSum += lastEnd - lastBegin; + int condensedPosition = lastEnd - cutSum; + originalPos2SumCutMap.put(lastBegin, lastCutSum); + originalPos2SumCutMap.put(lastEnd, cutSum); + // Avoid the situation where the computed original position includes the last cut away annotation. + // This can happen when a cut away annotation appears at the very end of the text. 
Then, the cutSum + // accounts for this last annotation at the end of the condensed text which would result in an original + // position _after_ the cut away annotation. + if (condensedPosition + cutSum >= cas.getDocumentText().length()) + cutSum = cas.getDocumentText().length() -1 - condensedPosition; + condensedPos2SumCutMap.put(condensedPosition, cutSum); + } + // If lastEnd is still -1, we just did not find any of the cut away annotations. Thus, we just copy the whole text. + if (lastEnd == -1) + lastEnd = 0; + if (lastEnd < cas.getDocumentText().length()) + sb.append(cas.getDocumentText().substring(lastEnd)); + condensedText = sb.toString(); + } + + + /** + * Given a character offset relative to the condensed document text, this method + * returns the corresponding offset in the original CAS document text. + * + * @param condensedOffset The character offset in the condensed document text string. + * @return The character offset relative to the original CAS document text + * associated with condensedOffset. + */ + public int getOriginalOffsetForCondensedOffset(int condensedOffset) { + if (condensedPos2SumCutMap == null) + return condensedOffset; + Entry floorEntry = condensedPos2SumCutMap.floorEntry(condensedOffset); + return condensedOffset + floorEntry.getValue(); + } + + /** + * Given a character offset relative to the original CAS document text, this method + * returns the corresponding offset in the condensed document text. + * + * @param originalOffset The character offset in the originalOffset document CAS text string. + * @return The character offset relative to the condensed document text + * associated with originalOffset. 
+ */ + public int getCondensedOffsetForOriginalOffset(int originalOffset) { + if (originalPos2SumCutMap == null) + return originalOffset; + Entry floorEntry = originalPos2SumCutMap.floorEntry(originalOffset); + Entry ceilingEntry = originalPos2SumCutMap.ceilingEntry(originalOffset); + // floor entry can never be null because the mapping 0=0 always exists + if (floorEntry != null && ceilingEntry != null) { + // Determine if the original offset is inside or outside of a cut away annotation. + // If the difference of key and value is the same for floor and ceiling, the originalOffset + // is within of a cut away annotation. Otherwise, it is outside a cut away annotation + int floorDiff = floorEntry.getKey() - floorEntry.getValue(); + int ceilingDiff = ceilingEntry.getKey() - ceilingEntry.getValue(); + boolean withinCutAway = floorDiff == ceilingDiff; + if (withinCutAway) + return originalOffset - ceilingEntry.getValue() + (ceilingEntry.getKey() - originalOffset); + } + return originalOffset - floorEntry.getValue(); + } + + public String getCodensedText() { + return condensedText != null ? condensedText : cas.getDocumentText(); + } } diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReTools.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReTools.java index 038321c70..606502b76 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReTools.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReTools.java @@ -259,7 +259,8 @@ public static FSArray copyFSArray(FSArray array) { /** * Creates a new StringArray from the given string elements. - * @param jCas The jCas to associate the new StringArray with. + * + * @param jCas The jCas to associate the new StringArray with. * @param elements The strings to put into the StringArray. * @return The new, filled StringArray. */ @@ -456,7 +457,11 @@ else if (comparison < 0) { * @throws IOException If reading the resource file fails. 
*/ public static InputStream resolveExternalResourceGzipInputStream(DataResource resource) throws IOException { + if (resource == null) + throw new IllegalArgumentException("The passed DataResource is null."); InputStream is = resource.getInputStream(); + if (is == null) + throw new IllegalArgumentException("The resource at " + resource.getUrl() + " could not be read. It does not exist or is not on the ClassPath."); String lcUriString = resource.getUri().toString().toLowerCase(); if (lcUriString.endsWith(".gz") || lcUriString.endsWith(".gzip")) is = new GZIPInputStream(is); diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReCoverIndex.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReCoverIndex.java index 02d192b73..ef6c6588b 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReCoverIndex.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReCoverIndex.java @@ -106,10 +106,8 @@ public void index(JCas jCas, Type type) { * indexed annotations, first {@link #freeze()} the index and then * {@link #search(int, int)} it. * - * @param jCas - * A CAS instance. - * @param type - * The annotation type to index. + * @param annotation + * A UIMA annotation */ public void index(E annotation) { if (frozen) diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndex.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndex.java index ea919ae06..7a44dedee 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndex.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndex.java @@ -1,11 +1,10 @@ -/** - * +/** * Copyright (c) 2017, JULIE Lab. - * All rights reserved. This program and the accompanying materials + * All rights reserved. 
This program and the accompanying materials * are made available under the terms of the BSD-2-Clause License - * - * Author: - * + *

+ * Author: + *

* Description: **/ package de.julielab.jcore.utility.index; @@ -19,7 +18,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.stream.Stream; /** *

@@ -46,145 +44,143 @@ * elements. Those are - in the case a lies in the middle of the index elements * - n/2. *

- * - * @author faessler * - * @param - * The annotation type the index should be over. + * @param The annotation type the index should be over. + * @author faessler */ public class JCoReOverlapAnnotationIndex implements JCoReAnnotationIndex { - private List beginIndex; - private List endIndex; - private boolean frozen; - - public JCoReOverlapAnnotationIndex() { - beginIndex = new ArrayList<>(); - endIndex = new ArrayList<>(); - } - - public JCoReOverlapAnnotationIndex(JCas jcas, int type) { - this(jcas, jcas.getCasType(type)); - } - - public JCoReOverlapAnnotationIndex(JCas jcas, Type type) { - this(); - index(jcas, type); - freeze(); - } - - /** - * Indexes the whole contents of the CAS annotation index of type - * type. For each annotation, the {@link #indexTermGenerator} is - * used to create terms with which the annotation will be associated in the - * index and can be retrieved by a search method. - * - * @param jCas - * A CAS instance. - * @param type - * The annotation type to index. - */ - public void index(JCas jCas, int type) { - index(jCas, jCas.getCasType(type)); - } - - /** - * Indexes the whole contents of the CAS annotation index of type - * type. For each annotation, the {@link #indexTermGenerator} is - * used to create terms with which the annotation will be associated in the - * index and can be retrieved by a search method. - * - * @param jCas - * A CAS instance. - * @param type - * The annotation type to index. 
- */ - @SuppressWarnings("unchecked") - public void index(JCas jCas, Type type) { - FSIterator it = jCas.getAnnotationIndex(type).iterator(); - while (it.hasNext()) { - Annotation annotation = (Annotation) it.next(); - index((E) annotation); - } - } - - public void index(E annotation) { - if (frozen) - throw new IllegalStateException("This index is frozen and cannot except further items."); - beginIndex.add(annotation); - endIndex.add(annotation); - } - - public void freeze() { - frozen = true; - Collections.sort(beginIndex, Comparators.beginOffsetComparator()); - Collections.sort(endIndex, Comparators.endOffsetComparator()); - } - - /** - * Returns all annotation in the index overlapping in any way with a - * (embedded, covering, partial overlappings). The resulting list is either - * sorted by begin or end offset. It is not easily predictable which case it - * is (could be added as a return value if that would be useful in any way). - * - * @param a - * The annotation to retrieve overlapping annotations from the - * index for. - * @return All annotations in the index overlapping a. - */ - public Stream search(T a) { - if (!frozen) - throw new IllegalStateException( - "This index is not frozen and cannot be used yet. Freeze the index before searching."); - if (beginIndex.isEmpty()) - return Stream.empty(); - // The following is rather difficult to understand from the code. The - // idea is the following: - // We search annotations overlapping with a. Thus, we can rule out those - // annotations that end before a or start after a. - // In the next 4 lines, we determine how many annotations can be ruled - // out because they start after a and how many end before a. 
- int begin = a.getBegin(); - int end = a.getEnd(); - int indexBeginAfterEnd = insertionPoint(JCoReTools.binarySearch(beginIndex, an -> an.getBegin(), end)); - int indexEndBeforeBegin = insertionPoint(JCoReTools.binarySearch(endIndex, an -> an.getEnd(), begin)); - - // Depending on which case rules out more annotations - ending before a - // or starting after a - we look at the case that leaves us with the - // fewest annotations. If those were the annotations that started after - // a, then we keep those that start before a ends. Those are than - // filtered for annotations that end before a starts. - if (indexBeginAfterEnd < endIndex.size() - indexEndBeforeBegin) { - List beginBeforeEnd = new ArrayList<>(beginIndex.subList(0, indexBeginAfterEnd)); - ArrayList result = new ArrayList<>(); - for (E e : beginBeforeEnd) { - if (e.getEnd() > begin) - result.add(e); - } - return result.stream(); - } else { - List endAfterBegin = new ArrayList<>(endIndex.subList(indexEndBeforeBegin, endIndex.size())); - ArrayList result = new ArrayList<>(); - for (E e : endAfterBegin) { - if (e.getBegin() < end) - result.add(e); - } - return result.stream(); - } - } - - private int insertionPoint(int i) { - return i < 0 ? -(i + 1) : i; - } - - /** - * Un-freeze the index to allow new elements to be added. 
- */ - public void melt() { - frozen = false; - } - - @Override - public void add(E a) { - index(a); - } + private List beginIndex; + private List endIndex; + private boolean frozen; + + public JCoReOverlapAnnotationIndex() { + beginIndex = new ArrayList<>(); + endIndex = new ArrayList<>(); + } + + public JCoReOverlapAnnotationIndex(JCas jcas, int type) { + this(jcas, jcas.getCasType(type)); + } + + public JCoReOverlapAnnotationIndex(JCas jcas, Type type) { + this(); + index(jcas, type); + freeze(); + } + + public void index(JCas jCas, int type) { + index(jCas, jCas.getCasType(type)); + } + + public void index(JCas jCas, Type type) { + FSIterator it = jCas.getAnnotationIndex(type).iterator(); + while (it.hasNext()) { + Annotation annotation = it.next(); + index((E) annotation); + } + } + + public void index(E annotation) { + if (frozen) + throw new IllegalStateException("This index is frozen and cannot accept further items."); + beginIndex.add(annotation); + endIndex.add(annotation); + } + + public void freeze() { + frozen = true; + Collections.sort(beginIndex, Comparators.beginOffsetComparator()); + Collections.sort(endIndex, Comparators.endOffsetComparator()); + } + + /** + * Returns all annotation in the index overlapping in any way with a + * (embedded, covering, partial overlappings). The resulting list is either + * sorted by begin or end offset. It is not easily predictable which case it + * is (could be added as a return value if that would be useful in any way). + * + * @param a The annotation to retrieve overlapping annotations from the + * index for. + * @return All annotations in the index overlapping a. + */ + public List search(T a) { + if (!frozen) + throw new IllegalStateException( + "This index is not frozen and cannot be used yet. Freeze the index before searching."); + if (beginIndex.isEmpty()) + return Collections.emptyList(); + // The following is rather difficult to understand from the code. 
The + // idea is the following: + // We search annotations overlapping with a. Thus, we can rule out those + // annotations that end before a or start after a. + // In the next 4 lines, we determine how many annotations can be ruled + // out because they start after a and how many end before a. + int begin = a.getBegin(); + int end = a.getEnd(); + int indexBeginAfterEnd = insertionPoint(JCoReTools.binarySearch(beginIndex, an -> an.getBegin(), end)); + int indexEndBeforeBegin = insertionPoint(JCoReTools.binarySearch(endIndex, an -> an.getEnd(), begin)); + + // Depending on which case rules out more annotations - ending before a + // or starting after a - we look at the case that leaves us with the + // fewest annotations. If those were the annotations that started after + // a, then we keep those that start before a ends. Those are than + // filtered for annotations that end before a starts. + if (indexBeginAfterEnd < endIndex.size() - indexEndBeforeBegin) { + List beginBeforeEnd = new ArrayList<>(beginIndex.subList(0, indexBeginAfterEnd)); + List result = new ArrayList<>(); + for (E e : beginBeforeEnd) { + if (e.getEnd() > begin) + result.add(e); + } + return result; + } else { + List endAfterBegin = new ArrayList<>(endIndex.subList(indexEndBeforeBegin, endIndex.size())); + List result = new ArrayList<>(); + for (E e : endAfterBegin) { + if (e.getBegin() < end) + result.add(e); + } + return result; + } + } + + private int insertionPoint(int i) { + return i < 0 ? -(i + 1) : i; + } + + /** + * Un-freeze the index to allow new elements to be added. + */ + public void melt() { + frozen = false; + } + + @Override + public void add(E a) { + index(a); + } + + /** + *

Returns the internal list where the indexed annotations are sorted by begin offset. External changes to + * this list might break the index.

+ * + * @return The indexed annotations sorted bei their begin offset. + */ + public List getBeginIndex() { + return beginIndex; + } + + /** + *

Returns the internal list where the indexed annotations are sorted by end offset. External changes to + * this list might break the index.

+ * + * @return The indexed annotations sorted bei their end offset. + */ + public List getEndIndex() { + return endIndex; + } + + public boolean isFrozen() { + return frozen; + } } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReAnnotationToolsTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReAnnotationToolsTest.java index 42a374b7d..37e3ffbff 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReAnnotationToolsTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReAnnotationToolsTest.java @@ -18,7 +18,7 @@ package de.julielab.jcore.utility; import de.julielab.jcore.types.*; -import junit.framework.TestCase; +import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex; import org.apache.uima.UIMAFramework; import org.apache.uima.cas.CAS; import org.apache.uima.cas.FSIterator; @@ -28,27 +28,31 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; import org.apache.uima.util.XmlCasDeserializer; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.FileInputStream; -import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; import java.util.List; +import static org.junit.jupiter.api.Assertions.*; + // import de.julielab.jcore.types.Annotation; -public class JCoReAnnotationToolsTest extends TestCase { +public class JCoReAnnotationToolsTest { /** * Logger for this class */ private static final Logger LOG = LoggerFactory.getLogger(JCoReAnnotationToolsTest.class); - JCas jcas; - public final String DESC_TEST_ANALYSIS_ENGINE = "src/test/resources/AETestDescriptor.xml"; + static JCas jcas; + public final static String DESC_TEST_ANALYSIS_ENGINE = "src/test/resources/AETestDescriptor.xml"; - protected void setUp() throws Exception { + @BeforeAll + protected static void setUp() throws Exception { // get 
a CAS/JCas CAS cas = CasCreationUtils.createCas(UIMAFramework.getXMLParser().parseAnalysisEngineDescription( @@ -78,9 +82,8 @@ protected void setUp() throws Exception { e4.addToIndexes(); } - // TODO only Exception werfen - public void testGetAnnotationAtOffset() throws SecurityException, IllegalArgumentException, ClassNotFoundException, - NoSuchMethodException, InstantiationException, IllegalAccessException, InvocationTargetException { + @Test + public void testGetAnnotationAtOffset() throws Exception { LOG.debug("testGetAnnotationAtOffset() - testing getAnnotationAtOffset(..)"); Annotation entity = new Annotation(jcas); @@ -94,10 +97,8 @@ public void testGetAnnotationAtOffset() throws SecurityException, IllegalArgumen assertTrue(anno == null); } - // TODO only Exception werfen - public void testGetOverlappingAnnotation() throws SecurityException, IllegalArgumentException, - ClassNotFoundException, NoSuchMethodException, InstantiationException, IllegalAccessException, - InvocationTargetException { + @Test + public void testGetOverlappingAnnotation() throws Exception { LOG.debug("testGetOverlappingAnnotation() - testing getOverlappingAnnotation(..)"); Annotation entity = new Annotation(jcas); @@ -119,10 +120,8 @@ public void testGetOverlappingAnnotation() throws SecurityException, IllegalArgu assertTrue((anno != null) && (anno instanceof Annotation)); } - // TODO only Exception werfen - public void testGetAnnotationByClassName() throws SecurityException, IllegalArgumentException, - ClassNotFoundException, NoSuchMethodException, InstantiationException, IllegalAccessException, - InvocationTargetException { + @Test + public void testGetAnnotationByClassName() throws Exception { LOG.debug("testGetAnnotationByClassName() - testing getAnnotationObject(..)"); Annotation entity = new Annotation(jcas); @@ -130,6 +129,7 @@ public void testGetAnnotationByClassName() throws SecurityException, IllegalArgu assertTrue(anno instanceof Annotation); } + @Test public void 
testGetPartiallyOverlappingAnnotationOtherType() throws Exception { JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-all-types"); jcas.setDocumentText("wort"); @@ -242,7 +242,7 @@ public void testIncludedAnnotations() throws Exception { List includedAnnotations = JCoReAnnotationTools.getIncludedAnnotations(jcas, em, Token.class); - assertEquals("Wrong amount of included tokens returned", 4, includedAnnotations.size()); + assertEquals(4, includedAnnotations.size(), "Wrong amount of included tokens returned"); for (int i = 0; i < includedAnnotations.size(); i++) { Token includedToken = includedAnnotations.get(i); @@ -472,4 +472,31 @@ public void testGetLastOverlappingAnnotation() throws Exception { Token result = JCoReAnnotationTools.getLastOverlappingAnnotation(jcas, em, Token.class); assertEquals(t4, result); } + + @Test + public void testGetAnnotationsBetween() throws Exception{ + final JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); + // create some token sequence; omit white spaces for simplicity + List tokenList = new ArrayList<>(); + JCoReOverlapAnnotationIndex tokenIndex = new JCoReOverlapAnnotationIndex<>(); + for (int i = 0; i < 100; i++) { + final Token token = new Token(jcas, i * 5, i * 5 + 5); + tokenList.add(token); + tokenIndex.index(token); + } + tokenIndex.freeze(); + final List between1 = JCoReAnnotationTools.getAnnotationsBetween(new Annotation(jcas, 0, 2), new Annotation(jcas, 497, 500), tokenIndex); + assertEquals(98, between1.size()); + // the same setup as above but with switched annotations + final List between2 = JCoReAnnotationTools.getAnnotationsBetween(new Annotation(jcas, 497, 500), new Annotation(jcas, 0, 2), tokenIndex); + assertEquals(98, between2.size()); + // the input annotations overlap, there should be no output + final List between3 = JCoReAnnotationTools.getAnnotationsBetween(new Annotation(jcas, 1, 10), new Annotation(jcas, 0, 2), tokenIndex); + assertEquals(0, 
between3.size()); + final List between4 = JCoReAnnotationTools.getAnnotationsBetween(new Annotation(jcas, 255, 260), new Annotation(jcas, 235, 240), tokenIndex); + assertEquals(3, between4.size()); + // the annotations are out of the token span + final List between5 = JCoReAnnotationTools.getAnnotationsBetween(new Annotation(jcas, 1000, 1005), new Annotation(jcas, 600, 6005), tokenIndex); + assertEquals(0, between5.size()); + } } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java index 445b234e3..da51e1c59 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java @@ -1,14 +1,16 @@ package de.julielab.jcore.utility; +import de.julielab.jcore.types.Annotation; import de.julielab.jcore.types.InternalReference; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.HashSet; +import java.util.Set; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class JCoReCondensedDocumentTextTest { @Test @@ -28,7 +30,7 @@ public void testReduce() throws Exception { assertEquals(13, condensedText.getOriginalOffsetForCondensedOffset(13)); assertEquals(15, condensedText.getOriginalOffsetForCondensedOffset(14)); assertEquals(30, condensedText.getOriginalOffsetForCondensedOffset(29)); - + assertEquals(0, condensedText.getCondensedOffsetForOriginalOffset(0)); assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(13)); assertEquals(14, condensedText.getCondensedOffsetForOriginalOffset(15)); @@ -52,11 +54,188 @@ public void testReduce2() throws Exception { assertEquals(13, 
condensedText.getOriginalOffsetForCondensedOffset(13)); assertEquals(15, condensedText.getOriginalOffsetForCondensedOffset(14)); assertEquals(31, condensedText.getOriginalOffsetForCondensedOffset(29)); - + assertEquals(0, condensedText.getCondensedOffsetForOriginalOffset(0)); assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(13)); assertEquals(14, condensedText.getCondensedOffsetForOriginalOffset(15)); assertEquals(28, condensedText.getCondensedOffsetForOriginalOffset(30)); assertEquals(29, condensedText.getCondensedOffsetForOriginalOffset(31)); } + + @Test + public void testReduce3() throws Exception { + // Here we also add commas as cut away characters, offering the possibility to remove enumerations of + // references completely. + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("This sentence has multiple references.2,5;42 This is a second sentence.7,8"); + InternalReference ref1 = new InternalReference(jcas, 38, 39); + ref1.addToIndexes(); + InternalReference ref2 = new InternalReference(jcas, 40, 41); + ref2.addToIndexes(); + InternalReference ref3 = new InternalReference(jcas, 42, 44); + ref3.addToIndexes(); + InternalReference ref4 = new InternalReference(jcas, 71, 72); + ref4.addToIndexes(); + InternalReference ref5 = new InternalReference(jcas, 73, 74); + ref5.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName())), Set.of(',', ';')); + assertEquals("This sentence has multiple references. 
This is a second sentence.", condensedText.getCodensedText()); + } + + @Test + public void testReduce4() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("This sentence\n1\nhas references."); + InternalReference ref1 = new InternalReference(jcas, 14, 15); + ref1.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName()))); + assertEquals("This sentence\nhas references.", condensedText.getCodensedText()); + assertEquals(0, condensedText.getOriginalOffsetForCondensedOffset(0)); + assertEquals(16, condensedText.getOriginalOffsetForCondensedOffset(14)); + } + + @Test + public void testReduce5() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Leptin is an adipose-derived protein secreted by adipocytes and is expressed in adipose tissue.\n" + + "1\n" + + "It has the role of being a key regulator of several physiological pathways including body weight and regulation of food intake, inflammation, endocrine function, energy homeostasis, bone metabolism and immunity.\n" + + "2\n" + + "3\n" + + "Results from various studies indicate that leptin may play a significant role in bone physiology, independent of the central nervous system.\n"); + InternalReference ref1 = new InternalReference(jcas, 96, 97); + ref1.addToIndexes(); + InternalReference ref2 = new InternalReference(jcas, 310, 311); + ref2.addToIndexes(); + InternalReference ref3 = new InternalReference(jcas, 312, 313); + ref3.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName()))); + assertEquals("Leptin 
is an adipose-derived protein secreted by adipocytes and is expressed in adipose tissue.\n" + + "It has the role of being a key regulator of several physiological pathways including body weight and regulation of food intake, inflammation, endocrine function, energy homeostasis, bone metabolism and immunity.\n" + + "Results from various studies indicate that leptin may play a significant role in bone physiology, independent of the central nervous system.\n", condensedText.getCodensedText()); + assertEquals(98, condensedText.getOriginalOffsetForCondensedOffset(96)); + assertEquals(314, condensedText.getOriginalOffsetForCondensedOffset(308)); + } + + @Test + public void testReduce6() throws Exception { + // Test the option to skip internal references that have letters from omission from the condensed text. + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Andreeva et al.19 and Xiao et al.20 studied the way of binding of a myosin head to an actin filament by using tryptic digestion of myofibrils and measuring optical polarization and dichroism. 
They concluded that in the rigor rabbit psoas muscle each myosin head binds to two actin monomers in a thin filament20, suggesting the possibility that the myosin head may first bind to one and then to two monomers in the actin filament19.\n" + + "Figure 2 shows an example of possible mechanism of how such binding change occurs."); + InternalReference ref1 = new InternalReference(jcas, 15, 17); + ref1.addToIndexes(); + InternalReference ref2 = new InternalReference(jcas, 33, 35); + ref2.addToIndexes(); + InternalReference ref3 = new InternalReference(jcas, 308, 310); + ref3.addToIndexes(); + InternalReference ref4 = new InternalReference(jcas, 428, 430); + ref4.addToIndexes(); + InternalReference ref5 = new InternalReference(jcas, 432, 440); + ref5.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName())), true); + assertEquals("Andreeva et al. and Xiao et al. studied the way of binding of a myosin head to an actin filament by using tryptic digestion of myofibrils and measuring optical polarization and dichroism. They concluded that in the rigor rabbit psoas muscle each myosin head binds to two actin monomers in a thin filament, suggesting the possibility that the myosin head may first bind to one and then to two monomers in the actin filament.\n" + + "Figure 2 shows an example of possible mechanism of how such binding change occurs.", condensedText.getCodensedText()); + } + + + @Test + public void testCondensedOffsetsWithinCutawayAnnotations() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Not cut away 1. Cut away 1. Not cut away 2. Cut away 2. 
Not cut away 3."); + Annotation cutAwayAnnotation = new Annotation(jcas, 16, 27); + cutAwayAnnotation.addToIndexes(); + Annotation cutAwayAnnotation2 = new Annotation(jcas, 44, 55); + cutAwayAnnotation2.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away 1. Not cut away 2. Not cut away 3.", condensedText.getCodensedText()); + assertEquals(10, condensedText.getCondensedOffsetForOriginalOffset(10)); + assertEquals(15, condensedText.getCondensedOffsetForOriginalOffset(15)); + assertEquals(16, condensedText.getCondensedOffsetForOriginalOffset(16)); + assertEquals(16, condensedText.getCondensedOffsetForOriginalOffset(17)); + assertEquals(16, condensedText.getCondensedOffsetForOriginalOffset(27)); + assertEquals(19, condensedText.getCondensedOffsetForOriginalOffset(31)); + } + + @Test + public void testCutAwayAtBeginning() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Cut away. Not cut away."); + Annotation cutAwayAnnotation = new Annotation(jcas, 0, 9); + cutAwayAnnotation.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away.", condensedText.getCodensedText()); + assertEquals(0, condensedText.getCondensedOffsetForOriginalOffset(3)); + assertEquals(3, condensedText.getCondensedOffsetForOriginalOffset(13)); + } + + @Test + public void testCutAwayAtEnd() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Not cut away. 
Cut away."); + Annotation cutAwayAnnotation = new Annotation(jcas, 14, 23); + cutAwayAnnotation.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away.", condensedText.getCodensedText()); + assertEquals(10, condensedText.getCondensedOffsetForOriginalOffset(10)); + assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(16)); + assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(23)); + } + + @Test + public void testEmbeddedCutAway() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Not cut away. Cut away. Not cut away."); + Annotation cutAwayAnnotation = new Annotation(jcas, 14, 23); + cutAwayAnnotation.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away. Not cut away.", condensedText.getCodensedText()); + assertEquals(10, condensedText.getCondensedOffsetForOriginalOffset(10)); + assertEquals(14, condensedText.getCondensedOffsetForOriginalOffset(16)); + assertEquals(14, condensedText.getCondensedOffsetForOriginalOffset(23)); + assertEquals(15, condensedText.getCondensedOffsetForOriginalOffset(25)); + } + + @Test + public void testEnclosingCutAway() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Cut away. Not cut away. 
Cut away."); + Annotation cutAwayAnnotation = new Annotation(jcas, 0, 9); + cutAwayAnnotation.addToIndexes(); + Annotation cutAwayAnnotation2 = new Annotation(jcas, 24, 33); + cutAwayAnnotation2.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away.", condensedText.getCodensedText()); + assertEquals(0, condensedText.getCondensedOffsetForOriginalOffset(10)); + assertEquals(3, condensedText.getCondensedOffsetForOriginalOffset(13)); + assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(27)); + assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(33)); + } } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFSListIteratorTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFSListIteratorTest.java index bb11d9beb..1ebca1c68 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFSListIteratorTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFSListIteratorTest.java @@ -13,14 +13,14 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.NoSuchElementException; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class JCoReFSListIteratorTest { diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFeaturePathTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFeaturePathTest.java index 8983aa858..bc01ec660 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFeaturePathTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFeaturePathTest.java @@ -19,14 +19,14 @@ import 
org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.cas.StringArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class JCoReFeaturePathTest { @Test @@ -275,12 +275,12 @@ public void testReplacePrimitiveValue() throws Exception { assertEquals("originalValue", cm.getTextualRepresentation()); assertEquals("originalValue", fp.getValueAsString(cm)); - assertEquals("replacementValue", fp.getValueAsString(cm, true)); + assertEquals( fp.getValueAsString(cm, true), "replacementValue"); assertEquals("replacementValue", fp.getValueAsString(cm)); assertEquals("replacementValue", cm.getTextualRepresentation()); // doing a replacement again should have no effect - assertEquals("replacementValue", fp.getValueAsString(cm, true)); + assertEquals( fp.getValueAsString(cm, true), "replacementValue"); assertEquals("replacementValue", fp.getValueAsString(cm)); } @@ -303,11 +303,11 @@ public void testReplaceNotMappedPrimitiveValue() throws Exception { assertEquals("originalValue", cm.getTextualRepresentation()); assertEquals("originalValue", fp.getValueAsString(cm)); - assertEquals("replacementValue", fp.getValueAsString(cm, true)); + assertEquals( fp.getValueAsString(cm, true), "replacementValue"); assertEquals("unknownValue", cm2.getTextualRepresentation()); assertEquals("unknownValue", fp.getValueAsString(cm2)); - assertEquals("not-mapped", fp.getValueAsString(cm2, true)); + assertEquals( fp.getValueAsString(cm2, true), "not-mapped"); } @Test @@ -328,7 +328,7 @@ public void testReplaceNotMappedPrimitiveValueWithNull() throws Exception { assertEquals("unknownValue", cm.getTextualRepresentation()); assertEquals("unknownValue", fp.getValueAsString(cm)); assertEquals(null, fp.getValueAsString(cm, true)); - assertNotSame("null", 
fp.getValueAsString(cm, true)); + assertNotSame( fp.getValueAsString(cm, true), "null"); } @Test @@ -350,7 +350,7 @@ public void testReplaceAllArrayElements() throws Exception { fp.initialize("/semanticTypes", replacements); assertEquals("entry1, entry2, entry3", fp.getValueAsString(ocm)); - assertEquals("replacement1, replacement2, replacement3", fp.getValueAsString(ocm, true)); + assertEquals( fp.getValueAsString(ocm, true), "replacement1, replacement2, replacement3"); } @Test @@ -368,7 +368,7 @@ public void testReplaceAllArrayElementsFromFile() throws Exception { fp.initialize("/semanticTypes"); assertEquals("entry1, entry2, entry3", fp.getValueAsString(ocm)); - assertEquals("replacement1, replacement2, replacement3", fp.getValueAsString(ocm, true)); + assertEquals( fp.getValueAsString(ocm, true), "replacement1, replacement2, replacement3"); } @@ -391,7 +391,7 @@ public void testReplaceSingleArrayElement() throws Exception { JCoReFeaturePath fp = new JCoReFeaturePath(); fp.initialize("/semanticTypes[1]", replacements); - assertEquals("replacement2", fp.getValueAsString(ocm, true)); + assertEquals( fp.getValueAsString(ocm, true), "replacement2"); fp.initialize("/semanticTypes"); assertEquals("entry1, replacement2, entry3", fp.getValueAsString(ocm)); @@ -423,7 +423,7 @@ public void testReplaceValueOnDeepFeatureStructure() throws Exception { JCoReFeaturePath fp = new JCoReFeaturePath(); fp.initialize("/resourceEntryList/entryId", replacements); - assertEquals("tid1, tid2", fp.getValueAsString(gene, true)); + assertEquals( fp.getValueAsString(gene, true), "tid1, tid2"); } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReToolsTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReToolsTest.java index 3212f4c77..74619777f 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReToolsTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReToolsTest.java @@ -16,13 +16,13 @@ import 
org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.Collection; import java.util.List; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class JCoReToolsTest { @Test @@ -36,7 +36,7 @@ public void testAddCollectionToFSArray1() throws Exception { Annotation newElement = new Annotation(jCas); Collection newElements = Lists.newArrayList(newElement); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement, joinedArray.get(1)); } @@ -54,7 +54,7 @@ public void testAddCollectionToFSArray2() throws Exception { Annotation newElement4 = new Annotation(jCas, 4, 4); Collection newElements = Lists.newArrayList(newElement1, newElement2, newElement3, newElement4); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements); - assertNotSame("The old FSArray was returned although a new one should have been created", fsArray, joinedArray); + assertNotSame( fsArray, joinedArray, "The old FSArray was returned although a new one should have been created"); assertEquals(newElement1, joinedArray.get(1)); assertEquals(newElement2, joinedArray.get(2)); assertEquals(newElement3, joinedArray.get(3)); @@ -68,7 +68,7 @@ public void testAddCollectionToFSArray3() throws Exception { Annotation newElement = new Annotation(jCas); Collection newElements = Lists.newArrayList(newElement); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements); - assertNotSame("The old FSArray was returned although a new one should have been created", fsArray, joinedArray); + assertNotSame( fsArray, joinedArray, "The old FSArray was 
returned although a new one should have been created"); assertEquals(newElement, joinedArray.get(0)); } @@ -79,7 +79,7 @@ public void testAddCollectionToFSArray4() throws Exception { Annotation newElement = new Annotation(jCas); Collection newElements = Lists.newArrayList(newElement); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement, joinedArray.get(0)); } @@ -93,7 +93,7 @@ public void testAddElementToFSArray1() throws Exception { assertNull(fsArray.get(1)); Annotation newElement = new Annotation(jCas); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElement); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement, joinedArray.get(1)); } @@ -112,23 +112,23 @@ public void testAddElementToFSArray2() throws Exception { List newElements = Lists.newArrayList(newElement1, newElement2, newElement3, newElement4); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements.get(0)); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(2, joinedArray.size()); assertEquals(newElement1, joinedArray.get(1)); fsArray = joinedArray; joinedArray = JCoReTools.addToFSArray(fsArray, newElements.get(1)); - assertNotSame("The old FSArray was returned although a new one should have been created", fsArray, joinedArray); + assertNotSame( fsArray, joinedArray, "The old FSArray was returned although a new one should have 
been created"); assertEquals(newElement2, joinedArray.get(2)); fsArray = joinedArray; joinedArray = JCoReTools.addToFSArray(fsArray, newElements.get(2)); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement3, joinedArray.get(3)); fsArray = joinedArray; joinedArray = JCoReTools.addToFSArray(fsArray, newElements.get(3)); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement4, joinedArray.get(4)); } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/ComparatorsTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/ComparatorsTest.java index f5720c7c1..5b87d968a 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/ComparatorsTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/ComparatorsTest.java @@ -13,10 +13,10 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class ComparatorsTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReCoverAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReCoverAnnotationIndexTest.java index e81f2cd08..5e2b1d105 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReCoverAnnotationIndexTest.java 
+++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReCoverAnnotationIndexTest.java @@ -14,13 +14,13 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Set; import java.util.stream.Collectors; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class JCoReCoverAnnotationIndexTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReMapAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReMapAnnotationIndexTest.java index 4cd521007..088917946 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReMapAnnotationIndexTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReMapAnnotationIndexTest.java @@ -14,7 +14,7 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.HashMap; import java.util.Set; @@ -22,7 +22,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class JCoReMapAnnotationIndexTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java index ef0a044c9..942f32785 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java +++ 
b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java @@ -13,13 +13,12 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.List; -import java.util.stream.Collectors; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class JCoReOverlapAnnotationIndexTest { @Test @@ -39,23 +38,23 @@ public void testOverlapAnnotationIndex() throws Exception { t6.addToIndexes(); JCoReOverlapAnnotationIndex index = new JCoReOverlapAnnotationIndex<>(jcas, Token.type); - List result = index.search(t2).collect(Collectors.toList()); + List result = index.search(t2); assertTrue(result.contains(t1)); assertTrue(result.contains(t2)); assertTrue(result.contains(t3)); assertEquals(3, result.size()); - result = index.search(t1).collect(Collectors.toList()); + result = index.search(t1); assertTrue(result.contains(t1)); assertTrue(result.contains(t2)); assertEquals(2, result.size()); - result = index.search(t4).collect(Collectors.toList()); + result = index.search(t4); assertTrue(result.contains(t4)); assertTrue(result.contains(t5)); assertEquals(2, result.size()); - result = index.search(t6).collect(Collectors.toList()); + result = index.search(t6); assertTrue(result.contains(t6)); assertEquals(1, result.size()); } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReSetAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReSetAnnotationIndexTest.java index cfb4f2374..111861268 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReSetAnnotationIndexTest.java +++ 
b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReSetAnnotationIndexTest.java @@ -15,13 +15,13 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Set; import java.util.stream.Collectors; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class JCoReSetAnnotationIndexTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReTreeMapAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReTreeMapAnnotationIndexTest.java index 1294407f2..208e8abb4 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReTreeMapAnnotationIndexTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReTreeMapAnnotationIndexTest.java @@ -15,12 +15,12 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Set; import java.util.stream.Collectors; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class JCoReTreeMapAnnotationIndexTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/TermGeneratorsTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/TermGeneratorsTest.java index 8595e5840..e3d269f83 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/TermGeneratorsTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/TermGeneratorsTest.java @@ -13,12 +13,12 @@ import org.apache.uima.fit.factory.JCasFactory; import 
org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.function.BinaryOperator; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; public class TermGeneratorsTest { diff --git a/jcore-utilities/src/test/resources/AETestDescriptor.xml b/jcore-utilities/src/test/resources/AETestDescriptor.xml index ea2658e15..f438cd6b6 100644 --- a/jcore-utilities/src/test/resources/AETestDescriptor.xml +++ b/jcore-utilities/src/test/resources/AETestDescriptor.xml @@ -6,7 +6,7 @@ JulesToolsDescriptor -2.5.1-SNAPSHOT +2.6.0 diff --git a/jcore-xmi-db-reader/README.md b/jcore-xmi-db-reader/README.md index d587fa8b1..af691dee8 100644 --- a/jcore-xmi-db-reader/README.md +++ b/jcore-xmi-db-reader/README.md @@ -1,8 +1,10 @@ # JCoRe XMI Database Reader -**Descriptor Path**: +**Descriptor Paths**: ``` de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-reader +de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier-reader +de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier ``` ### Objective diff --git a/jcore-xmi-db-reader/component.meta b/jcore-xmi-db-reader/component.meta index d8abdab0f..e49317b51 100644 --- a/jcore-xmi-db-reader/component.meta +++ b/jcore-xmi-db-reader/component.meta @@ -23,7 +23,7 @@ "maven-artifact": { "artifactId": "jcore-xmi-db-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe XMI Database Reader" } diff --git a/jcore-xmi-db-reader/pom.xml b/jcore-xmi-db-reader/pom.xml index 4e3f07f1a..fa3b75799 100644 --- a/jcore-xmi-db-reader/pom.xml +++ b/jcore-xmi-db-reader/pom.xml @@ -5,7 +5,7 @@ jedis-parent de.julielab - 2.5.1-SNAPSHOT + 2.6.0 ../jedis-parent jcore-xmi-db-reader @@ -18,7 +18,7 @@ de.julielab jcore-db-reader - 2.5.1-SNAPSHOT + 2.6.0 org.testng @@ -59,13 
+59,13 @@ de.julielab jcore-xml-db-reader - 2.5.1-SNAPSHOT + 2.6.0 test de.julielab jcore-xmi-db-writer - 2.5.1-SNAPSHOT + 2.6.0 test diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java index 81dff7797..1b9c9c080 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java @@ -14,7 +14,6 @@ import de.julielab.xml.binary.BinaryJeDISNodeDecoder; import de.julielab.xml.binary.BinaryXmiBuilder; import org.apache.commons.lang.StringUtils; -import org.apache.uima.cas.CASException; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.impl.XmiCasDeserializer; import org.apache.uima.collection.CollectionException; @@ -41,10 +40,8 @@ public class CasPopulator { private final static Logger log = LoggerFactory.getLogger(CasPopulator.class); private final DataBaseConnector dbc; private final boolean readsBaseDocument; - private final int numAdditionalTables; private final int numDataRetrievedDataFields; - private final String dataTable; - private final String[] additionalTableNames; + private final String[] unqualifiedAnnotationModuleNames; private final XmiBuilder builder; private final Boolean logFinalXmi; private final int xercesAttributeBufferSize; @@ -73,10 +70,8 @@ public CasPopulator(String dataTable, Initializer initializer, Boolean readDataT this.tableName = tableName; this.readsBaseDocument = initializer.getReadsBaseDocument(); this.joinTables = initializer.isJoinTables(); - this.numAdditionalTables = initializer.getNumAdditionalTables(); this.numDataRetrievedDataFields = initializer.getNumDataRetrievedDataFields(); - this.dataTable = dataTable; - this.additionalTableNames = initializer.getUnqualifiedAnnotationModuleNames(); + this.unqualifiedAnnotationModuleNames = 
initializer.getUnqualifiedAnnotationModuleNames(); this.builder = initializer.getXmiBuilder(); binaryBuilder = initializer.getBinaryBuilder(); useBinaryFormat = initializer.isUseBinaryFormat(); @@ -86,7 +81,7 @@ public CasPopulator(String dataTable, Initializer initializer, Boolean readDataT reverseBinaryMapping = initializer.getReverseBinaryMapping(); featuresToMapBinary = initializer.getFeaturesToMapBinary(); if (useBinaryFormat) { - binaryJeDISNodeDecoder = new BinaryJeDISNodeDecoder(Stream.of(additionalTableNames).collect(Collectors.toSet()), true); + binaryJeDISNodeDecoder = new BinaryJeDISNodeDecoder(Stream.of(unqualifiedAnnotationModuleNames).collect(Collectors.toSet()), true); } else binaryJeDISNodeDecoder = null; } @@ -190,7 +185,7 @@ public void populateCas(byte[][] data, JCas jCas) throws CasPopulationException } } log.trace("Setting max XMI ID to the CAS."); - storeMaxXmiIdAndSofaMappings(jCas, data); + storeMaxXmiIdAndSofaMappings(jCas, data, storeMaxXmiId); log.trace("Setting meta data to: Reads data table: {}, table name: {}", readsDataTable, tableName); DBReader.setDBProcessingMetaData(dbc, readsDataTable, tableName, data, jCas); } catch (Exception e) { @@ -243,7 +238,7 @@ private String getPkStringFromData(byte[][] data) { return sb.toString(); } - private void storeMaxXmiIdAndSofaMappings(JCas aCAS, byte[][] data) { + public static void storeMaxXmiIdAndSofaMappings(JCas aCAS, byte[][] data, Boolean storeMaxXmiId) { if (storeMaxXmiId && data.length > 2) { String docId = JCoReTools.getDocId(aCAS); byte[] maxXmiIdBytes = data[2]; diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java index a29dcb8dd..da16d4ef0 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java @@ -4,7 +4,10 @@ import 
de.julielab.costosys.dbconnection.CoStoSysConnection; import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.reader.db.DBMultiplier; +import de.julielab.jcore.reader.db.DBReader; import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.types.pubmed.Header; +import de.julielab.jcore.utility.JCoReTools; import de.julielab.xml.JulieXMLConstants; import de.julielab.xml.XmiSplitConstants; import de.julielab.xml.binary.BinaryJeDISNodeEncoder; @@ -21,6 +24,7 @@ import java.io.ByteArrayInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.sql.ResultSet; import java.sql.SQLException; import java.util.*; @@ -29,9 +33,12 @@ public class XmiDBMultiplier extends DBMultiplier implements Initializable { public static final String PARAM_LOG_FINAL_XMI = Initializer.PARAM_LOG_FINAL_XMI; + public static final String PARAM_TRUNCATE_AT_SIZE = "TruncateAtSize"; private final static Logger log = LoggerFactory.getLogger(XmiDBMultiplier.class); @ConfigurationParameter(name = PARAM_LOG_FINAL_XMI, mandatory = false, defaultValue = "false", description = "For debugging purposes. If set to true, before parsing the final XMI data assembled from the annotation modules, it is printed to console.") private boolean logFinalXmi; + @ConfigurationParameter(name = PARAM_TRUNCATE_AT_SIZE, mandatory = false, description = "Specify size in bytes of the XMI sofa string, i.e. the document text. If the text surpasses that size, the document is not populated from XMI but given some placeholder information. 
This can be necessary when large documents cannot be handled by subsequent components in the pipeline.") + private int truncationSize; private Initializer initializer; private CasPopulator casPopulator; private String[] xmiModuleAnnotationNames; @@ -42,10 +49,12 @@ public class XmiDBMultiplier extends DBMultiplier implements Initializable { public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); logFinalXmi = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_LOG_FINAL_XMI)).orElse(false); + truncationSize = Optional.ofNullable((Integer)aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(0); } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { + log.trace("Incoming jCas instance: " + aJCas); boolean initDone = super.initialized; RowBatch rowBatch = null; if (!initDone) { @@ -90,8 +99,13 @@ public AbstractCas next() throws AnalysisEngineProcessException { populateCas(jCas); } } catch (Throwable throwable) { - log.error("Error while reading document from the database: ", throwable); - throw throwable; + log.error("Error while reading document from the database. Releasing the CAS. ", throwable); + jCas.release(); + throw new AnalysisEngineProcessException(throwable); + } + if (log.isTraceEnabled()) { + log.trace("Outgoing multiplier jCas instance: {}", jCas); + log.trace("Returning CAS containing document {}", JCoReTools.getDocId(jCas)); } return jCas; } @@ -101,10 +115,42 @@ private void populateCas(JCas jCas) throws AnalysisEngineProcessException { throw new AnalysisEngineProcessException(new IllegalStateException("Initialization of the component was not finished. See previous errors to learn the reason. 
Cannot continue.")); try { final byte[][] data = documentDataIterator.next(); - log.trace("Populating CAS with {}", casPopulator); - if (data != null) + final int pkSize = (int) dbc.getActiveTableFieldConfiguration().getPrimaryKeyFields().count(); + if (log.isTraceEnabled()) { + List l = new ArrayList<>(); + for (int i = pkSize; i < data.length; i++) { + if (data[i] == null) + continue; + int length = data[i].length; + double lengthInMb = (length / 1024d) / 1024d; + l.add("col" + i + ":" + lengthInMb + "MB"); + } + log.trace("Populating CAS for document ID {} with column data of sizes {}", new String(data[0]), String.join(",", l)); + } + boolean truncate = false; + if (truncationSize > 0) { + if(data[pkSize].length > truncationSize) + truncate = true; + } + if (data != null && !truncate) casPopulator.populateCas(data, jCas); + else if (truncate) { + // This document is too long. Set the document ID and some placeholder document text. + jCas.setDocumentText("This document was truncated due to exceedingly long text contents."); + List pkElements = new ArrayList<>(); + for (int i = 0; i < pkSize; i++) { + pkElements.add(new String(data[i], StandardCharsets.UTF_8)); + } + final Header header = new Header(jCas); + header.setDocId(pkElements.stream().collect(Collectors.joining(","))); + header.addToIndexes(); + + CasPopulator.storeMaxXmiIdAndSofaMappings(jCas, data, initializer.getStoreMaxXmiId()); + DBReader.setDBProcessingMetaData(dbc, readDataTable, tableName, data, jCas); + log.debug("Truncating document with ID {} due to its text size of {} bytes which is greater than the given threshold of {} bytes.", pkElements, data[pkSize].length, truncationSize); + } } catch (CasPopulationException e) { + log.error("Exception while populating CAS", e); throw new AnalysisEngineProcessException(e); } } diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java 
b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java index 22cadadcc..60c405b2f 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java @@ -42,14 +42,14 @@ public class XmiDBMultiplierReader extends DBMultiplierReader { public static final String PARAM_ANNOTATIONS_TO_LOAD = Initializer.PARAM_ANNOTATIONS_TO_LOAD; public static final String PARAM_XMI_META_SCHEMA = "XmiMetaTablesSchema"; private final static Logger log = LoggerFactory.getLogger(XmiDBMultiplierReader.class); + @ConfigurationParameter(name = PARAM_ANNOTATIONS_TO_LOAD, mandatory = false, description = "An array of qualified UIMA type names. The provided names will be converted to database table column names in an equivalent manner as the XMIDBWriter does when storing the annotations. Thus, by default the columns of the XMI table holding annotation module information are named by lowercased UIMA type name where dots are replaced by underscores.. This can be overwritten by appending ':' to a table name. The given type names will be converted to valid Postgres columns names by replacing dots with underscores and the colon will be converted to the dollar character. From the resolved columns, annotation modules in segmented XMI format are read where an annotation module contains all annotation instances of a specific type in a specific document. All annotation modules read this way are merged with the base document, resulting in valid XMI data which is then deserialized into the CAS.") + protected String[] qualifiedAnnotationColumnNames; @ConfigurationParameter(name = PARAM_READS_BASE_DOCUMENT, description = "Indicates if this reader reads segmented " + "annotation data. If set to false, the XMI data is expected to represent complete annotated documents. 
" + "If it is set to true, a segmented annotation graph is expected and the table given with the 'Table' parameter " + "will contain the document text together with some basic annotations. What exactly is stored in which manner " + "is determined by the jcore-xmi-db-consumer used to write the data into the database.") private Boolean readsBaseDocument; - @ConfigurationParameter(name = PARAM_ANNOTATIONS_TO_LOAD, mandatory = false, description = "An array of qualified UIMA type names. The provided names will be converted to database table column names in an equivalent manner as the XMIDBWriter does when storing the annotations. Thus, by default the columns of the XMI table holding annotation module information are named by lowercased UIMA type name where dots are replaced by underscores.. This can be overwritten by appending ':' to a table name. The given type names will be converted to valid Postgres columns names by replacing dots with underscores and the colon will be converted to the dollar character. From the resolved columns, annotation modules in segmented XMI format are read where an annotation module contains all annotation instances of a specific type in a specific document. All annotation modules read this way are merged with the base document, resulting in valid XMI data which is then deserialized into the CAS.") - protected String[] qualifiedAnnotationColumnNames; @ConfigurationParameter(name = PARAM_STORE_XMI_ID, mandatory = false, description = "This parameter is required " + "to be set to true, if this reader is contained in a pipeline that also contains a jcore-xmi-db-writer and" + "the writer will segment the CAS annotation graph and store only parts of it. Then, it is important to " + @@ -68,7 +68,7 @@ public class XmiDBMultiplierReader extends DBMultiplierReader { "(j)visualvm, the hot spots of work can be identified. 
If one of those is the XML attribute buffer " + "resizing, this parameter should be set to a size that makes buffer resizing unnecessary.") private int xercesAttributeBufferSize; - @ConfigurationParameter(name = PARAM_XMI_META_SCHEMA, mandatory = false, defaultValue = "public", description = "Each XMI file defines a number of XML namespaces according to the types used in the document. Those namespaces are stored in a table named '" +XmiSplitConstants.XMI_NS_TABLE + "' when splitting annotations in annotation modules by the XMI DB writer. This parameter allows to specify in which Postgres schema this table should be looked for. Also, the table listing the annotation tables is stored in this Postgres schema. Defaults to 'public'.") + @ConfigurationParameter(name = PARAM_XMI_META_SCHEMA, mandatory = false, defaultValue = "public", description = "Each XMI file defines a number of XML namespaces according to the types used in the document. Those namespaces are stored in a table named '" + XmiSplitConstants.XMI_NS_TABLE + "' when splitting annotations in annotation modules by the XMI DB writer. This parameter allows to specify in which Postgres schema this table should be looked for. Also, the table listing the annotation tables is stored in this Postgres schema. Defaults to 'public'.") private String xmiMetaSchema; private boolean doGzip; private String[] additionalTableNames; @@ -94,7 +94,7 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti } @Override - public void getNext(JCas jCas) throws CollectionException { + public void getNext(JCas jCas) throws CollectionException, IOException { try { super.getNext(jCas); // The above call to super.getNext has created a RowBatch annotation which we retrieve here. 
@@ -107,7 +107,7 @@ public void getNext(JCas jCas) throws CollectionException { rowBatch.setXercesAttributeBufferSize(xercesAttributeBufferSize); rowBatch.setXmiMetaTablesPostgresSchema(xmiMetaSchema); } catch (Throwable throwable) { - log.error("Exception ocurred while trying to get the next document", throwable); + log.error("Exception occurred while trying to get the next document", throwable); throw throwable; } } @@ -122,6 +122,8 @@ private void adaptReaderConfigurationForXmiData() throws ResourceInitializationE costosysConfig = (String) getConfigParameterValue(PARAM_COSTOSYS_CONFIG_NAME); try { dbc = new DataBaseConnector(costosysConfig); + if (dbc.getMaxConnections() < 3) + dbc.setMaxConnections(3); } catch (FileNotFoundException e) { throw new ResourceInitializationException(e); } @@ -185,7 +187,7 @@ private void determineDataFormat(String table) throws ResourceInitializationExce } private void checkForJeDISBinaryFormat(byte[] firstTwoBytes) { - short header = (short) ((firstTwoBytes[0]<<8) | (0xff & firstTwoBytes[1])); + short header = (short) ((firstTwoBytes[0] << 8) | (0xff & firstTwoBytes[1])); if (header != BinaryJeDISNodeEncoder.JEDIS_BINARY_MAGIC) { useBinaryFormat = false; log.debug("Is data encoded in JeDIS binary format: false"); diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier-reader.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier-reader.xml index 312cb5e0a..fa03c02ba 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier-reader.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier-reader.xml @@ -5,7 +5,7 @@ JCoRe XMI Database Multiplier Reader This is an extension of the DBMultiplierReader to handle JeDIS XMI annotation module data. 
- 2.5.1-SNAPSHOT + 2.6.0 ReadsBaseDocument diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml index 0e15747d0..7ba47c81b 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml @@ -4,9 +4,9 @@ true de.julielab.jcore.reader.xmi.XmiDBMultiplier - JCoRe Abstract Database Multiplier + JCoRe XMI Database Multiplier A multiplier that receives document IDs to read from a database table from the DBMultiplierReader. The reader also delivers the path to the corpus storage system (CoStoSys) configuration and additional tables for joining with the main data table. This multiplier class is abstract and cannot be used directly.Extending classes must implement the next() method to actually read documents from the database and populate CASes with them. This component is a part of the Jena Document Information System, JeDIS. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany JULIE Lab Jena, Germany @@ -17,6 +17,13 @@ false false + + TruncateAtSize + Specify size in bytes of the XMI sofa string, i.e. the document text. If the text surpasses that size, the document is not populated from XMI but given some placeholder information. This can be necessary when large documents cannot be handled by subsequent components in the pipeline. 
+ Integer + false + false + @@ -28,8 +35,9 @@ + - + diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml index c09220a89..bc148ea7e 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml @@ -5,7 +5,7 @@ JCoRe XMI Database Reader A database readerthat expects serialized UIMA CAS objects in XMI format as input. The reader has the capability to read segmented annotation graphs that have been stored by the jcore-xmi-db-writer. This component is part of the Jena Document Information System, JeDIS. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany @@ -169,7 +169,7 @@ - + diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java index ff60e41a0..73dcdc055 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java @@ -31,7 +31,7 @@ public class XmiDBMultiplierDifferentNsSchemaTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static int subsetCounter; diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java index 2af097f43..fabc558aa 100644 --- 
a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java @@ -31,7 +31,7 @@ public class XmiDBMultiplierTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static int subsetCounter; @@ -40,6 +40,7 @@ public static void setup() throws UIMAException, IOException, ConfigurationExcep postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setMaxConnections(3); costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 10, postgres); new File(costosysConfig).deleteOnExit(); XmiDBSetupHelper.processAndSplitData(costosysConfig, false, false,"public"); @@ -57,6 +58,7 @@ public static void shutdown() { @Test(threadPoolSize = 3, invocationCount = 10, timeOut = 500000) public void testXmiDBMultiplierReader() throws Exception { DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setMaxConnections(5); String xmisubset; synchronized (XmiDBMultiplierDifferentNsSchemaTest.class) { xmisubset = "xmisubset" + subsetCounter++; diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java index 309ab09a4..5af87e804 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java @@ -2,7 +2,9 @@ import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; -import de.julielab.jcore.types.*; +import 
de.julielab.jcore.types.Header; +import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.Token; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.uima.UIMAException; import org.apache.uima.collection.CollectionReader; @@ -20,11 +22,11 @@ import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBReaderBinaryFormatTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; @@ -36,7 +38,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 2, postgres); XmiDBSetupHelper.processAndSplitData(costosysConfig, false, true,"public"); - assertTrue("The data document table exists", dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents"))); + assertTrue(dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents")), "The data document table exists"); xmisubset = "xmisubset"; dbc.setActiveTableSchema("xmi_text"); dbc.reserveConnection(); diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java index a8a15b58d..8ae996691 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java @@ 
-1,15 +1,13 @@ package de.julielab.jcore.reader.xmi; import de.julielab.costosys.dbconnection.DataBaseConnector; -import de.julielab.jcore.consumer.xmi.XMIDBWriter; import de.julielab.jcore.db.test.DBTestUtils; -import de.julielab.jcore.reader.db.TableReaderConstants; -import de.julielab.jcore.types.*; +import de.julielab.jcore.types.Header; +import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.Token; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; @@ -24,11 +22,11 @@ import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBReaderDifferentNsSchemaTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; @@ -40,7 +38,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 2, postgres); XmiDBSetupHelper.processAndSplitData(costosysConfig, false, false, "someotherschema"); - assertTrue("The data document table exists", dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents"))); + assertTrue(dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents")), "The data 
document table exists"); xmisubset = "xmisubset"; dbc.setActiveTableSchema("xmi_text"); dbc.reserveConnection(); diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java index 9a7fea0b3..e25808419 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java @@ -31,7 +31,7 @@ * The exact same test as {@link XmiDBReaderTest} but here, the data is gzipped. */ public class XmiDBReaderGzippedDataTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; @@ -41,7 +41,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf XmiDBSetupHelper.createDbcConfig(postgres); DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); - costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 1, postgres); + costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 2, postgres); new File(costosysConfig).deleteOnExit(); XmiDBSetupHelper.processAndSplitData(costosysConfig, true, false,"public"); assertTrue(dbc.withConnectionQueryBoolean( c -> c.tableExists("_data.documents")), "The data document table exists"); diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java index e0ae7f3ed..8b0dab1d2 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java +++ 
b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java @@ -28,7 +28,7 @@ public class XmiDBReaderMonolithicDocumentsTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; @@ -38,7 +38,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf XmiDBSetupHelper.createDbcConfig(postgres); DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); - costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_complete_cas", 1, postgres); + costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_complete_cas", 2, postgres); new File(costosysConfig).deleteOnExit(); XmiDBSetupHelper.processAndStoreCompleteXMIData(costosysConfig, true); dbc.reserveConnection(); diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java index 72bea54a6..36ca9601a 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java @@ -24,11 +24,11 @@ import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBReaderTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; @@ -40,7 +40,7 
@@ public static void setup() throws SQLException, UIMAException, IOException, Conf DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 2, postgres); XmiDBSetupHelper.processAndSplitData(costosysConfig, false, false,"public"); - assertTrue("The data document table exists", dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents"))); + assertTrue(dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents")), "The data document table exists"); xmisubset = "xmisubset"; dbc.setActiveTableSchema("xmi_text"); dbc.reserveConnection(); diff --git a/jcore-xmi-db-reader/src/test/resources/logback-test.xml b/jcore-xmi-db-reader/src/test/resources/logback-test.xml index 37c8a721c..6a4a567cd 100644 --- a/jcore-xmi-db-reader/src/test/resources/logback-test.xml +++ b/jcore-xmi-db-reader/src/test/resources/logback-test.xml @@ -9,7 +9,9 @@ - + + + diff --git a/jcore-xmi-db-writer/component.meta b/jcore-xmi-db-writer/component.meta index 708695365..55d656ba9 100644 --- a/jcore-xmi-db-writer/component.meta +++ b/jcore-xmi-db-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xmi-db-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe XMI Database Writer" } diff --git a/jcore-xmi-db-writer/pom.xml b/jcore-xmi-db-writer/pom.xml index 5a7320d2f..b959967ea 100644 --- a/jcore-xmi-db-writer/pom.xml +++ b/jcore-xmi-db-writer/pom.xml @@ -4,7 +4,7 @@ jedis-parent de.julielab - 2.5.1-SNAPSHOT + 2.6.0 ../jedis-parent jcore-xmi-db-writer @@ -144,7 +144,7 @@ de.julielab jcore-db-checkpoint-ae - 2.5.1-SNAPSHOT + 2.6.0 de.julielab @@ -159,6 +159,12 @@ logback-classic test + + org.jetbrains + annotations + RELEASE + compile + https://github.com/JULIELab/jcore-base/tree/master/jcore-xmi-db-writer diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java 
b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java index f639e58ae..1d13802dd 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java @@ -29,6 +29,7 @@ import de.julielab.jcore.types.Header; import de.julielab.jcore.types.XmiMetaData; import de.julielab.jcore.types.ext.DBProcessingMetaData; +import de.julielab.jcore.utility.JCoReTools; import de.julielab.xml.*; import de.julielab.xml.binary.BinaryJeDISNodeEncoder; import de.julielab.xml.binary.BinaryStorageAnalysisResult; @@ -122,7 +123,6 @@ public class XMIDBWriter extends JCasAnnotator_ImplBase { private static Map> binaryMappedFeatures = Collections.emptyMap(); private static Map> splitterResultMap; private static Map, CountDownLatch>>> xmiBufferItemsToProcess; - private static ReentrantLock missingMappingsGatheringLock; private static CountDownLatch missingMappingsGatheringLatch = new CountDownLatch(0); private static ReentrantLock mappingUpdateLock; private DataBaseConnector dbc; @@ -251,6 +251,8 @@ public class XMIDBWriter extends JCasAnnotator_ImplBase { @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "Possible values: document_text. If this parameter is set to a valid value, the SHA256 hash for the given value will be calculated, base64 encoded and added to each document as a new column in the document table. 
The column will be named after the parameter value, suffixed by '_sha256'.") private String documentItemToHash; private Map shaMap; + private Set mirrorResetIds; + private Set unchangedDocuments; private String mappingCacheKey; private DocumentReleaseCheckpoint docReleaseCheckpoint; private List currentDocumentIdBatch; @@ -289,8 +291,8 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept // The deletion of obsolete annotations should only be active when the base document is stored because then, old annotations won't be valid any more. deleteObsolete &= storeBaseDocument; baseDocumentAnnotationTypes = Arrays.stream( - Optional.ofNullable((String[]) aContext.getConfigParameterValue(PARAM_BASE_DOCUMENT_ANNOTATION_TYPES)) - .orElse(new String[0])) + Optional.ofNullable((String[]) aContext.getConfigParameterValue(PARAM_BASE_DOCUMENT_ANNOTATION_TYPES)) + .orElse(new String[0])) .collect(Collectors.toSet()); attributeSize = (Integer) aContext.getConfigParameterValue(PARAM_ATTRIBUTE_SIZE); writeBatchSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_WRITE_BATCH_SIZE)).orElse(50); @@ -321,7 +323,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept } if (xmiMetaSchema.isBlank()) - throw new ResourceInitializationException(new IllegalArgumentException("The XMI meta table Postgres schema must either be omitted at all or non-empty but was.")); + throw new ResourceInitializationException(new IllegalArgumentException("The XMI meta table Postgres schema must either be omitted at all or non-empty but was '" + xmiMetaSchema + "'.")); unqualifiedAnnotationNames = Collections.emptyList(); @@ -424,6 +426,8 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept if (useBinaryFormat) { this.binaryEncoder = new BinaryJeDISNodeEncoder(); } + mirrorResetIds = new HashSet<>(); + unchangedDocuments = new HashSet<>(); log.info(XMIDBWriter.class.getName() + " initialized."); 
log.info("Effective document table name: {}", effectiveDocTableName); @@ -510,7 +514,15 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { } catch (IllegalArgumentException e) { // Do nothing; this is not the work item CAS } - DocumentId docId = getDocumentId(aJCas); + Collection metaDatas = JCasUtil.select(aJCas, DBProcessingMetaData.class); + if (metaDatas.size() > 1) + throw new AnalysisEngineProcessException(new IllegalArgumentException( + "There is more than one type of DBProcessingMetaData in document " + JCoReTools.getDocId(aJCas))); + Optional metaData = metaDatas.stream().findAny(); + DocumentId docId = getDocumentId(aJCas, metaData); + setMirrorResetStateForDocId(docId, metaData); + if (metaData.isPresent() && metaData.get().getIsDocumentHashUnchanged()) + unchangedDocuments.add(docId); if (docId == null) { log.warn("The current document does not have a document ID. It is omitted from database import."); return; @@ -519,12 +531,8 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { currentDocumentIdBatch.add(docId); if (subsetTable == null) { - Collection metaData = JCasUtil.select(aJCas, DBProcessingMetaData.class); if (!metaData.isEmpty()) { - if (metaData.size() > 1) - throw new AnalysisEngineProcessException(new IllegalArgumentException( - "There is more than one type of DBProcessingMetaData in document " + docId)); - subsetTable = metaData.stream().findAny().get().getSubsetTable(); + subsetTable = metaData.get().getSubsetTable(); if (subsetTable != null && storeBaseDocument) { // Check if we are about to read from a mirror subset and to update the base document. 
This is not allowed @@ -533,7 +541,7 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { try (CoStoSysConnection costoConn = dbc.obtainOrReserveConnection()) { Map mirrorSubsetNames = dbc.getMirrorSubsetNames(costoConn, effectiveDocTableName); if (mirrorSubsetNames.keySet().contains(subsetTable.replace("^[^.]\\.", ""))) - throw new AnalysisEngineProcessException(new IllegalArgumentException("The read subset table " + subsetTable + " is a mirror subset its document table " + effectiveDocTableName + " and the base document should be stored. This base document storage would cause all its subset to reset the updated documents. Thus, the subset " + subsetTable + " would be partially reset while processing, reading the same documents over and over again. This is therefore illegal.")); + throw new AnalysisEngineProcessException(new IllegalArgumentException("The read subset table " + subsetTable + " is a mirror subset of the target document table " + effectiveDocTableName + " and the base document should be stored. This base document storage would cause all its subset to reset the updated documents. Thus, the subset " + subsetTable + " would be partially reset while processing, reading the same documents over and over again. This is therefore illegal.")); } } } @@ -564,6 +572,21 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { } } + private void setMirrorResetStateForDocId(DocumentId docId, Optional metaData) { + if (metaData.isPresent()) { + // mirror subset reset is only necessary if we store the base document in any way; + // additionally, we check if the document text hash key is reported to be different to its already + // existing database entry. Only then the mirror subsets should be reset for this document because only + // then a re-processing of the document makes sense. + // The isDocumentHashUnchanged feature is set by the XMLDBMultiplier. 
+ if (storeBaseDocument && !metaData.get().getIsDocumentHashUnchanged()) + mirrorResetIds.add(docId); + } else { + // default: reset the mirror tables + mirrorResetIds.add(docId); + } + } + private void handleAddhash(JCas aJCas, DocumentId docId) { if (documentItemToHash != null) { final String documentText = aJCas.getDocumentText(); @@ -738,7 +761,7 @@ private void createAnnotationModules() throws AnalysisEngineProcessException { // adapt the map keys to table names (currently, the keys are the // Java type names) splitXmiData = convertModuleLabelsToColumnNames(splitXmiData); - + log.trace("The following columns have XMI data: {}", splitXmiData.keySet()); for (String columnName : splitXmiData.keySet()) { boolean isBaseDocumentColumn = columnName.equals(XmiSplitConstants.BASE_DOC_COLUMN); @@ -837,25 +860,28 @@ private Map convertModuleLabelsToColumnNames(Map< return convertedMap; } - private DocumentId getDocumentId(JCas aJCas) { + private DocumentId getDocumentId(JCas aJCas, Optional metaData) { DocumentId docId = null; - try { - DBProcessingMetaData dbProcessingMetaData = JCasUtil.selectSingle(aJCas, DBProcessingMetaData.class); - docId = new DocumentId(dbProcessingMetaData); - } catch (IllegalArgumentException e) { - // it seems there is not DBProcessingMetaData we could get a complex primary key from. The document ID + if (metaData.isPresent()) { + docId = new DocumentId(metaData.get()); + } else { + // it seems there is no DBProcessingMetaData we could get a complex primary key from. The document ID // will have to do. - log.trace("Could not find the primary key in the DBProcessingMetaData due to exception: {}. Using the document ID as primary key.", e.getMessage()); + log.trace("Could not find the primary key in the DBProcessingMetaData because no meta data annotation is set. 
Using the document ID as primary key."); } if (docId == null) { AnnotationIndex headerIndex = aJCas.getAnnotationIndex(Header.type); FSIterator headerIt = headerIndex.iterator(); if (!headerIt.hasNext()) { - int min = Math.min(100, aJCas.getDocumentText().length()); + String docText = ""; + if (aJCas.getDocumentText() != null) { + int min = Math.min(100, aJCas.getDocumentText().length()); + docText = aJCas.getDocumentText().substring(0, min); + } log.warn( "Got document without a header and without DBProcessingMetaData; cannot obtain document ID." + " This document will not be written into the database. Document text begins with: {}", - aJCas.getDocumentText().substring(0, min)); + docText); ++headerlessDocuments; return null; } @@ -1002,7 +1028,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { final boolean readyToSendData = processXmiBuffer(); if (readyToSendData) { if (!(featuresToMapDryRun && useBinaryFormat)) - annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, storeBaseDocument, deleteObsolete, shaMap); + annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, mirrorResetIds, unchangedDocuments, deleteObsolete, shaMap); else log.info("The dry run to see details about features to be mapped in the binary format is activated. 
No contents are written into the database."); log.trace("Clearing {} annotation modules", annotationModules.size()); @@ -1012,6 +1038,8 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { if (docReleaseCheckpoint != null) docReleaseCheckpoint.release(jedisSyncKey, currentDocumentIdBatch.stream()); currentDocumentIdBatch.clear(); + mirrorResetIds.clear(); + unchangedDocuments.clear(); } } catch (XmiDataInsertionException e) { throw new AnalysisEngineProcessException(e); @@ -1031,7 +1059,7 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { try { processXmiBuffer(); if (!(featuresToMapDryRun && useBinaryFormat)) - annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, storeBaseDocument, deleteObsolete, shaMap); + annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, mirrorResetIds, unchangedDocuments, deleteObsolete, shaMap); else log.info("The dry run to see details about features to be mapped in the binary format is activated. No contents are written into the database."); annotationModules.clear(); @@ -1040,11 +1068,14 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { if (docReleaseCheckpoint != null) docReleaseCheckpoint.release(jedisSyncKey, currentDocumentIdBatch.stream()); currentDocumentIdBatch.clear(); + mirrorResetIds.clear(); + unchangedDocuments.clear(); } catch (XmiDataInsertionException e) { throw new AnalysisEngineProcessException(e); } - log.info("{} documents without a head occured overall. Those could not be written into the database.", - headerlessDocuments); + if (headerlessDocuments > 0) + log.info("{} documents without a head occured overall. 
Those could not be written into the database.", + headerlessDocuments); dbc.close(); } diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java index 31fb146ef..d561432fe 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java @@ -16,9 +16,12 @@ import java.sql.BatchUpdateException; import java.sql.PreparedStatement; import java.sql.SQLException; +import java.text.DecimalFormat; import java.util.*; import java.util.function.Function; +import java.util.function.Predicate; import java.util.stream.Collectors; +import java.util.stream.Stream; public class XmiDataInserter { @@ -32,6 +35,7 @@ public class XmiDataInserter { private Map maxXmiIdMap; private String componentDbName; private String hashColumnName; + private DecimalFormat df = new DecimalFormat(); private List processedDocumentIds; @@ -57,25 +61,36 @@ public XmiDataInserter(Set annotationModuleColumnNames, * update. It will just be inserted otherwise (throwing an error if there * will be a primary key constraint violation, i.e. duplicates). 
* - * @param serializedCASes - * @param storeBaseDocument + * @param annotationModules + * @param mirrorResetIds + * @param unchangedDocuments * @param deleteObsolete * @param shaMap * @throws XmiDataInsertionException * @throws AnalysisEngineProcessException */ - public void sendXmiDataToDatabase(String xmiTableName, List serializedCASes, String subsetTableName, Boolean storeBaseDocument, Boolean deleteObsolete, Map shaMap) throws XmiDataInsertionException { - if (log.isTraceEnabled()) { - log.trace("Sending XMI data for {} tables to the database", serializedCASes.size()); - log.trace("Sending {} XMI data items", serializedCASes.size()); - } - final Map> dataByDoc = serializedCASes.stream().collect(Collectors.groupingBy(XmiData::getDocId)); - final Set documentIdsWithValues = shaMap != null ? Sets.union(dataByDoc.keySet(), shaMap.keySet()) : dataByDoc.keySet(); + public void sendXmiDataToDatabase(String xmiTableName, List annotationModules, String subsetTableName, Set mirrorResetIds, Set unchangedDocuments, Boolean deleteObsolete, Map shaMap) throws XmiDataInsertionException { + log.trace("Sending {} XMI data items", annotationModules.size()); + final Map> dataByDoc = annotationModules.stream().collect(Collectors.groupingBy(XmiData::getDocId)); + // Collect all document IDs we want to add something for into the database. This can be annotations or the hash. + final Set documentIdsWithData = shaMap != null ? Sets.union(dataByDoc.keySet(), shaMap.keySet()) : dataByDoc.keySet(); + log.trace("There are {} documents with values to be updated in the database.", documentIdsWithData.size()); class RowIterator implements Iterator> { - - private Iterator docIdIterator = documentIdsWithValues.iterator(); + // Add documents that have been processed but no data. We need to do this to override potentially existing + // annotation values with null to remove them. 
+ private Iterator docIdIterator; private FieldConfig fieldConfig = dbc.getFieldConfiguration(schemaDocument); private List> fields = fieldConfig.getFields(); + /** + * An iterator that always returns only rows for a subset of document IDs. Either the ones that need mirror subsets to be reset or those for which mirror subsets should not be reset. + * @param returnDocumentsWithMirrorReset + */ + public RowIterator(boolean returnDocumentsWithMirrorReset) { + Predicate mirrorResetFilterPredicate = docId -> !unchangedDocuments.contains(docId); + if (!returnDocumentsWithMirrorReset) + mirrorResetFilterPredicate = Predicate.not(mirrorResetFilterPredicate); + docIdIterator = Stream.concat(documentIdsWithData.stream(), processedDocumentIds.stream()).filter(mirrorResetFilterPredicate).distinct().iterator(); + } @Override public boolean hasNext() { @@ -84,7 +99,7 @@ public boolean hasNext() { @Override public Map next() { - Map row = new HashMap(); + Map row = new HashMap<>(); final DocumentId docId = docIdIterator.next(); // There might actually be no data when we only write the SHA hashes final List dataList = dataByDoc.getOrDefault(docId, Collections.emptyList()); @@ -138,9 +153,13 @@ public Map next() { missingColumns.forEach(c -> row.put(c, null)); } // Set columns without a value to null to delete a potentially existing value. - if (updateMode) { + // But only if the document text had changed. Otherwise we would just delete all the annotations we + // actually want to keep. 
+ if (updateMode && !unchangedDocuments.contains(docId)) { Set annotationColumnsWithValues = dataList.stream().map(XmiData::getColumnName).collect(Collectors.toSet()); + log.trace("Annotation columns with values: {}", annotationColumnsWithValues); final Sets.SetView columnsWithoutValues = Sets.difference(annotationModuleColumnNames, annotationColumnsWithValues); + log.trace("Annotation columns without values: {}", columnsWithoutValues); columnsWithoutValues.forEach(col -> { row.put(col, null); log.trace("{}=null", col); @@ -160,18 +179,25 @@ public void remove() { } } + long time = System.currentTimeMillis(); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + log.debug("Obtained connection after {}ms", System.currentTimeMillis() - time); conn.setAutoCommit(false); - RowIterator iterator = new RowIterator(); + // This is the private in-line defined class from above. All values are already contained in the class + // definition. + RowIterator iterator = new RowIterator(true); try { if (updateMode) { - log.debug("Updating {} XMI CAS data in database table '{}'.", - serializedCASes.size(), xmiTableName); - dbc.updateFromRowIterator(iterator, xmiTableName, false, storeBaseDocument, schemaDocument); + log.debug("Updating {} XMI CAS data in database table '{}' for documents with mirror subset resets.", + processedDocumentIds.size() - unchangedDocuments.size(), xmiTableName); + dbc.updateFromRowIterator(iterator, xmiTableName, false, true, schemaDocument); + log.debug("Updating {} XMI CAS data in database table '{}' for documents without mirror subset resets.", + unchangedDocuments.size(), xmiTableName); + dbc.updateFromRowIterator(new RowIterator(false), xmiTableName, false, false, schemaDocument); } else { log.debug("Inserting {} XMI CAS data into database table '{}'.", - serializedCASes.size(), xmiTableName); + annotationModules.size(), xmiTableName); dbc.importFromRowIterator(iterator, xmiTableName, false, schemaDocument); } } catch (Exception e) { @@ 
-179,6 +205,7 @@ public void remove() { throw new XmiDataInsertionException(e); } setLastComponent(conn, subsetTableName); + processedDocumentIds.clear(); log.debug("Committing XMI data to database."); conn.commit(); maxXmiIdMap.clear(); @@ -189,6 +216,10 @@ public void remove() { if (null != ne) ne.printStackTrace(); } + if (log.isDebugEnabled()) { + time = System.currentTimeMillis() - time; + log.debug("Database import of {} XMI documents took {}ms ({}ms per document)", documentIdsWithData.size(), time, df.format((double) time / documentIdsWithData.size())); + } } /** @@ -241,8 +272,6 @@ private void setLastComponent(CoStoSysConnection conn, String subsetTableName) t else nextException.printStackTrace(); throw new XmiDataInsertionException(nextException); - } finally { - processedDocumentIds.clear(); } } diff --git a/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml b/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml index 9eab689a6..da64061af 100644 --- a/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml +++ b/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml @@ -6,7 +6,7 @@ JCoRe XMI Database Writer This component is capable of storing the standard UIMA serialization of documents in one or even multiple database tables. The UIMA serialization format is XMI, an XML format that expressed an annotation graph. This component either stores the whole annotation graph in XMI format in a database row, together with the document ID. Alternatively, it makes use of the jcore-xmi-splitter to segment the annotation graph with respect to a user specified list of annotation types. Then, the XMI data of each annotation type is extracted from the document XMI data and stored in a separate table. 
The tables are created automatically according to the primary key of the active table schema in the Corpus Storage System (CoStoSys) configuration file that is also given as a parameter. The jcore-xmi-db-reader is capable of reading this kind of distributed annotation graph and reassemble a valid XMI document which then cas be deserialized into a CAS. This consumer is UIMA DUCC compatible. It requires the collection reader to forward the work item CAS to the consumer. This is required so the consumer knows that a work item has been finished and that all cached data - in this case the XMI data - should be flushed. This is important! Without the forwarding of the work item CAS, the last batch of cached XMI data will not be written into the database. This component is part of the Jena Document Information System, JeDIS. - 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab Jena, Germany @@ -190,6 +190,7 @@ + diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java index f89ce94e5..135affc2d 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java @@ -4,9 +4,7 @@ import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; import de.julielab.jcore.types.*; -import de.julielab.jcore.types.ext.DBProcessingMetaData; import de.julielab.xml.XmiSplitConstants; -import de.julielab.xml.XmiSplitter; import de.julielab.xml.binary.BinaryDecodingResult; import de.julielab.xml.binary.BinaryJeDISNodeDecoder; import org.apache.commons.configuration2.ex.ConfigurationException; @@ -17,8 +15,13 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.StringArray; -import org.junit.*; +import 
org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -26,24 +29,25 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; -import java.util.*; import java.util.List; +import java.util.*; import java.util.stream.Collectors; import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatCode; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +@Testcontainers public class XmiDBWriterBinaryFormatTest { - @ClassRule - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmlSubsetTable; private static DataBaseConnector dbc; - @BeforeClass + @BeforeAll public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.reserveConnection(); @@ -52,7 +56,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf dbc.releaseConnections(); } - @AfterClass + @AfterAll public static void shutDown() { dbc.close(); } @@ -65,7 +69,7 @@ public static JCas getJCasWithRequiredTypes() throws UIMAException { "de.julielab.jcore.types.jcore-xmi-splitter-types"); } - @Before + @BeforeEach public void cleanForTest() throws 
SQLException { String binaryMappingTable = "public." + MetaTableManager.BINARY_MAPPING_TABLE; String binaryFeaturesToMapTable = "public." + MetaTableManager.BINARY_FEATURES_TO_MAP_TABLE; diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java index 10684230b..6f8611d29 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java @@ -3,7 +3,9 @@ import de.julielab.costosys.dbconnection.CoStoSysConnection; import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; -import de.julielab.jcore.types.*; +import de.julielab.jcore.types.Header; +import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.Token; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; @@ -12,39 +14,35 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.ByteArrayInputStream; -import java.io.IOException; import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; 
import static org.assertj.core.api.Assertions.assertThatCode; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; +@Testcontainers public class XmiDBWriterMonolithicDocumentTest { - @ClassRule - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static DataBaseConnector dbc; - @BeforeClass + @BeforeAll public static void setup() throws ConfigurationException { dbc = DBTestUtils.getDataBaseConnector(postgres); costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); DBTestUtils.createAndSetHiddenConfig("src/test/resources/hiddenConfig.txt", postgres); } - @AfterClass + @AfterAll public static void shutDown() { dbc.close(); } diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java index 866d0ddf8..306ab2820 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java @@ -4,6 +4,7 @@ import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; import de.julielab.jcore.types.*; +import de.julielab.jcore.types.ext.DBProcessingMetaData; import de.julielab.xml.XmiSplitConstants; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.uima.UIMAException; @@ -11,39 +12,45 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; +import 
org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.InvalidXMLException; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.sql.ResultSet; import java.sql.SQLException; import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatCode; +@Testcontainers public class XmiDBWriterTest { - @ClassRule - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:" + DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; - private static String xmlSubsetTable; private static DataBaseConnector dbc; - @BeforeClass + @BeforeAll public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.reserveConnection(); costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); - xmlSubsetTable = DBTestUtils.setupDatabase(dbc, "src/test/resources/pubmedsample18n0001.xml.gz", "medline_2017", 177, postgres); dbc.releaseConnections(); + DBTestUtils.createAndSetHiddenConfig("src/test/resources/hiddenConfig.txt", postgres); } - @AfterClass + @AfterAll public static void shutDown() { dbc.close(); } @@ -57,14 +64,14 @@ public static JCas getJCasWithRequiredTypes() throws UIMAException { } @Test - public void 
testXmiDBWriterSplitAnnotations() throws Exception { + public void testXmiDBWriterSplitAnnotationsSpecifyAnnotationSchemas() throws Exception { AnalysisEngine xmiWriter = AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer", - XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{Token.class.getCanonicalName(), Sentence.class.getCanonicalName()}, + XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{"tokenschema:" + Token.class.getCanonicalName(), "sentenceschema:" + Sentence.class.getCanonicalName()}, XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfig, XMIDBWriter.PARAM_STORE_ALL, false, XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true, - XMIDBWriter.PARAM_TABLE_DOCUMENT, "_data.documents2", + XMIDBWriter.PARAM_TABLE_DOCUMENT, "_data.documents", XMIDBWriter.PARAM_DO_GZIP, false, XMIDBWriter.PARAM_STORE_RECURSIVELY, true, XMIDBWriter.PARAM_UPDATE_MODE, true, @@ -85,24 +92,27 @@ public void testXmiDBWriterSplitAnnotations() throws Exception { xmiWriter.collectionProcessComplete(); dbc = DBTestUtils.getDataBaseConnector(postgres); - try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { - assertThat(dbc.tableExists("_data.documents2")).isTrue(); + try (CoStoSysConnection costoConn = dbc.obtainOrReserveConnection()) { + assertThat(dbc.tableExists("_data.documents")).isTrue(); - assertThat(dbc.getTableColumnNames("_data.documents2")).contains("de_julielab_jcore_types_token", "de_julielab_jcore_types_sentence"); - assertThat(dbc.isEmpty("_data.documents2", XmiSplitConstants.BASE_DOC_COLUMN)).isFalse(); - assertThat(dbc.isEmpty("_data.documents2", XmiDataInserter.FIELD_MAX_XMI_ID)).isFalse(); - assertThat(dbc.isEmpty("_data.documents2", "sofa_mapping")).isFalse(); - assertThat(dbc.isEmpty("_data.documents2", "de_julielab_jcore_types_token")).isFalse(); - assertThat(dbc.isEmpty("_data.documents2", "de_julielab_jcore_types_sentence")).isFalse(); + final List> infos = dbc.getTableColumnInformation("_data.documents", "column_name"); + 
final Set columnNames = infos.stream().map(info -> info.get("column_name")).map(String.class::cast).collect(Collectors.toSet()); + final String tokenColumn = "tokenschema$de_julielab_jcore_types_token"; + final String sentenceColumn = "sentenceschema$de_julielab_jcore_types_sentence"; + assertThat(columnNames).contains(tokenColumn, sentenceColumn); + + assertThat(dbc.isEmpty("_data.documents", tokenColumn)).isFalse(); + assertThat(dbc.isEmpty("_data.documents", sentenceColumn)).isFalse(); } } @Test - public void testXmiDBWriterSplitAnnotationsSpecifyAnnotationSchemas() throws Exception { + public void testXmiDBWriterSplitAnnotationsDefaultAnnotationSchemas() throws Exception { AnalysisEngine xmiWriter = AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer", - XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{"tokenschema:" + Token.class.getCanonicalName(), "sentenceschema:" + Sentence.class.getCanonicalName()}, + XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{Token.class.getCanonicalName(), Sentence.class.getCanonicalName()}, + XMIDBWriter.PARAM_ANNO_DEFAULT_QUALIFIER, "testschema", XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfig, XMIDBWriter.PARAM_STORE_ALL, false, XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true, @@ -133,54 +143,216 @@ public void testXmiDBWriterSplitAnnotationsSpecifyAnnotationSchemas() throws Exc final List> infos = dbc.getTableColumnInformation("_data.documents", "column_name"); final Set columnNames = infos.stream().map(info -> info.get("column_name")).map(String.class::cast).collect(Collectors.toSet()); - final String tokenColumn = "tokenschema$de_julielab_jcore_types_token"; - final String sentenceColumn = "sentenceschema$de_julielab_jcore_types_sentence"; + final String tokenColumn = "testschema$de_julielab_jcore_types_token"; + final String sentenceColumn = "testschema$de_julielab_jcore_types_sentence"; assertThat(columnNames).contains(tokenColumn, sentenceColumn); - - 
assertThat(dbc.isEmpty("_data.documents", tokenColumn)).isFalse(); - assertThat(dbc.isEmpty("_data.documents", sentenceColumn)).isFalse(); } } @Test - public void testXmiDBWriterSplitAnnotationsDefaultAnnotationSchemas() throws Exception { + public void testXmiSubtypeStorage() throws Exception { AnalysisEngine xmiWriter = AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer", - XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{ Token.class.getCanonicalName(), Sentence.class.getCanonicalName()}, - XMIDBWriter.PARAM_ANNO_DEFAULT_QUALIFIER, "testschema", + XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{Token.class.getCanonicalName(), Sentence.class.getCanonicalName()}, XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfig, XMIDBWriter.PARAM_STORE_ALL, false, XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true, - XMIDBWriter.PARAM_TABLE_DOCUMENT, "_data.documents", + XMIDBWriter.PARAM_TABLE_DOCUMENT, "_data.documents3", XMIDBWriter.PARAM_DO_GZIP, false, XMIDBWriter.PARAM_STORE_RECURSIVELY, true, XMIDBWriter.PARAM_UPDATE_MODE, true, - XMIDBWriter.PARAM_BASE_DOCUMENT_ANNOTATION_TYPES, new String[]{MeshHeading.class.getCanonicalName(), AbstractText.class.getCanonicalName(), Title.class.getCanonicalName(), de.julielab.jcore.types.pubmed.Header.class.getCanonicalName()} + XMIDBWriter.PARAM_BASE_DOCUMENT_ANNOTATION_TYPES, new String[]{InternalReference.class.getCanonicalName()} ); JCas jCas = getJCasWithRequiredTypes(); final Header header = new Header(jCas); header.setDocId("789"); header.addToIndexes(); - jCas.setDocumentText("This is a sentence. 
This is another one."); - new Sentence(jCas, 0, 19).addToIndexes(); - new Sentence(jCas, 20, 40).addToIndexes(); - // Of course, these token offsets are wrong, but it doesn't matter to the test - new Token(jCas, 0, 19).addToIndexes(); - new Token(jCas, 20, 40).addToIndexes(); + jCas.setDocumentText("This is a sentence.1,2"); + new de.julielab.jcore.types.pubmed.InternalReference(jCas, 19, 20).addToIndexes(); + new de.julielab.jcore.types.pubmed.InternalReference(jCas, 21, 22).addToIndexes(); assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); jCas.reset(); xmiWriter.collectionProcessComplete(); dbc = DBTestUtils.getDataBaseConnector(postgres); - try (CoStoSysConnection costoConn = dbc.obtainOrReserveConnection()) { - assertThat(dbc.tableExists("_data.documents")).isTrue(); + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.tableExists("_data.documents3")).isTrue(); + ResultSet rs = ignored.createStatement().executeQuery("SELECT " + XmiSplitConstants.BASE_DOC_COLUMN + " FROM " + "_data.documents3"); + assertThat(rs.next()).isTrue(); +// String documentString = rs.getString(1); +// System.out.println(documentString); - final List> infos = dbc.getTableColumnInformation("_data.documents", "column_name"); - final Set columnNames = infos.stream().map(info -> info.get("column_name")).map(String.class::cast).collect(Collectors.toSet()); + } + } - final String tokenColumn = "testschema$de_julielab_jcore_types_token"; - final String sentenceColumn = "testschema$de_julielab_jcore_types_sentence"; - assertThat(columnNames).contains(tokenColumn, sentenceColumn); + @Nested + class WriteWithMirrorSubsets { + /** + * This test checks that the XMI is split as intended and distributed into database table columns as annotation modules. 
+ * @throws Exception + */ + @Test + public void testXmiDBWriterSplitAnnotations() throws Exception { + + AnalysisEngine xmiWriter = getXmiWriterForDocuments2(); + JCas jCas = getJCasWithRequiredTypes(); + prepareDocument1(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + jCas.reset(); + prepareDocument2(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + xmiWriter.collectionProcessComplete(); + + dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2017"); + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.tableExists("_data.documents2")).isTrue(); + + assertThat(dbc.getTableColumnNames("_data.documents2")).contains("de_julielab_jcore_types_token", "de_julielab_jcore_types_sentence"); + assertThat(dbc.isEmpty("_data.documents2", XmiSplitConstants.BASE_DOC_COLUMN)).isFalse(); + assertThat(dbc.isEmpty("_data.documents2", XmiDataInserter.FIELD_MAX_XMI_ID)).isFalse(); + assertThat(dbc.isEmpty("_data.documents2", "sofa_mapping")).isFalse(); + assertThat(dbc.isEmpty("_data.documents2", "de_julielab_jcore_types_token")).isFalse(); + assertThat(dbc.isEmpty("_data.documents2", "de_julielab_jcore_types_sentence")).isFalse(); + + } + + // create a subset for nested tests and set its only entry to "processed" + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + dbc.createSubsetTable("_data._data_mirror", "_data.documents2", 1, "Test subset", "medline_2017"); + dbc.initMirrorSubset("_data._data_mirror", "_data.documents2", true, "medline_2017"); + List idsList = new ArrayList<>(); + idsList.add(new byte[][]{"789".getBytes(StandardCharsets.UTF_8)}); + idsList.add(new byte[][]{"890".getBytes(StandardCharsets.UTF_8)}); + dbc.setProcessed("_data._data_mirror", idsList); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(2); + } + } + + /** + * 
Produces the test XMI writer for this nested test group. It stores the base document which should cause + * mirror subsets to reset the "is processed" status to false for the written documents. + * @return The XMI writer for testing. + * @throws InvalidXMLException + * @throws IOException + * @throws ResourceInitializationException + */ + private AnalysisEngine getXmiWriterForDocuments2() throws InvalidXMLException, IOException, ResourceInitializationException { + return AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer", + XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{Token.class.getCanonicalName(), Sentence.class.getCanonicalName()}, + XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfig, + XMIDBWriter.PARAM_STORE_ALL, false, + XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true, + XMIDBWriter.PARAM_TABLE_DOCUMENT, "_data.documents2", + XMIDBWriter.PARAM_DO_GZIP, false, + XMIDBWriter.PARAM_STORE_RECURSIVELY, true, + XMIDBWriter.PARAM_UPDATE_MODE, true, + XMIDBWriter.PARAM_BASE_DOCUMENT_ANNOTATION_TYPES, new String[]{MeshHeading.class.getCanonicalName(), AbstractText.class.getCanonicalName(), Title.class.getCanonicalName(), de.julielab.jcore.types.pubmed.Header.class.getCanonicalName()} + ); + } + + /** + * Prepares the first of two documents used in these nested tests. + * @param jCas The CAS to populate with the test data. + */ + private void prepareDocument1(JCas jCas) { + final Header header = new Header(jCas); + header.setDocId("789"); + header.addToIndexes(); + jCas.setDocumentText("This is a sentence. This is another one."); + new Sentence(jCas, 0, 19).addToIndexes(); + new Sentence(jCas, 20, 40).addToIndexes(); + // Of course, these token offsets are wrong, but it doesn't matter to the test + new Token(jCas, 0, 19).addToIndexes(); + new Token(jCas, 20, 40).addToIndexes(); + } + + /** + * Prepares the second of two documents used in these nested tests. + * @param jCas The CAS to populate with the test data. 
+ */ + private void prepareDocument2(JCas jCas) { + final Header header2 = new Header(jCas); + header2.setDocId("890"); + header2.addToIndexes(); + jCas.setDocumentText("Sentence of document 2."); + new Sentence(jCas, 0, jCas.getDocumentText().length()).addToIndexes(); + } + + /** + * Default case: mirror subsets should be reset after writing the base document + */ + @Nested + class CheckMirrorSubsetIsReset { + @Test + public void testMirrorSubsetReset() throws Exception { + AnalysisEngine xmiWriter = getXmiWriterForDocuments2(); + JCas jCas = getJCasWithRequiredTypes(); + prepareDocument1(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + jCas.reset(); + prepareDocument2(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + xmiWriter.collectionProcessComplete(); + + dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2017"); + + // check that the subset table has been reset + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(0); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.TOTAL)).total).isEqualTo(2); + // set it again to processed for the next test + List idsList = new ArrayList<>(); + idsList.add(new byte[][]{"789".getBytes(StandardCharsets.UTF_8)}); + idsList.add(new byte[][]{"890".getBytes(StandardCharsets.UTF_8)}); + dbc.setProcessed("_data._data_mirror", idsList); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(2); + } + } + } + + /** + * The interesting test case: Given a DBProcessingMetaData annotation that specifies that the document + * text hasn't changed between a former document version in the database and the newly written version, + * the mirror subsets should not be reset to "is not 
processed" for the given document. + */ + @Nested + class CheckMirrorSubsetIsNotReset { + @Test + public void testMirrorSubsetNotReset() throws Exception { + // precondition check: the mirror subset is currently processed + // this main test will be to ensure that the mirror subset stays this way + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(2); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.TOTAL)).total).isEqualTo(2); + } + AnalysisEngine xmiWriter = getXmiWriterForDocuments2(); + JCas jCas = getJCasWithRequiredTypes(); + prepareDocument1(jCas); + // This is the important part: tell the writer not to reset mirror subsets for this document + DBProcessingMetaData processingMetaData = new DBProcessingMetaData(jCas); + processingMetaData.setIsDocumentHashUnchanged(true); + StringArray pk = new StringArray(jCas, 1); + pk.set(0, "789"); + processingMetaData.setPrimaryKey(pk); + processingMetaData.addToIndexes(); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + jCas.reset(); + prepareDocument2(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + xmiWriter.collectionProcessComplete(); + + dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2017"); + + // check that the subset table has NOT been reset for document 789 but for the other + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(1); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.TOTAL)).total).isEqualTo(2); + } + } } } + } diff --git a/jcore-xmi-reader/component.meta b/jcore-xmi-reader/component.meta index 701192b4c..57ad76f80 100644 --- 
a/jcore-xmi-reader/component.meta +++ b/jcore-xmi-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xmi-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe XMI Collection Reader" } diff --git a/jcore-xmi-reader/pom.xml b/jcore-xmi-reader/pom.xml index ea0dcd482..a6017493b 100644 --- a/jcore-xmi-reader/pom.xml +++ b/jcore-xmi-reader/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -24,8 +24,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-xmi-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-reader.xml b/jcore-xmi-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-reader.xml index a7701f7e3..d21e7b29b 100644 --- a/jcore-xmi-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-reader.xml +++ b/jcore-xmi-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-reader.xml @@ -6,7 +6,7 @@ XmiCollectionReader A CollectionReader which reads CAS data stored as XMI files from the file system. The reader grounds on IBM's XmiCollectionReader delivered with older versions of UIMA and has been extended by the Julie Lab team at the University of Jena. This XMI reader is capable of reading (g)zipped XMI files and is able to recursively search subdirectories of a delivered root directory for XMI files. 
- 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xmi-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiCollectionReaderTest.java b/jcore-xmi-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiCollectionReaderTest.java index 17fda0be8..2d360f427 100644 --- a/jcore-xmi-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiCollectionReaderTest.java +++ b/jcore-xmi-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiCollectionReaderTest.java @@ -16,9 +16,9 @@ import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiCollectionReaderTest { @Test diff --git a/jcore-xmi-writer/component.meta b/jcore-xmi-writer/component.meta index 48695ccb1..7afe174fc 100644 --- a/jcore-xmi-writer/component.meta +++ b/jcore-xmi-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xmi-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe XMI Writer" } diff --git a/jcore-xmi-writer/pom.xml b/jcore-xmi-writer/pom.xml index 950de517b..0babbc06a 100644 --- a/jcore-xmi-writer/pom.xml +++ b/jcore-xmi-writer/pom.xml @@ -11,7 +11,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -29,8 +29,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-xmi-writer/src/main/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.java b/jcore-xmi-writer/src/main/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.java index 6a33348dd..4762f809e 100644 --- a/jcore-xmi-writer/src/main/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.java +++ b/jcore-xmi-writer/src/main/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.java @@ -295,7 +295,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { String fileName = 
outFileName.toString(); try { writeXmi(jcas.getCas(), fileName); - LOGGER.info(" Wrote file " + fileName); + LOGGER.debug(" Wrote file " + fileName); } catch (IOException e) { try { throw new ResourceProcessException(e); diff --git a/jcore-xmi-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-writer.xml b/jcore-xmi-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-writer.xml index cfd5692d9..a4af702ed 100644 --- a/jcore-xmi-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-writer.xml +++ b/jcore-xmi-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-writer.xml @@ -6,7 +6,7 @@ XMIWriter - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xmi-writer/src/test/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumerTest.java b/jcore-xmi-writer/src/test/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumerTest.java index 1242372d6..e6b7006e2 100644 --- a/jcore-xmi-writer/src/test/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumerTest.java +++ b/jcore-xmi-writer/src/test/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumerTest.java @@ -24,15 +24,15 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Test for class {@link CasToXmiConsumer} @@ -91,7 +91,7 @@ public boolean accept(File file, String name) { * Delete all files ending with "xmi" or "xmi.gzip" in the output directory, * and do the same for all subdirectories of outputDir, recursively */ - @Before + @BeforeEach public void 
clearDirectory() { File outputDir = new File(OUTPUT_FOLDER_XMI); removeXmiGzipAndZipFiles(outputDir); @@ -121,7 +121,7 @@ private void removeXmiGzipAndZipFiles(File dir) { /** * Create the CasConsumer under test */ - @Before + @BeforeEach public void createConsumer() { // XMLInputSource source; try { diff --git a/jcore-xmi-writer/src/test/resources/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.xml b/jcore-xmi-writer/src/test/resources/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.xml index 1453038df..7538342bb 100644 --- a/jcore-xmi-writer/src/test/resources/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.xml +++ b/jcore-xmi-writer/src/test/resources/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.xml @@ -6,7 +6,7 @@ XMIWriter - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xml-db-reader/component.meta b/jcore-xml-db-reader/component.meta index 6fde40ce7..37ac82af4 100644 --- a/jcore-xml-db-reader/component.meta +++ b/jcore-xml-db-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xml-db-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe XML Database Reader" } diff --git a/jcore-xml-db-reader/pom.xml b/jcore-xml-db-reader/pom.xml index 72a3652f7..99ecaa819 100644 --- a/jcore-xml-db-reader/pom.xml +++ b/jcore-xml-db-reader/pom.xml @@ -15,7 +15,7 @@ de.julielab jedis-parent - 2.5.1-SNAPSHOT + 2.6.0 ../jedis-parent @@ -23,7 +23,7 @@ de.julielab jcore-db-reader - 2.5.1-SNAPSHOT + 2.6.0 de.julielab @@ -51,7 +51,7 @@ de.julielab jcore-xml-mapper - 2.5.1-SNAPSHOT + 2.6.0 de.julielab @@ -64,8 +64,25 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine + + + de.julielab + jcore-db-test-utilities + + + org.assertj + assertj-core + + + de.julielab + jcore-descriptor-creator + + + ch.qos.logback + logback-classic + provided https://github.com/JULIELab/jcore-base/jcore-xml-db-reader diff --git 
a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/CasPopulator.java b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/CasPopulator.java index 9dc0e6559..0175ec384 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/CasPopulator.java +++ b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/CasPopulator.java @@ -2,10 +2,12 @@ import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.reader.xmlmapper.mapper.XMLMapper; +import de.julielab.jcore.types.Header; import org.apache.uima.jcas.JCas; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -18,13 +20,18 @@ public class CasPopulator { private final XMLMapper xmlMapper; private Row2CasMapper row2CasMapper; private String[] rowMappingArray; - private BiConsumer dbProcessingMetaDataSetter; + private int truncationSize; - public CasPopulator(DataBaseConnector dbc, XMLMapper xmlMapper, Row2CasMapper row2CasMapper, String[] rowMappingArray) { + public CasPopulator(DataBaseConnector dbc, XMLMapper xmlMapper, Row2CasMapper row2CasMapper, String[] rowMappingArray, int truncationSize) { this.dbc = dbc; this.xmlMapper = xmlMapper; this.row2CasMapper = row2CasMapper; this.rowMappingArray = rowMappingArray; + this.truncationSize = truncationSize; + } + + public CasPopulator(DataBaseConnector dbc, XMLMapper xmlMapper, Row2CasMapper row2CasMapper, String[] rowMappingArray) { + this(dbc, xmlMapper, row2CasMapper, rowMappingArray, Integer.MAX_VALUE); } public void populateCas(JCas jcas, byte[][] arrayArray, BiConsumer dbProcessingMetaDataSetter) throws CasPopulationException { @@ -68,7 +75,15 @@ public void populateCas(JCas jcas, byte[][] arrayArray, BiConsumer (pkIndices.size() + 1)) { diff --git a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java 
b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java index b7e041f2d..5c0373e24 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java +++ b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java @@ -1,23 +1,33 @@ package de.julielab.jcore.reader.xml; +import de.julielab.costosys.configuration.FieldConfig; +import de.julielab.costosys.dbconnection.CoStoSysConnection; import de.julielab.jcore.reader.db.DBMultiplier; import de.julielab.jcore.reader.db.DBReader; import de.julielab.jcore.reader.xmlmapper.mapper.XMLMapper; +import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.types.ext.DBProcessingMetaData; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.AbstractCas; +import org.apache.uima.cas.FeatureStructure; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.*; import java.util.stream.Collectors; @ResourceMetaData(name = "JCoRe XML Database Multiplier", description = "This CAS multiplier receives information about " + @@ -28,10 +38,19 @@ "CAS with them via the 'RowMapping' parameter. 
This component is part of the Jena Document Information System, " + "JeDIS." , vendor = "JULIE Lab Jena, Germany", copyright = "JULIE Lab Jena, Germany") +@TypeCapability(inputs = {"de.julielab.jcore.types.casmultiplier.RowBatch"}, outputs = {"de.julielab.jcore.types.casflow.ToVisit"}) public class XMLDBMultiplier extends DBMultiplier { -private final static Logger log = LoggerFactory.getLogger(XMLDBMultiplier.class); public static final String PARAM_ROW_MAPPING = Initializer.PARAM_ROW_MAPPING; public static final String PARAM_MAPPING_FILE = Initializer.PARAM_MAPPING_FILE; + public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; + public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; + public static final String PARAM_TABLE_DOCUMENT_SCHEMA = "DocumentTableSchema"; + public static final String PARAM_TO_VISIT_KEYS = "ToVisitKeys"; + public static final String PARAM_ADD_TO_VISIT_KEYS = "AddToVisitKeys"; + public static final String PARAM_ADD_UNCHANGED_DOCUMENT_TEXT_FLAG = "AddUnchangedDocumentTextFlag"; + public static final String PARAM_TRUNCATE_AT_SIZE = "TruncateAtSize"; + + private final static Logger log = LoggerFactory.getLogger(XMLDBMultiplier.class); /** * Mapper which maps medline XML to a CAS with the specified UIMA type system * via an XML configuration file. @@ -41,8 +60,26 @@ public class XMLDBMultiplier extends DBMultiplier { protected String[] rowMappingArray; @ConfigurationParameter(name = PARAM_MAPPING_FILE, description = XMLDBReader.DESC_MAPPING_FILE) protected String mappingFileStr; + @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController and XMIDBWriter. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. 
Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS is directly routed to the components specified in the " + PARAM_TO_VISIT_KEYS + " parameter, skipping all other components. Note that this only works with AAEs where the first component is an 'AnnotationControlledFlow'. Additionally, the DBProcessingMetaData#hasDocumentHashChanged is set. This can be used by the XMIDBWriter to omit the reset of mirror subsets when updating the base document when the actual CAS text stayed the same.") + private String documentItemToHash; + @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, mandatory = false, description = "For use with AnnotationDefinedFlowController. String parameter indicating the name of the " + + "table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController.") + private String xmiStorageDataTable; + @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT_SCHEMA, mandatory = false, description = "For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the " + PARAM_TABLE_DOCUMENT + " parameter - adheres to. Only the primary key part is required for hash value retrieval.") + private String xmiStorageDataTableSchema; + @ConfigurationParameter(name = PARAM_TO_VISIT_KEYS, mandatory = false, description = "For use with AnnotationDefinedFlowController. Specifies the delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. 
The task of the AnnotationDefinedFlowController is then to read those annotations and route the CAS accordingly.") + private String[] toVisitKeys; + @ConfigurationParameter(name = PARAM_ADD_TO_VISIT_KEYS, mandatory = false, description = "Toggles the creation of annotations for the AnnotationDefinedFlowController. Only needed when such a flow controller is used in the pipeline. For details, see the description of " + PARAM_TO_VISIT_KEYS + ".") + private boolean addToVisitKeys; + @ConfigurationParameter(name = PARAM_ADD_UNCHANGED_DOCUMENT_TEXT_FLAG, mandatory = false, description = "Toggles the addition of the 'document text is unchanged' flag. The value of this flag is determined via a SHA256 hash of the CAS document text. When " + PARAM_TABLE_DOCUMENT + " and " + PARAM_TABLE_DOCUMENT_SCHEMA + " are specified, the hash value of the document in storage is retrieved and compared to the current value. The flag is then set with respect to the comparison result.") + private boolean addUnchangedDocumentTextFlag; + @ConfigurationParameter(name = PARAM_TRUNCATE_AT_SIZE, mandatory = false, description = "Specify size in bytes of the XML document size. If the document surpasses that size, it is not populated from XMI but given some placeholder information. 
This can be necessary when large documents cannot be handled by subsequent components in the pipeline.") + private int truncationSize; + + private Row2CasMapper row2CasMapper; private CasPopulator casPopulator; + private Map docId2HashMap; private boolean initialized; @@ -51,14 +88,33 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept super.initialize(aContext); mappingFileStr = (String) aContext.getConfigParameterValue(PARAM_MAPPING_FILE); rowMappingArray = (String[]) aContext.getConfigParameterValue(PARAM_ROW_MAPPING); - + xmiStorageDataTable = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT); + xmiStorageDataTableSchema = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT_SCHEMA); + documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); + toVisitKeys = (String[]) aContext.getConfigParameterValue(PARAM_TO_VISIT_KEYS); + addToVisitKeys = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_TO_VISIT_KEYS)).orElse(false); + addUnchangedDocumentTextFlag = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_UNCHANGED_DOCUMENT_TEXT_FLAG)).orElse(false); + truncationSize = Optional.ofNullable((Integer)aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(Integer.MAX_VALUE); // We don't know yet which tables to read. Thus, we leave the row mapping out. // We will now once the DBMultiplier#process(JCas) will have been run. Initializer initializer = new Initializer(mappingFileStr, null, null); xmlMapper = initializer.getXmlMapper(); initialized = false; + + if ((addToVisitKeys || addUnchangedDocumentTextFlag)) { + if (!(xmiStorageDataTable == null && xmiStorageDataTableSchema == null) && !(xmiStorageDataTable != null && xmiStorageDataTableSchema != null && documentItemToHash != null)) { + String errorMsg = String.format("From the parameters '%s' and '%s' some are specified and some aren't. 
To activate hash value comparison in order to add aggregate component keys for CAS visit, specify all those parameters. Otherwise, specify none.", PARAM_TABLE_DOCUMENT, PARAM_TABLE_DOCUMENT_SCHEMA); + log.error(errorMsg); + throw new ResourceInitializationException(new IllegalArgumentException(errorMsg)); + } + } } + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException { + super.process(aJCas); + docId2HashMap = fetchCurrentHashesFromDatabase(JCasUtil.selectSingle(aJCas, RowBatch.class)); + } @Override public AbstractCas next() throws AnalysisEngineProcessException { @@ -73,11 +129,14 @@ public AbstractCas next() throws AnalysisEngineProcessException { } // The DBC is initialized in the super class in the process() method. Thus, at this point // the DBC should be set. - casPopulator = new CasPopulator(dbc, xmlMapper, row2CasMapper, rowMappingArray); + if (xmiStorageDataTable != null && !dbc.withConnectionQueryBoolean(d -> d.tableExists(xmiStorageDataTable))) + throw new AnalysisEngineProcessException(new IllegalArgumentException("The data table" + xmiStorageDataTable + " to retrieve hash values from for document text change detection does not exist in the database: " + dbc.getDbURL())); + casPopulator = new CasPopulator(dbc, xmlMapper, row2CasMapper, rowMappingArray, truncationSize); initialized = true; } byte[][] documentData = documentDataIterator.next(); populateCas(jCas, documentData); + setToVisitAnnotation(jCas); } } catch (Exception e) { log.error("Exception occurred: ", e); @@ -86,6 +145,49 @@ public AbstractCas next() throws AnalysisEngineProcessException { return jCas; } + /** + *

Creates a {@link ToVisit} annotation based on document text hash comparison and the defined parameter values.

+ *

Computes the hash of the newly read CAS and compares it to the hash for the same document retrieved from the + * database, if present. If there was a hash in the database and the hash values are equal, creates the ToVisit + * annotation and adds the toVisitKeys passed in the configuration of this component.

+ * + * @param jCas The newly read JCas. + */ + private void setToVisitAnnotation(JCas jCas) { + if (addToVisitKeys || addUnchangedDocumentTextFlag) { + DBProcessingMetaData dbProcessingMetaData = JCasUtil.selectSingle(jCas, DBProcessingMetaData.class); + StringArray pkArray = dbProcessingMetaData.getPrimaryKey(); + String pkString = String.join(",", pkArray.toArray()); + String existingHash = docId2HashMap.get(pkString); + if (existingHash != null) { + String newHash = getHash(jCas); + if (existingHash.equals(newHash)) { + if (log.isTraceEnabled()) + log.trace("Document {} has a document text hash that equals the one present in the database. Creating a ToVisit annotation routing it only to the components with delegate keys {}.", pkString, toVisitKeys); + if (addUnchangedDocumentTextFlag) + dbProcessingMetaData.setIsDocumentHashUnchanged(true); + if (addToVisitKeys) { + ToVisit toVisit = new ToVisit(jCas); + if (toVisitKeys != null && toVisitKeys.length != 0) { + StringArray keysArray = new StringArray(jCas, toVisitKeys.length); + keysArray.copyFromArray(toVisitKeys, 0, 0, toVisitKeys.length); + toVisit.setDelegateKeys(keysArray); + } + toVisit.addToIndexes(); + } + } + } else { + log.trace("No existing hash was found for document {}", pkString); + } + } + } + + private String getHash(JCas newCas) { + final String documentText = newCas.getDocumentText(); + final byte[] sha = DigestUtils.sha256(documentText.getBytes()); + return Base64.encodeBase64String(sha); + } + private void populateCas(JCas jCas, byte[][] documentData) throws AnalysisEngineProcessException { try { casPopulator.populateCas(jCas, documentData, @@ -96,8 +198,54 @@ private void populateCas(JCas jCas, byte[][] documentData) throws AnalysisEngine } protected List> getAllRetrievedColumns() { - List> fields = new ArrayList>(); Pair>> numColumnsAndFields = dbc.getNumColumnsAndFields(tables.length > 1, schemaNames); return 
numColumnsAndFields.getRight().stream().map(HashMap::new).collect(Collectors.toList()); } + + /** + *

Fetches the hashes of the currently stored documents in the database.

+ * + * @param rowBatch The annotation specifying which documents should be fetched by the multiplier and then be processed by the aggregate. + * @return A map from a string representation of the RowBatches document IDs to the hashes for the respective IDs. + * @throws AnalysisEngineProcessException If the SQL request fails. + */ + private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { + if ((addToVisitKeys || addUnchangedDocumentTextFlag) && rowBatch.getIdentifiers() != null && rowBatch.getIdentifiers().size() > 0) { + String hashColumn = documentItemToHash + "_sha256"; + // Extract the document IDs in this RowBatch. The IDs could be composite keys. + List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); + Iterator documentIDsIt = rowBatch.getIdentifiers().iterator(); + while (documentIDsIt.hasNext()) { + StringArray pkArray = (StringArray) documentIDsIt.next(); + documentIds.add(pkArray.toStringArray()); + } + Map id2hash = new HashMap<>(documentIds.size()); + // This is the map we want to fill that lets us look up the hash of the document text by document ID. + String sql = null; + // Query the database for the document IDs in the current RowBatch and retrieve hashes. 
+ try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + FieldConfig xmiTableSchema = dbc.getFieldConfiguration(xmiStorageDataTableSchema); + String idQuery = documentIds.stream() + .map(key -> Arrays.stream(key).map(part -> "%s='" + part + "'").toArray(String[]::new)) + .map(xmiTableSchema::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) + .collect(Collectors.joining(" OR ")); + sql = String.format("SELECT %s,%s FROM %s WHERE %s", xmiTableSchema.getPrimaryKeyString(), hashColumn, xmiStorageDataTable, idQuery); + ResultSet rs = conn.createStatement().executeQuery(sql); + while (rs.next()) { + StringBuilder pkSb = new StringBuilder(); + for (int i = 0; i < xmiTableSchema.getPrimaryKey().length; i++) + pkSb.append(rs.getString(i + 1)).append(','); + // Remove trailing comma + pkSb.deleteCharAt(pkSb.length() - 1); + String hash = rs.getString(xmiTableSchema.getPrimaryKey().length + 1); + id2hash.put(pkSb.toString(), hash); + } + } catch (SQLException e) { + log.error("Could not retrieve hashes from the database. SQL query was '{}':", sql, e); + throw new AnalysisEngineProcessException(e); + } + return id2hash; + } + return null; + } } diff --git a/jcore-xml-db-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/jcore-xml-db-reader.xml b/jcore-xml-db-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/jcore-xml-db-reader.xml index 6b562101f..a4539bcc8 100644 --- a/jcore-xml-db-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/jcore-xml-db-reader.xml +++ b/jcore-xml-db-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/jcore-xml-db-reader.xml @@ -7,7 +7,7 @@ A collection reader that receives XML document data from a PostgreSQL database. It employs the jcore-xml-mapper to populate UIMA CAS instances with the XML data according to a mapping file. For the same functionality without using a database, refer to the jcore-xml-reader. 
- 2.5.1-SNAPSHOT + 2.6.0 JULIE Lab, Germany diff --git a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java new file mode 100644 index 000000000..ae154a30f --- /dev/null +++ b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java @@ -0,0 +1,222 @@ +package de.julielab.jcore.reader.xml; + + +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.jcore.db.test.DBTestUtils; +import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.PostgreSQLContainer; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class 
XMLDBMultiplierTest { + + private static final String SOURCE_XML_TABLE = "source_xml_table"; + private static final String TARGET_XMI_TABLE = "target_xmi_table"; + private static final String PMID_FIELD_NAME = "pmid"; + private static final String DOCID_FIELD_NAME = "docid"; + private static final String XML_FIELD_NAME = "xml"; + private static final String BASE_DOCUMENT_FIELD_NAME = "base_document"; + private static final String HASH_FIELD_NAME = "documentText_sha256"; + private static final String MAX_XMI_ID_FIELD_NAME = "max_xmi_id"; + private static final String SOFA_MAPPING_FIELD_NAME = "sofa_mapping"; + private static final String SUBSET_TABLE = "test_subset"; + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); + private static String costosysConfig; + + @BeforeAll + public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { + postgres.start(); + DBTestUtils.createAndSetHiddenConfig(Path.of("src", "test", "resources", "hiddenConfig").toString(), postgres); + + DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2016_nozip"); + costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2016_nozip", 2, postgres); + new File(costosysConfig).deleteOnExit(); + try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + // We create two tables. One is the XML table the multiplier reads from and maps the contents to the JCas. + // The other is a simulation of an XMI table used to serialize CAS instances via the jcore-xmi-db-writer. + // We need that target table to test the hash value comparison mechanism: If a document does not exist + // in the target table or has a non-matching hash on its document text, proceed as normal. + // But if the hash matches, we want to reserve the possibility to skip most part of the subsequent pipeline. 
+ // For this, we could use the AnnnotationDefinedFlowController for jcore-flow-controllers. This controller + // looks for annotations of the ToVisit type that specify which exact components in an aggregate should + // be applied to the CAS carrying the ToVisit annotation. + prepareSourceXMLTable(dbc, conn); + prepareTargetXMITable(dbc, conn); + } + dbc.defineSubset(SUBSET_TABLE, SOURCE_XML_TABLE, "Test subset"); + assertThat(dbc.getNumRows(SOURCE_XML_TABLE)).isEqualTo(10); + assertThat(dbc.getNumRows(TARGET_XMI_TABLE)).isEqualTo(5); + + dbc.close(); + } + + private static void prepareSourceXMLTable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + String xmlFmt = "%dThis is document text number %d"; + dbc.createTable(SOURCE_XML_TABLE, "Test table for hash comparison test."); + String sql = String.format("INSERT INTO %s (%s,%s) VALUES (?,?)", SOURCE_XML_TABLE, PMID_FIELD_NAME, XML_FIELD_NAME); + PreparedStatement ps = conn.prepareStatement(sql); + for (int i = 0; i < 10; i++) { + String xml = String.format(xmlFmt, i, i); + ps.setString(1, String.valueOf(i)); + ps.setString(2, xml); + ps.addBatch(); + } + ps.executeBatch(); + } + + private static void prepareTargetXMITable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + String documentTextFmt = "This is document text number %d"; + dbc.createTable(TARGET_XMI_TABLE, "xmi_text", "Test table for hash comparison test."); + dbc.assureColumnsExist(TARGET_XMI_TABLE, List.of(HASH_FIELD_NAME), "text"); + String sql = String.format("INSERT INTO %s (%s,%s,%s,%s,%s) VALUES (?,XMLPARSE(CONTENT ?),?,?,?)", TARGET_XMI_TABLE, DOCID_FIELD_NAME, BASE_DOCUMENT_FIELD_NAME, HASH_FIELD_NAME, MAX_XMI_ID_FIELD_NAME, SOFA_MAPPING_FIELD_NAME); + PreparedStatement ps = conn.prepareStatement(sql); + // Note that we only add half of the documents compared to the source XML import. This way we test + // if the code behaves right when the target document does not yet exist at all. 
+ for (int i = 0; i < 5; i++) { + String xml = String.format(documentTextFmt, i, i); + ps.setString(1, String.valueOf(i)); + ps.setString(2, xml); + // For one document in the "target XMI" table we put in a wrong hash. Thus, this document should not trigger + // the "toVisit" mechanism. + if (i != 3) + ps.setString(3, getHash(xml)); + else ps.setString(3, "someanotherhash"); + ps.setInt(4, 0); + ps.setString(5, "dummy"); + ps.addBatch(); + } + ps.executeBatch(); + } + + @AfterAll + public static void tearDown() { + postgres.stop(); + } + + private static String getHash(String str) { + final byte[] sha = DigestUtils.sha256(str.getBytes()); + return Base64.encodeBase64String(sha); + } + + @Test + public void testMultiplier() throws Exception { + JCas jCas = prepareCas(); + AnalysisEngine engine = AnalysisEngineFactory.createEngine(XMLDBMultiplier.class, XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "test-mappingfile.xml").toString()); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List documentTexts = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + documentTexts.add(newCas.getDocumentText()); + newCas.release(); + } + assertThat(documentTexts).containsExactly("This is document text number 0", "This is document text number 1", "This is document text number 2", "This is document text number 3", "This is document text number 4", "This is document text number 5", "This is document text number 6", "This is document text number 7", "This is document text number 8", "This is document text number 9"); + } + + /** + * Creates a JCas and adds a RowBatch for all 10 documents in the source XML table as well as the data table and subset table and schema names. + * + * @return A JCas prepared for the tests in this class. + * @throws UIMAException If some UIMA operation fails. 
+ */ + private JCas prepareCas() throws UIMAException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.jcore-casflow-types"); + RowBatch rowBatch = new RowBatch(jCas); + StringArray dataTable = new StringArray(jCas, 1); + dataTable.set(0, SOURCE_XML_TABLE); + rowBatch.setTables(dataTable); + StringArray tableSchema = new StringArray(jCas, 1); + tableSchema.set(0, "medline_2016_nozip"); + rowBatch.setTableSchemas(tableSchema); + rowBatch.setTableName(SUBSET_TABLE); + FSArray pks = new FSArray(jCas, 10); + // Read all documents + for (int i = 0; i < 10; i++) { + StringArray pk = new StringArray(jCas, 1); + pk.set(0, String.valueOf(i)); + pks = JCoReTools.addToFSArray(pks, pk); + } + rowBatch.setIdentifiers(pks); + rowBatch.setCostosysConfiguration(costosysConfig); + rowBatch.addToIndexes(); + return jCas; + } + + @Test + public void testHashComparison() throws Exception { + JCas jCas = prepareCas(); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); + AnalysisEngine engine = AnalysisEngineFactory.createEngine(XMLDBMultiplier.class, tsDesc, + XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "test-mappingfile.xml").toString(), + XMLDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", + XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text", + XMLDBMultiplier.PARAM_TO_VISIT_KEYS, "ThisIsTheVisitKey", + XMLDBMultiplier.PARAM_ADD_TO_VISIT_KEYS, true + ); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List toVisitKeys = new ArrayList<>(); + while 
(jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + Collection select = JCasUtil.select(newCas, ToVisit.class); + select.forEach(tv -> tv.getDelegateKeys().forEach(k -> toVisitKeys.add(k))); + newCas.release(); + } + // There are 4 documents in the target table with the correct hash so we expect the delegate key 5 times + assertThat(toVisitKeys).containsExactly("ThisIsTheVisitKey", "ThisIsTheVisitKey", "ThisIsTheVisitKey", "ThisIsTheVisitKey"); + } + + @Test + public void testHashComparison2() throws Exception { + JCas jCas = prepareCas(); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); + // In this test, we do not specify the keys to visit; the whole subsequent pipeline should be skipped. + // To indicate that, there should be ToVisit annotations but they should be null. 
+ AnalysisEngine engine = AnalysisEngineFactory.createEngine(XMLDBMultiplier.class, tsDesc, + XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "test-mappingfile.xml").toString(), + XMLDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", + XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text", + XMLDBMultiplier.PARAM_ADD_TO_VISIT_KEYS, true + ); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List emptyToVisitAnnotation = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + Collection select = JCasUtil.select(newCas, ToVisit.class); + select.stream().filter(tv -> tv.getDelegateKeys() == null).forEach(emptyToVisitAnnotation::add); + newCas.release(); + } + // There are 4 documents in the target table with the correct hash so we expect the delegate key 5 times + assertThat(emptyToVisitAnnotation).hasSize(4); + } +} diff --git a/jcore-xml-db-reader/src/test/resources/test-mappingfile.xml b/jcore-xml-db-reader/src/test/resources/test-mappingfile.xml new file mode 100644 index 000000000..22af9d7cc --- /dev/null +++ b/jcore-xml-db-reader/src/test/resources/test-mappingfile.xml @@ -0,0 +1,17 @@ + + + + /xml/text + + + + de.julielab.jcore.types.Header + + + docId + + /xml/docid + java.lang.String + + + \ No newline at end of file diff --git a/jcore-xml-mapper/README.md b/jcore-xml-mapper/README.md index 5bcf986c3..e1fa47aac 100644 --- a/jcore-xml-mapper/README.md +++ b/jcore-xml-mapper/README.md @@ -3,7 +3,7 @@ NOTE: This is not a UIMA component but rather a library used by some JCoRe compo This is a generic XML mapper to create CAS instances reflecting contents of XML documents. ### Objective -The JULIE Lab XMLMapper is a mapper which maps XML elements from an XML document onto (UIMA) Types or Type Features. For that task it uses a mapping file, which comes as an input. 
+The JULIE Lab XMLMapper is a mapper which maps XML elements from an XML document onto (UIMA) types or type features. For that task it uses a mapping file, which comes as an input. Examples for mapping files are found in some [jcore-projects](https://github.com/JULIELab/jcore-projects) components, for example the [jcore-pubmed-reader](https://github.com/JULIELab/jcore-projects/tree/master/jcore-pubmed-reader), its MEDLINE-pendant or the database versions of both. @@ -14,4 +14,101 @@ The input and output of an AE is done via annotation objects. The classes corres ### Using the AE - Descriptor Configuration In UIMA, each component is configured by a descriptor in XML. Such a preconfigured descriptor is available under `src/main/resources/de/julielab/jcore/ ` but it can be further edited if so desired; see [UIMA SDK User's Guide](https://uima.apache.org/downloads/releaseDocs/2.1.0-incubating/docs/html/tools/tools.html#ugr.tools.cde) for further information. +### Mapping File Syntax +Please note that this section is incomplete. The mapping file of the [jcore-pubmed-reader](https://github.com/JULIELab/jcore-projects/tree/master/jcore-pubmed-reader) includes examples for all supported features. + +The basic structure of the mapping file consists of the `` root element, a `` root child element and an arbitrary number of `` ('type system type', referring to the UIMA type system to be employed) root child elements: + +```xml + + + ... + + + ... + + + ... + + ... + +``` + +## Document Text +The CAS document text is populated with the `` mapping element. It defines an arbitrary number of `` elements of whose mapping values the document text will be comprised, in the order of the `` elements in the mapping file. Each document part is given a mandatory, manually defined ID which can be referred to in order to create a UIMA annotation covering the respective document part text. The location of the actual character data in the mapped document XML files is specified via XPath. 
+ +```xml + + + /MedlineCitation/Article/ArticleTitle + + + /MedlineCitation/Article/Abstract + + +``` + +This example collects the article title, and the abstract of a MEDLINE XML document for the CAS document text. + +The `` may have an optional child element named ``. This is useful or even a necessity when the document structure for this element is not static, i.e. has a varying number of children. In such a case, a user-delivered class on the classpath can be specified. This class must implement the `de.julielab.jcore.reader.xmlmapper.mapper.DocumentTextPartParser` interface and received the document XML element that the XPath in the mapping file points to. It then returns a list of strings using to comprise the respective part of the document text: + +```xml + + /MedlineCitation/Article/Abstract + + de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser + + +``` + +The `StructuredAbstractParser` is able to parse the child elements of `/MedlineCitation/Article/Abstract`, namely `AbstractText` elements which also have attributes, `Label` and `NlmCategory`. Those are details to the MEDLINE XML format and are just use here as an example use case for external parsers. + +## UIMA Type Annotations + +Annotations are added with the `` element. Its main children are `` and ``, defining the actual type to be instantiated and any feature values that should be added to the type. Since a UIMA type feature can itself be a type, `` elements can be nested. Then, the `` child of a `` element is resolved *relative* the `` of the parent `` element. Thus, when the parent `` element does not specify an `` element, which is perfectly legal, the given xpath is resolved from the XML document root: + +```xml + + fully qualified UIMA type name + + feature name of the type + true if the feature value is a UIMA feature structure (annotation) itself + + The value data type of the feature as it is passed to the setter for this feature in Java code. 
+ This can also be an array type, e.g. org.apache.uima.jcas.cas.FSArray. + + + optional if the parent tsFullClassName is an array type + true + + absolute xpath since the parent does not specify an xpath + + + fully qualified UIMA type name of this nested type + + + + name of this feature relative to the parent fsFullClassName type + relative xpath to the parent xpath + a primitive data type (or a string) since this is not a UIMA type itself (missing isType element). + + + + +``` +The above example showcases the structure of a nested annotation, i.e. a feature path. The outer type will have another type as feature value which in turn has a primitive value as the final feature value. + +**Important** The `` values are evaluated for *all occurrences* of the respective XPath in the XML document. Thus, the above annotations will be created for all XPath matches. This holds true for every level of `` specifications. This allows collecting child XML document elements into arrays. An outer xpath points to the collection document elements, and an inner xpath points the children. + +The `` element again accepts the child element ``. In this case, the external parser needs to implement the `de.julielab.jcore.reader.xmlmapper.typeParser.TypeParser` interface. It might be helpful to extend the class `de.julielab.jcore.reader.xmlmapper.typeParser.StandardTypeParser` and use its `parseSingleType` method. + +Finally, the `` element accepts the `` child element which can point to a part of document text, thus create an annotation for the respective document text part as identified by its ID: + +```xml + + + 0 + + +``` diff --git a/jcore-xml-mapper/pom.xml b/jcore-xml-mapper/pom.xml index dab5025f2..85aa0825f 100644 --- a/jcore-xml-mapper/pom.xml +++ b/jcore-xml-mapper/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 @@ -22,16 +22,6 @@ jcore-types ${jcore-types-version}
- - org.easytesting - fest-reflect - 1.2 - - - org.easytesting - fest-util - 1.1.4 - de.julielab julie-xml-tools @@ -44,8 +34,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeFactory.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeFactory.java index f9408edad..0a36ccc70 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeFactory.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeFactory.java @@ -65,11 +65,11 @@ public class TypeFactory { /** * creates a new instance of the TypeFactory * - * @param mappingFile + * @param mappingFileData The mapping file contents. */ public TypeFactory(byte[] mappingFileData) { this.mappingFileData = mappingFileData; - types = new ArrayList(); + types = new ArrayList<>(); this.documentTextParser = new DocumentTextHandler(); } @@ -132,7 +132,7 @@ public List createTemplates() throws CollectionException { } } else { if (!nodeName.equals(ROOT)) { - LOGGER.warn("unknown tag in mapping file: " + nodeName + "!!"); + LOGGER.warn("unknown tag in mapping file (note that element names are case sensitive): " + nodeName); } } } @@ -156,7 +156,8 @@ private void fillDocumentParser(XMLEventReader reader) throws XMLStreamException id = Integer.parseInt(next.getValue()); documentTextParser.addPartOfDocumentTextXPath(id); } - } else { + } + else { LOGGER.error("no id for " + PART_OF_DOCUMENT_TEXT); throw new RuntimeException(); } @@ -172,7 +173,7 @@ private void fillDocumentParser(XMLEventReader reader) throws XMLStreamException if (xpath.length() > 0 && id >= 0) { documentTextParser.setXPathForPartOfDocumentText(id, xpath); } else { - LOGGER.error("Unkown data in " + DOCUMENT_TEXT + "/" + VALUE_X_PATH + " tag "); + LOGGER.error("Unknown data in " + DOCUMENT_TEXT + "/" + VALUE_X_PATH + " tag "); } } else if 
(nodeName.equals(EXTERNAL_PARSER)){ event = reader.nextEvent(); @@ -183,7 +184,7 @@ private void fillDocumentParser(XMLEventReader reader) throws XMLStreamException if (externalParserClassName.length() > 0 && id >= 0) { documentTextParser.setExternalParserForPartOfDocument(id, externalParserClassName); } else { - LOGGER.error("Unkown data in " + DOCUMENT_TEXT + "/" + VALUE_X_PATH + " tag "); + LOGGER.error("Unknown data in " + DOCUMENT_TEXT + "/" + VALUE_X_PATH + " tag "); } } else { @@ -242,15 +243,11 @@ private TypeTemplate parseType(XMLEventReader reader) throws XMLStreamException, type.addAdditionalData(event.asCharacters().getData().trim(), index); } } else { - LOGGER.warn("unknown tag in mapping file: " + nodeName + "!!"); + LOGGER.warn("unknown tag in mapping file (note that element names are case sensitive): " + nodeName); } } event = reader.nextEvent(); } - // reflection type anlegen - // iteration über alle features - // if(feature.type==null) - // über getter bestimmen return type; } @@ -273,7 +270,7 @@ private void parseOffset(TypeTemplate type, XMLEventReader reader) throws XMLStr } } } else { - LOGGER.error("Unknown element in mapping file: " + nodeName); + LOGGER.error("Unknown element in mapping file (note that element names are case sensitive): " + nodeName); } } } @@ -315,7 +312,7 @@ private FeatureTemplate parseFeature(XMLEventReader reader) throws XMLStreamExce FeatureTemplate newFeature = parseFeature(reader); feature.addFeature(newFeature); } else { - LOGGER.warn("unknown tag in mapping file: " + nodeName + "!!"); + LOGGER.warn("unknown tag in mapping file (note that element names are case sensitive): " + nodeName); } } event = reader.nextEvent(); diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeTemplate.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeTemplate.java index 383dc3215..466350e8c 100644 --- 
a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeTemplate.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeTemplate.java @@ -24,7 +24,6 @@ import java.util.*; -import static org.fest.reflect.core.Reflection.constructor; /** * Represents a Template for a type which Contains a List of Feature Templates @@ -119,14 +118,17 @@ public void setFullClassName(String fullClassName) { public void setParser(String trim) throws CollectionException { if (trim != null) { externalParser = true; - Class externalParserClass; + Class externalParserClass = null; try { externalParserClass = Class.forName(trim); + this.parser = (TypeParser) externalParserClass.getConstructor().newInstance(); } catch (ClassNotFoundException e) { LOGGER.error("ExternalParser " + trim + " for type or feature " + fullClassName + " returns a ClassNotFoundException", e); throw new CollectionException(e); + } catch (Exception e) { + LOGGER.error("Could not create instance of class {}: ", externalParserClass, e); + throw new CollectionException(e); } - this.parser = (TypeParser) constructor().in(externalParserClass).newInstance(); }else{ this.parser = null; } diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/DocumentTextHandler.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/DocumentTextHandler.java index 02218ee8b..4ef868e6f 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/DocumentTextHandler.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/DocumentTextHandler.java @@ -25,9 +25,6 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicInteger; - -import static org.fest.reflect.core.Reflection.constructor; /** * Handels to parse the DocumentText @@ -128,16 +125,19 @@ public void setXPathForPartOfDocumentText(int 
id, String xpath) { public void setExternalParserForPartOfDocument(int id, String externalParserClassName) throws CollectionException { if (externalParserClassName != null) { - Class externalParserClass; + Class externalParserClass = null; + DocumentTextPartParser parser; try { externalParserClass = Class.forName(externalParserClassName.trim()); + parser = (DocumentTextPartParser) externalParserClass.getConstructor().newInstance(); } catch (ClassNotFoundException e) { LOGGER.error("ExternalParser " + externalParserClassName + " for document text part " + id + " returns a ClassNotFoundException", e); throw new CollectionException(e); + } catch (Exception e) { + LOGGER.error("Could not create instance of {}: ", externalParserClass, e); + throw new CollectionException(e); } - DocumentTextPartParser parser = (DocumentTextPartParser) constructor().in(externalParserClass).newInstance(); this.docTextData.get(id).setParser(parser); } } - } diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java index 5881ab36a..84efc4d6b 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java @@ -23,106 +23,108 @@ * component, if required.
* NOTE: Using this parser, the AbstractText annotation is already * created and should not be set in the mapping file. - * + * * @author faessler - * */ public class StructuredAbstractParser implements DocumentTextPartParser { - private static final boolean newlineBetweenSections = true; + private static final boolean newlineBetweenSections = true; - public List parseDocumentPart(VTDNav vn, PartOfDocument docTextPart, int offset, JCas jCas, - byte[] identifier) { - String baseXPath = docTextPart.getXPath(); + public List parseDocumentPart(VTDNav vn, PartOfDocument docTextPart, int offset, JCas jCas, + byte[] identifier) { + String baseXPath = docTextPart.getXPath(); - List> fields = new ArrayList<>(); - Map field = new HashMap<>(); - field.put(JulieXMLConstants.NAME, "Label"); - field.put(JulieXMLConstants.XPATH, "@Label"); - fields.add(field); + List> fields = new ArrayList<>(); + Map field = new HashMap<>(); + field.put(JulieXMLConstants.NAME, "Label"); + field.put(JulieXMLConstants.XPATH, "@Label"); + fields.add(field); - field = new HashMap<>(); - field.put(JulieXMLConstants.NAME, "NlmCategory"); - field.put(JulieXMLConstants.XPATH, "@NlmCategory"); - fields.add(field); + field = new HashMap<>(); + field.put(JulieXMLConstants.NAME, "NlmCategory"); + field.put(JulieXMLConstants.XPATH, "@NlmCategory"); + fields.add(field); - field = new HashMap<>(); - field.put(JulieXMLConstants.NAME, "AbstractText"); - field.put(JulieXMLConstants.XPATH, "."); - fields.add(field); - Iterator> rowIterator = JulieXMLTools.constructRowIterator(vn, baseXPath + "/AbstractText", - fields, new String(identifier)); - List abstractParts = new ArrayList<>(); - // for the text contents - StringBuilder sb = new StringBuilder(); + field = new HashMap<>(); + field.put(JulieXMLConstants.NAME, "AbstractText"); + field.put(JulieXMLConstants.XPATH, "."); + fields.add(field); + Iterator> rowIterator = JulieXMLTools.constructRowIterator(vn, baseXPath + "/AbstractText", + fields, new 
String(identifier)); + List abstractParts = new ArrayList<>(); + // for the text contents + StringBuilder sb = new StringBuilder(); - int sectionOffset = offset; - while (rowIterator.hasNext()) { - Map abstractSectionData = rowIterator.next(); - String label = (String) abstractSectionData.get("Label"); - String nlmCategory = (String) abstractSectionData.get("NlmCategory"); - String abstractSectionText = (String) abstractSectionData.get("AbstractText"); - if (newlineBetweenSections) { - // in case the last section was empty, we delete the trailing - // newline - if (sb.length() > 0 && StringUtils.isBlank(abstractSectionText)) { - sb.deleteCharAt(sb.length() - 1); - --sectionOffset; - } - } - sb.append(abstractSectionText); + int sectionOffset = offset; + while (rowIterator.hasNext()) { + Map abstractSectionData = rowIterator.next(); + String label = (String) abstractSectionData.get("Label"); + String nlmCategory = (String) abstractSectionData.get("NlmCategory"); + String abstractSectionText = ((String) abstractSectionData.get("AbstractText")); + if (newlineBetweenSections) { + // in case the last section was empty, we delete the trailing + // newline + if (sb.length() > 0 && StringUtils.isBlank(abstractSectionText)) { + sb.deleteCharAt(sb.length() - 1); + --sectionOffset; + } + } + // comment in to add the structured abstract section labels to the text, e.g. "AIMS: ...", "BACKGROUND: ..." 
+// if (null != label && !"unlabelled".equalsIgnoreCase(label)) +// sb.append(label).append(": "); + sb.append(abstractSectionText); - // if label and nlmCategory are null, there is no section heading; - // most probably this just isn't a structured abstract - if (null != label || null != nlmCategory) { - AbstractSectionHeading abstractPartHeading = new AbstractSectionHeading(jCas); - abstractPartHeading.setLabel(label); - abstractPartHeading.setNlmCategory(nlmCategory); - abstractPartHeading.setTitleType("abstractSection"); - abstractPartHeading.addToIndexes(); + // if label and nlmCategory are null, there is no section heading; + // most probably this just isn't a structured abstract + if (null != label || null != nlmCategory) { + AbstractSectionHeading abstractPartHeading = new AbstractSectionHeading(jCas); + abstractPartHeading.setLabel(label); + abstractPartHeading.setNlmCategory(nlmCategory); + abstractPartHeading.setTitleType("abstractSection"); + abstractPartHeading.addToIndexes(); - AbstractSection abstractPart = new AbstractSection(jCas); - abstractPart.setBegin(sectionOffset); - sectionOffset += abstractSectionText.length(); - abstractPart.setEnd(sectionOffset); - abstractPart.setAbstractSectionHeading(abstractPartHeading); - abstractPart.addToIndexes(); + AbstractSection abstractPart = new AbstractSection(jCas); + abstractPart.setBegin(sectionOffset); + sectionOffset += abstractSectionText.length(); + abstractPart.setEnd(sectionOffset); + abstractPart.setAbstractSectionHeading(abstractPartHeading); + abstractPart.addToIndexes(); - abstractParts.add(abstractPart); - } else { - sectionOffset += abstractSectionText.length(); - } + abstractParts.add(abstractPart); + } else { + sectionOffset += abstractSectionText.length(); + } - // let's insert a line break after each section text - if (newlineBetweenSections && sb.length() > 0 && rowIterator.hasNext()) { - sb.append("\n"); - ++sectionOffset; - } - } + // let's insert a line break after each section text + 
if (newlineBetweenSections && sb.length() > 0 && rowIterator.hasNext()) { + sb.append("\n"); + ++sectionOffset; + } + } - // only create an abstract annotation if there actually is an abstract - if (!abstractParts.isEmpty() || sectionOffset > offset) { - if (sectionOffset == offset) { - // there was no abstract but just empty abstract sections; decrement the offsets so we stay with existing document text - --offset; - --sectionOffset; - for (AbstractSection section : abstractParts) { - section.setBegin(offset); - section.setEnd(offset); - } - } - AbstractText abstractText = new AbstractText(jCas, offset, sectionOffset); - abstractText.setAbstractType("main"); - if (abstractParts.size() > 0) { - FSArray sectionsArray = new FSArray(jCas, abstractParts.size()); - for (int i = 0; i < abstractParts.size(); ++i) - sectionsArray.set(i, abstractParts.get(i)); - abstractText.setStructuredAbstractParts(sectionsArray); - } - abstractText.addToIndexes(); - return Collections.singletonList(sb.toString()); - } - return Collections.emptyList(); - } + // only create an abstract annotation if there actually is an abstract + if (!abstractParts.isEmpty() || sectionOffset > offset) { + if (sectionOffset == offset) { + // there was no abstract but just empty abstract sections; decrement the offsets so we stay with existing document text + --offset; + --sectionOffset; + for (AbstractSection section : abstractParts) { + section.setBegin(offset); + section.setEnd(offset); + } + } + AbstractText abstractText = new AbstractText(jCas, offset, sectionOffset); + abstractText.setAbstractType("main"); + if (abstractParts.size() > 0) { + FSArray sectionsArray = new FSArray(jCas, abstractParts.size()); + for (int i = 0; i < abstractParts.size(); ++i) + sectionsArray.set(i, abstractParts.get(i)); + abstractText.setStructuredAbstractParts(sectionsArray); + } + abstractText.addToIndexes(); + return Collections.singletonList(sb.toString()); + } + return Collections.emptyList(); + } } diff --git 
a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/XMLMapper.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/XMLMapper.java index 603c91ea8..c2875d739 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/XMLMapper.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/XMLMapper.java @@ -30,6 +30,7 @@ import org.slf4j.LoggerFactory; import java.io.*; +import java.nio.charset.StandardCharsets; import java.util.List; /** @@ -49,6 +50,23 @@ public class XMLMapper { private DocumentTextHandler documentTextHandler; + private boolean ignoreTrivialWhitespaces = true; + + /** + *

+ * Whether or not to ignore trivial XML whitespaces and newlines according to {@link VTDGen#enableIgnoredWhiteSpace(boolean)}. + *

+ *

+ * Activating this will ignore whitespaces that exist between XML tags and have no other character data. + * This is not always desired behavior. Inline-annotated text may contain whitespaces between two tags that + * should actually retained in the document text. + *

+ * @param ignoreTrivialWhitespaces + */ + public void setIgnoreTrivialWhitespaces(boolean ignoreTrivialWhitespaces) { + this.ignoreTrivialWhitespaces = ignoreTrivialWhitespaces; + } + /** * Creates an new instacne of the XMLMapper * @@ -80,7 +98,7 @@ public void parse(byte[] data, byte[] identifier, JCas jcas) { // needed for extraction of mixed-content-XML // when there is a whitespace only between two // tags, e.g. ... ... - vg.enableIgnoredWhiteSpace(true); + vg.enableIgnoredWhiteSpace(!ignoreTrivialWhitespaces); vg.setDoc(data); vg.parse(true); VTDNav vn = vg.getNav(); @@ -140,7 +158,7 @@ private void buildTypes(byte[] identifier, JCas jcas, VTDNav vn) throws Collecti builder.buildType(concreteType, jcas); } } catch (Exception e) { - LOG.error("", e); + LOG.error("Exception occurred in document ID {}", new String(identifier, StandardCharsets.UTF_8), e); } } } diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java index ca3bbec18..a010092c1 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java @@ -27,9 +27,6 @@ import java.util.HashMap; -import static org.fest.reflect.core.Reflection.constructor; -import static org.fest.reflect.core.Reflection.method; - /** * In this class, the actual UIMA types are built from the templates which have * been filled with values by the type parsers before. The standard type builder @@ -120,7 +117,11 @@ private Annotation buildSingleInstance(ConcreteType concreteType, JCas jcas) thr if (concreteType.getConcreteFeatures() != null) { // Create the UIMA type corresponding to the type description in // concreteType. 
- type = (Annotation) constructor().withParameterTypes(JCas.class).in(typeClass).newInstance(jcas); + try { + type = (Annotation) typeClass.getConstructor(JCas.class).newInstance(jcas); + } catch (Exception e){ + throw new CollectionException(e); + } // For each feature this type has, set the corret feature value. for (ConcreteFeature concreteFeature : concreteType.getConcreteFeatures()) { @@ -147,11 +148,10 @@ private Annotation buildSingleInstance(ConcreteType concreteType, JCas jcas) thr // itself. if (standardJavaTypesMap.get(concreteFeature.getFullClassName()) != null) { featureClass = standardJavaTypesMap.get(concreteFeature.getFullClassName()); - method(methodName).withParameterTypes(featureClass).in(type) - .invoke(parseValueStringToValueType(concreteFeature.getValue(), concreteFeature.getFullClassName())); + type.getClass().getMethod(methodName, featureClass).invoke(type, parseValueStringToValueType(concreteFeature.getValue(), concreteFeature.getFullClassName())); } else if (concreteFeature.getFullClassName().equals("String") || concreteFeature.getFullClassName().equals("java.lang.String")) { featureClass = Class.forName(concreteFeature.getFullClassName()); - method(methodName).withParameterTypes(featureClass).in(type).invoke(concreteFeature.getValue()); + typeClass.getMethod(methodName, featureClass).invoke(type, concreteFeature.getValue()); } else { String featureClassName = concreteFeature.getFullClassName(); if (StringUtils.isBlank(featureClassName)) @@ -160,7 +160,7 @@ private Annotation buildSingleInstance(ConcreteType concreteType, JCas jcas) thr + "\" the feature value class (e.g. String, Integer, another type...) 
was not defined in the mapping file."); featureClass = Class.forName(featureClassName); TOP top = concreteFeature.getTypeTemplate().getParser().getTypeBuilder().buildType(concreteFeature, jcas); - method(methodName).withParameterTypes(featureClass).in(type).invoke(top); + type.getClass().getMethod(methodName, featureClass).invoke(type, top); } } catch (Throwable e) { LOGGER.error("Wrong Feature Type: " + concreteFeature.getFullClassName(), e); diff --git a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/EncodingTest.java b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/EncodingTest.java index 23a256259..9d61cd532 100644 --- a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/EncodingTest.java +++ b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/EncodingTest.java @@ -13,9 +13,9 @@ import com.ximpleware.AutoPilot; import com.ximpleware.VTDGen; import com.ximpleware.VTDNav; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class EncodingTest { @Test diff --git a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLMapperTest.java b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLMapperTest.java index 8b3efcb59..a3e682208 100644 --- a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLMapperTest.java +++ b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLMapperTest.java @@ -33,13 +33,13 @@ import org.apache.uima.resource.metadata.ExternalResourceBinding; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.HashMap; import java.util.Map; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; /** * TODO insert description diff --git 
a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReader.java b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReader.java index b24c27a13..63ee44bbe 100644 --- a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReader.java +++ b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReader.java @@ -32,6 +32,7 @@ import java.io.*; import java.util.ArrayList; import java.util.List; +import java.util.Optional; /** * Generic XML {@link CollectionReader}. Uses a mapping file to map elements of the XML document to @@ -44,6 +45,7 @@ public class XMLReader extends CollectionReader_ImplBase { private static final Logger LOGGER = LoggerFactory.getLogger(XMLReader.class); public static final String PARAM_INPUT_DIR = "InputDirectory"; public static final String PARAM_INPUT_FILE = "InputFile"; + public static final String PARAM_IGNORE_TRIVIAL_WS = "IgnoreTrivialWS"; public static final String RESOURCE_MAPPING_FILE = "MappingFile"; private List files = null; private int currentIndex = 0; @@ -59,6 +61,7 @@ public void initialize() throws ResourceInitializationException { String inputDir = (String) getUimaContext().getConfigParameterValue(PARAM_INPUT_DIR); String inputFile = (String) getUimaContext().getConfigParameterValue(PARAM_INPUT_FILE); + boolean ignoreTrivialWs = (boolean) Optional.ofNullable(getUimaContext().getConfigParameterValue(PARAM_IGNORE_TRIVIAL_WS)).orElse(true); InputStream is = null; try { is = getUimaContext().getResourceAsStream(RESOURCE_MAPPING_FILE); @@ -101,6 +104,7 @@ public void initialize() throws ResourceInitializationException { try { xmlMapper = new XMLMapper(JulieXMLTools.readStream(is, 1000)); + xmlMapper.setIgnoreTrivialWhitespaces(ignoreTrivialWs); } catch (FileNotFoundException e) { throw new ResourceInitializationException(e); } catch (IOException e) { diff --git a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReaderTest.java 
b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReaderTest.java index 1ecb95ccd..62de41982 100644 --- a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReaderTest.java +++ b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReaderTest.java @@ -6,8 +6,8 @@ package de.julielab.jcore.reader.xmlmapper; -import de.julielab.jcore.types.*; import de.julielab.jcore.types.Date; +import de.julielab.jcore.types.*; import de.julielab.jcore.types.pubmed.Header; import de.julielab.jcore.types.pubmed.ManualDescriptor; import org.apache.uima.UIMAException; @@ -29,7 +29,7 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -38,10 +38,10 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.util.*; import java.util.List; +import java.util.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; /** * Test for class MedlineReader @@ -174,6 +174,7 @@ public XMLReaderTest() { @Test public void testSingleEntityData() throws Throwable { + // medlineReader = createCollectionReader("src/test/resources/XMLReaderDescriptor_medline_singleFile2.xml"); try { assertTrue(medlineReader.hasNext()); @@ -183,7 +184,7 @@ public void testSingleEntityData() throws Throwable { if (DEBUG_MODE) { serializeCas(cas); } - assertTrue("test documenttext", cas.getDocumentText() != null && cas.getDocumentText().length() > 0); + assertTrue(cas.getDocumentText() != null && cas.getDocumentText().length() > 0); assertEquals( "Mitigation of graft-versus-host disease in rats treated with allogeneic and xenogeneic antilymphocytic sera.\nThis is a very short test abstract.", cas.getDocumentText()); @@ -191,7 +192,7 @@ public void 
testSingleEntityData() throws Throwable { int counter = 0; String[] types = new String[] { ":::diso:2,3", ":::spe", ":::pgn" }; String[] texts = new String[] { "graft-versus-host disease", "rats", "sera" }; - assertTrue("No entity mentions found in the CAS", iter.hasNext()); + assertTrue(iter.hasNext(), "No entity mentions found in the CAS"); while (iter.hasNext()) { EntityMention text = (EntityMention) iter.next(); String coveredText = text.getCoveredText(); @@ -398,8 +399,8 @@ public void testMissingInputDirectory() { medlineReader = getCollectionReader(DESC_XML_READER_MISSING_INPUT_DIR); fail("Expected exception was not thrown"); } catch (Exception e) { - assertTrue("Exception should be an instance of ResourceInitializationException , but was " - + e.getClass().getName(), e instanceof ResourceInitializationException); + assertTrue(e instanceof ResourceInitializationException, "Exception should be an instance of ResourceInitializationException , but was " + + e.getClass().getName()); } } @@ -497,71 +498,71 @@ private void checkElements() { String pmid = getPMID(cas); if (pmid.equals("11119751")) { checkCount++; - assertTrue("Invalid keyWordList", checkKeywords(cas, EXPECTED_KEYWORDS)); - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid DBInfoList", ckeckDBInfos(cas, EXPECTED_DB_INFO)); - assertTrue("Invalid MeshHeading", checkMeshHeadings(cas, EXPECTED_MESH_HEADINGS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Chemical", checkChemicals(cas, EXPECTED_CHEMICALS)); - assertTrue("Invalid Header", checkHeader(cas, EXPECTED_HEADER)); - assertTrue("Invalid ManualDescriptor", checkManualDescriptor(cas)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid DocumentText", checkDocumentText(cas, EXPECTED_DOCUMENT_TEXT)); - assertTrue("Invalid AbstractText", checkAbstractText(cas, EXPECTED_ABSTRACT_TEXT)); - assertTrue("Invalid 
Title", checkTitle(cas, EXPECTED_TITLE)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkKeywords(cas, EXPECTED_KEYWORDS), "Invalid keyWordList"); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(ckeckDBInfos(cas, EXPECTED_DB_INFO), "Invalid DBInfoList"); + assertTrue(checkMeshHeadings(cas, EXPECTED_MESH_HEADINGS), "Invalid MeshHeading"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(checkChemicals(cas, EXPECTED_CHEMICALS), "Invalid Chemical"); + assertTrue(checkHeader(cas, EXPECTED_HEADER), "Invalid Header"); + assertTrue(checkManualDescriptor(cas), "Invalid ManualDescriptor"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkDocumentText(cas, EXPECTED_DOCUMENT_TEXT), "Invalid DocumentText"); + assertTrue(checkAbstractText(cas, EXPECTED_ABSTRACT_TEXT), "Invalid AbstractText"); + assertTrue(checkTitle(cas, EXPECTED_TITLE), "Invalid Title"); + assertTrue(!checkSentences(cas), "Sentences Found"); } // check medline XML without most lists (gene, keywords,...) 
if (pmid.equals("11119751-a")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(!checkSentences(cas), "Sentences Found"); } // check medline XML with pub date: 2000 // Spring-Summer if (pmid.equals("11119751-b")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_1)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_1), "Invalid PubDate"); + assertTrue(!checkSentences(cas), "Sentences Found"); } // check medline XML with pub date: 2000 Dec // 23-30 if (pmid.equals("11119751-c")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_2)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_2), "Invalid PubDate"); + assertTrue(!checkSentences(cas), "Sentences Found"); } // check medline XML pub date: 2000 Oct-2001 // Mar if (pmid.equals("11119751-d")) { - 
assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_3)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_3), "Invalid PubDate"); + assertTrue(!checkSentences(cas), "Sentences Found"); } if (pmid.equals("8045680")) { checkCount++; - assertTrue("No Sentences Found", checkSentences(cas)); + assertTrue(checkSentences(cas), "No Sentences Found"); // assertTrue("Invalid Header", checkHeader(cas, // EXPECTED_HEADER_OTHER_LANGUAGE)); } if (pmid.equals("12626969")) { checkCount++; - assertTrue("No Sentences Found", checkSentences(cas)); + assertTrue(checkSentences(cas), "No Sentences Found"); // assertTrue("Invalid Header", checkHeader(cas, // EXPECTED_HEADER_OTHER_LANGUAGE)); } if (pmid.equals("11119751-e")) { - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; // assertTrue("Invalid Header", checkHeader(cas, // EXPECTED_HEADER_OTHER_LANGUAGE)); @@ -569,25 +570,25 @@ private void checkElements() { // test the case that only a title is found and no abstractText // (documentText should be equal to title in this case) if (pmid.equals("17276851")) { - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; - assertTrue("Invalid Document Title", checkTitle(cas, EXPECTED_TITLE_2)); - assertTrue("Invalid Document Text", 
checkDocumentText(cas, EXPECTED_TITLE_2)); + assertTrue(checkTitle(cas, EXPECTED_TITLE_2), "Invalid Document Title"); + assertTrue(checkDocumentText(cas, EXPECTED_TITLE_2), "Invalid Document Text"); } // PubMed has changed the XML element ForeName to FirstName, but // foreName should still // be supported if (pmid.equals("18439884")) { - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; - assertTrue("Invalid foreName", checkForeNames(cas, EXPECTED_FORE_NAMES)); + assertTrue(checkForeNames(cas, EXPECTED_FORE_NAMES), "Invalid foreName"); checkJournalTitle(cas, EXPECTED_JOURNAL_TITLE); } if (pmid.equals("17306504")) { - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; - assertTrue("Invalid pubTypeList", checkPubTypeList(cas, EXPECTED_PUBTYPES)); - assertTrue("Invalid DOI", checkDoi(cas, EXPECTED_DOI)); + assertTrue(checkPubTypeList(cas, EXPECTED_PUBTYPES), "Invalid pubTypeList"); + assertTrue(checkDoi(cas, EXPECTED_DOI), "Invalid DOI"); } } assertEquals(11, checkCount); @@ -668,7 +669,7 @@ private boolean checkAbstractText(CAS cas, String abstractTextString2) { * * @param cas * The CAS - * @param title + * @param expectedTitle * The correct title * @return true if the correct title is contained in the CAS */ @@ -1006,9 +1007,9 @@ private boolean checkSentences(CAS cas) { int count = 0; while (iter.hasNext()) { Sentence s = (Sentence) iter.next(); - assertTrue("Sentence has an ID", s.getId() != null); - assertTrue("Sentence has an Begin", s.getBegin() >= 0); - assertTrue("Sentence has an End", s.getEnd() >= 0); + assertTrue(s.getId() != null, "Sentence has an ID"); + assertTrue(s.getBegin() >= 0, "Sentence has an Begin"); + assertTrue(s.getEnd() >= 0, "Sentence has an End"); count++; } if (count == 0) @@ -1052,7 +1053,7 @@ private boolean checkAuthors(CAS cas, String[][] authors) { * foreName, but both should be supported) * * 
@param cas - * @param foreName + * @param foreNames * @return */ private boolean checkForeNames(CAS cas, String[] foreNames) { @@ -1186,4 +1187,15 @@ public void testStructuredAbstract() throws UIMAException, IOException { // exists both). // EF March 2018: Haven't I done this already? Structured abstracts are handled } + + @Test + public void testNewlines() throws UIMAException, IOException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + CollectionReader reader = CollectionReaderFactory.createReader(XMLReader.class, XMLReader.PARAM_INPUT_FILE, + "src/test/resources/doc_medline_mathml_newlines.xml", XMLReader.RESOURCE_MAPPING_FILE, + "src/test/resources/newMappingFile.xml"); + reader.getNext(jCas.getCas()); + System.out.printf(jCas.getDocumentText()); + } } diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor.xml index b1878a690..99d571eff 100644 --- a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_Unicode_outside_BMP.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_Unicode_outside_BMP.xml index 31c6e8683..417fa726e 100755 --- a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_Unicode_outside_BMP.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_Unicode_outside_BMP.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_missingInputDir.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_missingInputDir.xml index f103e0d5a..bb46aef5a 100644 --- 
a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_missingInputDir.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_missingInputDir.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile.xml index bf791c1c9..caa322dba 100644 --- a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile2.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile2.xml index b0350909a..a7f19bbf8 100644 --- a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile2.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile2.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT + 2.6.0 @@ -28,6 +28,12 @@ false true + + IgnoreTrivialWS + Boolean + false + false + @@ -42,6 +48,12 @@ src/test/resources/pubmedDocumentTag/testfile.xml
+ + IgnoreTrivialWS + + false + +
diff --git a/jcore-xml-mapper/src/test/resources/doc_medline_mathml_newlines.xml b/jcore-xml-mapper/src/test/resources/doc_medline_mathml_newlines.xml new file mode 100644 index 000000000..7b1159592 --- /dev/null +++ b/jcore-xml-mapper/src/test/resources/doc_medline_mathml_newlines.xml @@ -0,0 +1,171 @@ + + 30712376 + + 2019 + 07 + 01 + + + 2020 + 12 + 15 + +
+ + 1029-2454 + + 35 + 1 + + 2019 + 01 + + + Biofouling + Biofouling + + An investigation into the effects of marine biofilm on the roughness and drag characteristics of surfaces coated with different sized cuprous oxide (Cu2O) particles. + + 15-33 + + 10.1080/08927014.2018.1559305 + + Biofilms typically increase surface roughness and consequently the drag penalties on marine vessels. However, there is a lack of data regarding the time-dependent influence of biofilms on antifouling surface characteristics and frictional drag, especially for surface coatings with different sizes of cuprous oxide ( + + + Cu + + + 2 + + + O + ). In this study, a series of pressure drop measurements was carried out using flat plates coated with different sizes of + + + Cu + + + 2 + + + O + . The cuprous oxide-containing surfaces were deployed at sea for a period of six months to allow biofilm to develop. Surface microstructure and roughness analyses were carried out every six weeks using scanning electron microscopy and laser roughness surface profilometry. From the data, the added frictional drag caused by biofilm on ships was predicted, based on roughness function using Granville extrapolations. The analyses indicated that biofilms had significant impacts by altering the surface microstructure, resulting in higher frictional drag. However, due to the interaction between the biofilm and the physico-chemical properties of the substratum for panels coated with larger + + + Cu + + + 2 + + + O + , the roughness and drag measurement results were both found to have fluctuating increments. + + + + Li + Chang + C + 0000-0003-3514-7857 + + Marine, Offshore and Subsea Technology group, School of Engineering, Newcastle University, Newcastle upon Tyne, UK. + + + + Atlar + Mehmet + M + + Department of Naval Architecture Ocean and Marine Engineering, University of Strathclyde, Glasgow, UK. 
+ + + + Haroutunian + Maryam + M + + Marine, Offshore and Subsea Technology group, School of Engineering, Newcastle University, Newcastle upon Tyne, UK. + + + + Norman + Rose + R + + Marine, Offshore and Subsea Technology group, School of Engineering, Newcastle University, Newcastle upon Tyne, UK. + + + + Anderson + Colin + C + + Department of Research and Development, American Chemet Corporation, East Helena, Montana 59635, USA. + + + + eng + + Journal Article + Research Support, Non-U.S. Gov't + + + 2019 + 02 + 04 + +
+ + England + Biofouling + 9200331 + 0892-7014 + + + + 789U1901C5 + Copper + + + T8BEA5064F + cuprous oxide + + + IM + + + Biofilms + + + Copper + chemistry + + + Friction + + + Materials Testing + + + Microscopy, Electron, Scanning + + + Seawater + + + Surface Properties + + + + added resistance + antifouling + biofilm + frictional drag + particle size + pressure drop measurement + roughness characteristic + roughness function + Cuprous oxide ( Cu 2 O ) + +
\ No newline at end of file diff --git a/jcore-xml-mapper/src/test/resources/medlineMappingFileStructuredAbstract.xml b/jcore-xml-mapper/src/test/resources/medlineMappingFileStructuredAbstract.xml index 9badb769f..eca924537 100644 --- a/jcore-xml-mapper/src/test/resources/medlineMappingFileStructuredAbstract.xml +++ b/jcore-xml-mapper/src/test/resources/medlineMappingFileStructuredAbstract.xml @@ -5,7 +5,8 @@ /MedlineCitation/Article/Abstract - de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser + + de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser diff --git a/jcore-xml-reader/component.meta b/jcore-xml-reader/component.meta index dec59c048..9f7f54c1e 100644 --- a/jcore-xml-reader/component.meta +++ b/jcore-xml-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xml-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0" }, "name": "JCoRe XML Reader" } diff --git a/jcore-xml-reader/pom.xml b/jcore-xml-reader/pom.xml index 1deddb382..348c8879e 100644 --- a/jcore-xml-reader/pom.xml +++ b/jcore-xml-reader/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0 jcore-xml-reader JCoRe XML Reader @@ -14,7 +14,7 @@ de.julielab jcore-xml-mapper - 2.5.1-SNAPSHOT + 2.6.0 org.slf4j @@ -137,8 +137,8 @@ assertj-core - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-xml-reader/src/main/java/de/julielab/jcore/multiplier/xml/XMLMultiplier.java b/jcore-xml-reader/src/main/java/de/julielab/jcore/multiplier/xml/XMLMultiplier.java index aafcb1e8a..4b6e4f8d1 100644 --- a/jcore-xml-reader/src/main/java/de/julielab/jcore/multiplier/xml/XMLMultiplier.java +++ b/jcore-xml-reader/src/main/java/de/julielab/jcore/multiplier/xml/XMLMultiplier.java @@ -189,7 +189,7 @@ public void process(JCas cas) throws AnalysisEngineProcessException { try { rowIterator = JulieXMLTools.constructRowIterator( JulieXMLTools.readStream(UriUtilities.getInputStreamFromUri(new 
java.net.URI(currentUri)), 1024), - 1024, forEach, fields, currentUri); + 1024, forEach, fields, currentUri, true); } catch (IOException | URISyntaxException e) { throw new AnalysisEngineProcessException(e); } diff --git a/jcore-xml-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/XMLMultiplierReader.xml b/jcore-xml-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/XMLMultiplierReader.xml index 34d04d1c7..be9956b4d 100644 --- a/jcore-xml-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/XMLMultiplierReader.xml +++ b/jcore-xml-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/XMLMultiplierReader.xml @@ -8,7 +8,7 @@ This reader is to be used with the JCoRe XML CAS Multiplier. The reader merely distributes the files to be read. The actual parsing is done by the multiplier.
- 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierReaderTest.java b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierReaderTest.java index 67faae92f..875be49ce 100644 --- a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierReaderTest.java +++ b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierReaderTest.java @@ -21,7 +21,6 @@ import de.julielab.jcore.types.Journal; import de.julielab.jcore.types.casmultiplier.JCoReURI; import de.julielab.jcore.types.pubmed.Header; -import junit.framework.TestCase; import org.apache.uima.UIMAException; import org.apache.uima.UIMAFramework; import org.apache.uima.cas.CAS; @@ -37,6 +36,7 @@ import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,10 +45,13 @@ import java.util.*; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + /** * Test for class XML Reader */ -public class XMLMultiplierReaderTest extends TestCase { +public class XMLMultiplierReaderTest { private static final Logger LOGGER = LoggerFactory.getLogger(XMLMultiplierReaderTest.class); @@ -80,6 +83,7 @@ public XMLMultiplierReaderTest() { } } + @Test public void testZipInput() throws UIMAException, IOException { JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.casmultiplier.jcore-uri-multiplier-types", "org.apache.uima.ducc.FlowControllerTS"); @@ -97,10 +101,10 @@ public void testZipInput() throws UIMAException, IOException { String fileName = it.next(); if (jCoReURI.getUri().endsWith(fileName)) { found = true; - assertTrue("File name " + fileName + " was already found", foundFileNames.add(fileName)); + 
assertTrue(foundFileNames.add(fileName), "File name " + fileName + " was already found"); } } - assertTrue("The URI " + jCoReURI.getUri()+ " was not matched by any expected file names", found); + assertTrue(found, "The URI " + jCoReURI.getUri()+ " was not matched by any expected file names"); jCas.reset(); } assertThat(expectedFileNames).isEqualTo(foundFileNames); @@ -111,6 +115,7 @@ public void testZipInput() throws UIMAException, IOException { * * @throws ResourceInitializationException */ + @Test public void testGetNextCas_singleFile() throws Exception { xmlMultiplierReader = CollectionReaderFactory.createReader(DESC_XML_MULTIPLIER_READER_DIR, XMLMultiplierReader.PARAM_INPUT_FILE, "src/test/resources/pubmedXML/pubmedsample18n0001.xml.gz"); @@ -125,6 +130,7 @@ public void testGetNextCas_singleFile() throws Exception { } + @Test public void testGetNextCas_directory() throws Exception { xmlMultiplierReader = CollectionReaderFactory.createReader(DESC_XML_MULTIPLIER_READER_DIR, XMLMultiplierReader.PARAM_INPUT_DIR, "src/test/resources/pubmedXML/"); diff --git a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierTest.java b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierTest.java index c3913c702..c757166ba 100644 --- a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierTest.java +++ b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierTest.java @@ -10,12 +10,12 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XMLMultiplierTest { diff --git a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLReaderTest.java 
b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLReaderTest.java index d1b67539f..cf54882b1 100644 --- a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLReaderTest.java +++ b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLReaderTest.java @@ -18,11 +18,10 @@ package de.julielab.jcore.reader; import de.julielab.jcore.reader.xml.XMLReader; -import de.julielab.jcore.types.*; import de.julielab.jcore.types.Date; +import de.julielab.jcore.types.*; import de.julielab.jcore.types.pubmed.Header; import de.julielab.jcore.types.pubmed.ManualDescriptor; -import junit.framework.TestCase; import org.apache.commons.lang3.ArrayUtils; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.metadata.AnalysisEngineMetaData; @@ -49,13 +48,16 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.util.*; import java.util.List; +import java.util.*; + +import static org.assertj.core.api.Fail.fail; +import static org.junit.jupiter.api.Assertions.*; /** * Test for class XML Reader */ -public class XMLReaderTest extends TestCase { +public class XMLReaderTest { private static final Logger LOGGER = LoggerFactory.getLogger(XMLReaderTest.class); @@ -228,7 +230,7 @@ public void testGetNextCas_singleFile() throws ResourceInitializationException { LOGGER.error(e.getMessage(), e); e.printStackTrace(); } - assertEquals("reading single file", EXPECTED_DOCUMENT_TEXT, cas.getDocumentText()); + assertEquals( EXPECTED_DOCUMENT_TEXT, cas.getDocumentText(), "reading single file"); } /** @@ -239,8 +241,8 @@ public void testMissingInputDirectory() { medlineReader = getCollectionReader(DESC_MEDLINE_READER_MISSING_INPUT_DIR); fail("Expected exception was not thrown"); } catch (Exception e) { - assertTrue("Exception should be an instance of ResourceInitializationException , but was " - + e.getClass().getName(), e instanceof ResourceInitializationException); + assertTrue(e instanceof 
ResourceInitializationException, "Exception should be an instance of ResourceInitializationException , but was " + + e.getClass().getName()); } } @@ -332,25 +334,25 @@ private void checkElements() { // check medline XML with all items if (getPMID(cas).equals("11119751")) { checkCount++; - assertTrue("Invalid keyWordList", checkKeywords(cas, EXPECTED_KEYWORDS)); - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid DBInfoList", ckeckDBInfos(cas, EXPECTED_DB_INFO)); - assertTrue("Invalid MeshHeading", checkMeshHeadings(cas, EXPECTED_MESH_HEADINGS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Chemical", checkChemicals(cas, EXPECTED_CHEMICALS)); - assertTrue("Invalid Header in document " + getPMID(cas), checkHeader(cas, EXPECTED_HEADER)); - assertTrue("Invalid ManualDescriptor", checkManualDescriptor(cas)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid DocumentText in document " + getPMID(cas), - checkDocumentText(cas, EXPECTED_DOCUMENT_TEXT)); - assertTrue("Invalid AbstractText", checkAbstractText(cas, EXPECTED_ABSTRACT_TEXT)); - assertTrue("Invalid Title", checkTitle(cas, EXPECTED_TITLE)); + assertTrue(checkKeywords(cas, EXPECTED_KEYWORDS), "Invalid keyWordList"); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(ckeckDBInfos(cas, EXPECTED_DB_INFO), "Invalid DBInfoList"); + assertTrue(checkMeshHeadings(cas, EXPECTED_MESH_HEADINGS), "Invalid MeshHeading"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(checkChemicals(cas, EXPECTED_CHEMICALS), "Invalid Chemical"); + assertTrue(checkHeader(cas, EXPECTED_HEADER), "Invalid Header in document " + getPMID(cas)); + assertTrue(checkManualDescriptor(cas), "Invalid ManualDescriptor"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkDocumentText(cas, 
EXPECTED_DOCUMENT_TEXT), + "Invalid DocumentText in document " + getPMID(cas)); + assertTrue(checkAbstractText(cas, EXPECTED_ABSTRACT_TEXT), "Invalid AbstractText"); + assertTrue(checkTitle(cas, EXPECTED_TITLE), "Invalid Title"); } // check medline XML without most lists (gene, keywords,...) if (getPMID(cas).equals("11119751-a")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); } @@ -358,30 +360,30 @@ private void checkElements() { // Spring-Summer if (getPMID(cas).equals("11119751-b")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal in document " + getPMID(cas), ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_1)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal in document " + getPMID(cas)); + assertTrue(checkPubDate(cas, EXPECTED_DATE_1), "Invalid PubDate"); } // check medline XML with pub date: 2000 Dec // 23-30 if (getPMID(cas).equals("11119751-c")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_2)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_2), "Invalid PubDate"); } // check medline XML pub date: 2000 Oct-2001 // Mar 
if (getPMID(cas).equals("11119751-d")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_3)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_3), "Invalid PubDate"); } if (getPMID(cas).equals("11119751-e")) { @@ -394,22 +396,22 @@ private void checkElements() { // (documentText should be equal to title in this case) if (getPMID(cas).equals("17276851")) { checkCount++; - assertTrue("Invalid Document Title", checkTitle(cas, EXPECTED_TITLE_2)); - assertTrue("Invalid Document Text", checkDocumentText(cas, EXPECTED_TITLE_2)); + assertTrue(checkTitle(cas, EXPECTED_TITLE_2), "Invalid Document Title"); + assertTrue(checkDocumentText(cas, EXPECTED_TITLE_2), "Invalid Document Text"); } // PubMed has changed the XML element ForeName to FirstName, but // foreName should still be supported if (getPMID(cas).equals("18439884")) { checkCount++; - assertTrue("Invalid foreName", checkForeNames(cas, EXPECTED_FORE_NAMES)); + assertTrue(checkForeNames(cas, EXPECTED_FORE_NAMES), "Invalid foreName"); checkJournalTitle(cas, EXPECTED_JOURNAL_TITLE); } if (getPMID(cas).equals("17306504")) { checkCount++; - assertTrue("Invalid pubTypeList", checkPubTypeList(cas, EXPECTED_PUBTYPES)); - assertTrue("Invalid DOI in document " + getPMID(cas), checkDoi(cas, EXPECTED_DOI)); + assertTrue(checkPubTypeList(cas, EXPECTED_PUBTYPES), "Invalid pubTypeList"); + assertTrue(checkDoi(cas, EXPECTED_DOI), "Invalid DOI in document " + getPMID(cas)); } } assertEquals(9, checkCount); @@ -491,7 +493,7 @@ private boolean checkAbstractText(CAS cas, String 
abstractTextString2) { * * @param cas * The CAS - * @param title + * @param expectedTitle * The correct title * @return true if the correct title is contained in the CAS */ @@ -896,7 +898,7 @@ private boolean checkAuthors(CAS cas, String[][] authors) { * Check if foreName was correctly parsed (PubMed changed firstName to foreName, but both should be supported) * * @param cas - * @param foreName + * @param foreNames * @return */ private boolean checkForeNames(CAS cas, String[] foreNames) { diff --git a/jcore-xml-reader/src/test/resources/MedlineReaderDescriptor_missingInputDir.xml b/jcore-xml-reader/src/test/resources/MedlineReaderDescriptor_missingInputDir.xml index cd9a3ac70..68d33c44e 100644 --- a/jcore-xml-reader/src/test/resources/MedlineReaderDescriptor_missingInputDir.xml +++ b/jcore-xml-reader/src/test/resources/MedlineReaderDescriptor_missingInputDir.xml @@ -5,7 +5,7 @@ MedlineReaderDescriptor_missingInputDir - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xml-reader/src/test/resources/PubmedXMLMultiplier.xml b/jcore-xml-reader/src/test/resources/PubmedXMLMultiplier.xml index d8ad0005b..1a8b378ab 100644 --- a/jcore-xml-reader/src/test/resources/PubmedXMLMultiplier.xml +++ b/jcore-xml-reader/src/test/resources/PubmedXMLMultiplier.xml @@ -5,7 +5,7 @@ PubmedXMLMultiplierDescriptor - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jcore-xml-reader/src/test/resources/XMLMultiplierReader.xml b/jcore-xml-reader/src/test/resources/XMLMultiplierReader.xml index 90a50848b..c32e2ae7e 100644 --- a/jcore-xml-reader/src/test/resources/XMLMultiplierReader.xml +++ b/jcore-xml-reader/src/test/resources/XMLMultiplierReader.xml @@ -5,7 +5,7 @@ MedlineReaderDescriptor_missingInputDir - 2.5.1-SNAPSHOT + 2.6.0 diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index 4d1302786..b09f97f48 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -4,7 +4,7 @@ jcore-base de.julielab - 2.5.1-SNAPSHOT + 2.6.0 pom 4.0.0 @@ -17,17 +17,17 @@ de.julielab costosys - 1.5.1 + 1.6.1-SNAPSHOT 
de.julielab jcore-db-test-utilities - 2.5.0 + 2.6.0-SNAPSHOT de.julielab jcore-xmi-splitter - 2.3.4 + 2.4.0-SNAPSHOT @@ -36,6 +36,7 @@ ../jcore-xml-db-reader ../jcore-xmi-db-reader ../jcore-xmi-db-writer + ../jcore-pmc-db-reader diff --git a/pom.xml b/pom.xml index 5687e86e0..16312b2db 100644 --- a/pom.xml +++ b/pom.xml @@ -1,112 +1,707 @@ - - 4.0.0 - - de.julielab - jcore-parent - 2.5.1 - - jcore-base - pom - JCoRe Base - The POM for the JCoRe Base projects. - 2.5.1-SNAPSHOT - - JULIE Lab, Germany - http://www.julielab.de - - - - BSD-2-Clause - https://opensource.org/licenses/BSD-2-Clause - - - https://github.com/JULIELab/jcore-base - - - org.apache.uima - uimaj-core - ${uima-version} - - - org.apache.uima - uimafit-core - ${uimafit-version} - - - - jcore-ace-reader - jcore-acronym-ae - jcore-banner-ae - jcore-biolemmatizer-ae - jcore-bionlpformat-consumer - jcore-bionlpformat-reader - jcore-biosem-ae - jcore-conll-consumer - jcore-coordination-baseline-ae - jcore-ct-reader - jcore-descriptor-creator - jcore-dta-reader - jcore-ec-code-ae - jcore-elasticsearch-consumer - jcore-embedding-writer - jcore-event-flattener-ae - jcore-feature-value-replacement-ae - jcore-file-reader - jcore-flair-ner-ae - jcore-iexml-consumer - jcore-iexml-reader - jcore-ign-reader - jcore-iob-consumer - jcore-jnet-ae - jcore-jpos-ae - jcore-jsbd-ae - jcore-jtbd-ae - jcore-julielab-entity-evaluator-consumer - jcore-likelihood-assignment-ae - jcore-likelihood-detection-ae - jcore-lingpipegazetteer-ae - jcore-lingpipe-porterstemmer-ae - jcore-lingscope-ae - jcore-linnaeus-species-ae - jcore-mantra-xml-types - jcore-medxn-ae - jcore-msdoc-reader - jcore-mstparser-ae - jcore-muc7-reader - jcore-mutationfinder-ae - jcore-opennlp-chunk-ae - jcore-opennlp-parser-ae - jcore-opennlp-postag-ae - jcore-opennlp-sentence-ae - jcore-opennlp-token-ae - jcore-pmc-reader - jcore-pubtator-reader - jcore-stanford-lemmatizer-ae - jcore-topic-indexing-ae - jcore-topics-writer - jcore-txt-consumer - 
jcore-types - jcore-utilities - jcore-xml-mapper - jcore-xml-reader - jcore-xmi-reader - jcore-xmi-writer - jedis-parent - jcore-db-checkpoint-ae - jcore-ppd-writer - jcore-bc2gmformat-writer - jcore-bc2gm-reader - jcore-annotation-adder-ae - jcore-flair-token-embedding-ae - jcore-line-multiplier - jcore-cord19-reader - - - scm:git:https://github.com/JULIELab/jcore-base + + + + + + + 4.0.0 + + + + + + + + + + + + de.julielab + + + + + + jcore-parent + + + + + + 2.5.2 + + + + + + + + + + + + jcore-base + + + + + + pom + + + + + + JCoRe Base + + + + + + The POM for the JCoRe Base projects. + + + + + + 2.6.0 + + + + + + + + + + + + JULIE Lab, Germany + + + + + + http://www.julielab.de + + + + + + + + + + + + + + + + + + + + + + + + BSD-2-Clause + + + + + + https://opensource.org/licenses/BSD-2-Clause + + + + + + + + + + + + + + + + + + https://github.com/JULIELab/jcore-base + + + + + + + + + + + + + + + + + + org.apache.uima + + + + + + uimaj-core + + + + + + ${uima-version} + + + + + + + + + + + + + + + + + + org.apache.uima + + + + + + uimafit-core + + + + + + ${uimafit-version} + + + + + + + + + + + + + + + + + + + + + + + + jcore-annotation-adder-ae + + + + + + jcore-ace-reader + + + + + + jcore-acronym-ae + + + + + + jcore-acronym-writer + + + + + + jcore-banner-ae + + + + + + jcore-bc2gm-reader + + + + + + jcore-bc2gmformat-writer + + + + + + jcore-biolemmatizer-ae + + + + + + jcore-bionlpformat-consumer + + + + + + jcore-bionlpformat-reader + + + + + + jcore-biosem-ae + + + + + + jcore-conll-consumer + + + + + + jcore-coordination-baseline-ae + + + + + + jcore-cord19-reader + + + + + + jcore-coreference-writer + + + + + + jcore-ct-reader + + + + + + jcore-db-checkpoint-ae + + + + + + jcore-descriptor-creator + + + + + + jcore-dta-reader + + + + + + jcore-ec-code-ae + + + + + + jcore-elasticsearch-consumer + + + + + + jcore-embedding-writer + + + + + + jcore-event-flattener-ae + + + + + + jcore-feature-value-replacement-ae + + + + + + jcore-file-reader + + + + + 
+ jcore-flair-ner-ae + + + + + + jcore-flair-token-embedding-ae + + + + + + jcore-flow-controllers + + + + + + jcore-gnp-bioc-reader + + + + + + jcore-gnp-bioc-writer + + + + + + jcore-iexml-consumer + + + + + + jcore-iexml-reader + + + + + + jcore-ign-reader + + + + + + jcore-iob-consumer + + + + + + jcore-jnet-ae + + + + + + jcore-jpos-ae + + + + + + jcore-jsbd-ae + + + + + + jcore-jtbd-ae + + + + + + jcore-julielab-entity-evaluator-consumer + + + + + + jcore-likelihood-assignment-ae + + + + + + jcore-likelihood-detection-ae + + + + + + jcore-line-multiplier + + + + + + jcore-lingpipegazetteer-ae + + + + + + jcore-lingpipe-porterstemmer-ae + + + + + + jcore-lingscope-ae + + + + + + jcore-linnaeus-species-ae + + + + + + jcore-mantra-xml-types + + + + + + jcore-medxn-ae + + + + + + jcore-msdoc-reader + + + + + + jcore-mstparser-ae + + + + + + jcore-muc7-reader + + + + + + jcore-mutationfinder-ae + + + + + + jcore-neo4j-relations-consumer + + + + + + jcore-opennlp-chunk-ae + + + + + + jcore-opennlp-parser-ae + + + + + + jcore-opennlp-postag-ae + + + + + + jcore-opennlp-sentence-ae + + + + + + jcore-opennlp-token-ae + + + + + + jcore-ppd-writer + + + + + + jcore-pmc-reader + + + + + + jcore-pubtator-reader + + + + + + jcore-stanford-lemmatizer-ae + + + + + + jcore-topic-indexing-ae + + + + + + jcore-topics-writer + + + + + + jcore-txt-consumer + + + + + + jcore-types + + + + + + jcore-utilities + + + + + + jcore-xml-mapper + + + + + + jcore-xml-reader + + + + + + jcore-xmi-reader + + + + + + jcore-xmi-writer + + + + + + jedis-parent + + + + + + jcore-jedis-integration-tests + + + + + + jcore-mmax2-reader + + + + + jcore-nlmgene-reader + + + + jcore-gnormplus-ae + + + jcore-annotation-removal-ae + + + + + + + + + + + + + + scm:git:https://github.com/JULIELab/jcore-base - scm:git:https://github.com/JULIELab/jcore-base - scm:git:https://github.com/JULIELab/jcore-base - + + + + + + scm:git:https://github.com/JULIELab/jcore-base + + + + + + 
scm:git:https://github.com/JULIELab/jcore-base + + + + + + + + + + + diff --git a/scripts/createMetaDescriptors.py b/scripts/createMetaDescriptors.py index 2da6f7de0..6c940f20b 100755 --- a/scripts/createMetaDescriptors.py +++ b/scripts/createMetaDescriptors.py @@ -12,15 +12,14 @@ -v: The version of the repository -u: If the repository does not yet exist: If is updateable or not """ +import fnmatch +import json import os -import sys import re -from os.path import expanduser -import json -import fnmatch +import sys import xml.etree.ElementTree as ET +from os.path import expanduser from xml.etree.ElementTree import ParseError -from collections import Counter # For testing we define in and out names so we can create new versions and compare META_DESC_IN_NAME = "component.meta" @@ -66,6 +65,8 @@ def getArtifactInfo(pomFile): category = "consumer" if (artifactId.endswith("writer")): category = "consumer" + if (artifactId.endswith("flowcontroller")): + category = "flowcontroller" artifact = {} artifact["artifactId"] = artifactId @@ -111,6 +112,8 @@ def getDescriptors(projectpath): category = "consumer" if descriptorRoot.tag.endswith("casConsumerDescription"): category = "consumer" + if descriptorRoot.tag.endswith("flowControllerDescription"): + category = "flowcontroller" if category != None: # From the complete file name, exclude the system dependent part. That is, make the path relative to the # project directory's src/main/resources directory.