forked from bst-mug/n2c2
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add MetaMapLite support (fixes bst-mug#2)
MetaMapLite (https://metamap.nlm.nih.gov/MetaMapLite.shtml) is not FOSS, but a free UMLS license can be obtained at https://utslogin.nlm.nih.gov/cas/login. The current pom.xml points to a private Maven repository with pre-compiled JARs, but one can instead install the JARs into a local Maven repository by following the MetaMapLite "Using Maven" instructions (https://metamap.nlm.nih.gov/Docs/README_MetaMapLite_3.1.html). The OpenNLP models, the index directory and the termsfile required by MetaMapLite are also included in the MetaMapLite distribution, and their locations should be set in this project's metamaplite.properties.
- Loading branch information
Showing
6 changed files
with
317 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
19 changes: 19 additions & 0 deletions
19
src/main/java/at/medunigraz/imi/bst/n2c2/preprocess/conceptmapper/ConceptMapper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
package at.medunigraz.imi.bst.n2c2.preprocess.conceptmapper; | ||
|
||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
/**
 * Maps free text to concept identifiers and produces an annotated version of the text.
 *
 * @author Michel Oleynik <[email protected]>
 * @see <a href="https://github.com/michelole/reassess/blob/master/src/main/java/at/medunigraz/imi/reassess/conceptmapper/ConceptMapper.java">Original ConceptMapper in the reassess project</a>
 */
public interface ConceptMapper {

    /**
     * Maps the given text to concept identifiers.
     *
     * @param text the text to be mapped
     * @return the identifiers found in {@code text}; the same identifier may occur more than once
     */
    List<String> map(String text);

    /**
     * Produces an annotated version of the given text.
     *
     * @param text the text to be annotated
     * @return the annotated text; the annotation format is defined by the implementation
     */
    String annotate(String text);

    /**
     * Maps the given text to concept identifiers and discards duplicates.
     *
     * @param text the text to be mapped
     * @return the distinct identifiers returned by {@link #map(String)}, in no particular order
     */
    default Set<String> uniqueMap(String text) {
        List<String> mapped = map(text);
        Set<String> distinct = new HashSet<>(mapped);
        return distinct;
    }
}
170 changes: 170 additions & 0 deletions
170
src/main/java/at/medunigraz/imi/bst/n2c2/preprocess/conceptmapper/MetaMapLiteFacade.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
package at.medunigraz.imi.bst.n2c2.preprocess.conceptmapper; | ||
|
||
import bioc.BioCDocument; | ||
import gov.nih.nlm.nls.metamap.document.FreeText; | ||
import gov.nih.nlm.nls.metamap.lite.types.ConceptInfo; | ||
import gov.nih.nlm.nls.metamap.lite.types.Entity; | ||
import gov.nih.nlm.nls.metamap.lite.types.Ev; | ||
import gov.nih.nlm.nls.ner.MetaMapLite; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
|
||
import java.io.File; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Properties; | ||
|
||
/** | ||
* Facade for MetaMapLite (https://metamap.nlm.nih.gov/MetaMapLite.shtml). | ||
* Requires an UMLS license. | ||
* | ||
* @author Michel Oleynik <[email protected]> | ||
* @link https://github.com/michelole/reassess/blob/master/src/main/java/at/medunigraz/imi/reassess/conceptmapper/metamap/MetaMapLiteFacade.java | ||
*/ | ||
public class MetaMapLiteFacade implements ConceptMapper { | ||
|
||
private static final Logger LOG = LogManager.getLogger(); | ||
|
||
private static MetaMapLiteFacade instance = null; | ||
private static Properties properties; | ||
private MetaMapLite metaMapLiteInst; | ||
|
||
private MetaMapLiteFacade() { | ||
LOG.info("Building MetaMap instance..."); | ||
|
||
initProperties(); | ||
|
||
try { | ||
metaMapLiteInst = new MetaMapLite(properties); | ||
} catch (Exception e) { | ||
// TODO Auto-generated catch block | ||
e.printStackTrace(); | ||
} | ||
|
||
LOG.info("Building MetaMap instance finished."); | ||
} | ||
|
||
public static MetaMapLiteFacade getInstance() { | ||
if (instance == null) { | ||
instance = new MetaMapLiteFacade(); | ||
} | ||
return instance; | ||
} | ||
|
||
private static void initProperties() { | ||
properties = MetaMapLite.getDefaultConfiguration(); | ||
|
||
String configPropertyFilename = System.getProperty("metamaplite.property.file", | ||
MetaMapLiteFacade.class.getResource("/metamaplite.properties").getFile()); | ||
|
||
try { | ||
properties.load(new FileReader(configPropertyFilename)); | ||
} catch (IOException e) { | ||
// TODO Auto-generated catch block | ||
e.printStackTrace(); | ||
} | ||
|
||
MetaMapLite.expandModelsDir(properties); | ||
MetaMapLite.expandIndexDir(properties); | ||
} | ||
|
||
public static boolean isModelsDirValid() { | ||
initProperties(); | ||
return (new File(properties.getProperty("opennlp.models.directory"))).canRead(); | ||
} | ||
|
||
/* | ||
* (non-Javadoc) | ||
* @see at.medunigraz.imi.reassess.conceptmapper.ConceptMapper#map(java.lang.String) | ||
*/ | ||
public List<String> map(String text) { | ||
List<String> ret = new ArrayList<String>(); | ||
|
||
List<Entity> entityList = process(text); | ||
|
||
for (Entity entity : entityList) { | ||
// TODO Should submatches be skipped as in annotate()? | ||
for (Ev ev : entity.getEvSet()) { | ||
ret.add(ev.getConceptInfo().getCUI()); | ||
LOG.trace(ev); | ||
} | ||
} | ||
|
||
return ret; | ||
} | ||
|
||
private List<Entity> process(String text) { | ||
int length = text.length(); | ||
LOG.debug("Processing \"{}\"...", text.substring(0, Math.min(length, 20))); | ||
|
||
long start = System.currentTimeMillis(); | ||
|
||
BioCDocument document = FreeText.instantiateBioCDocument(text); | ||
document.setID("1"); | ||
List<BioCDocument> documentList = new ArrayList<BioCDocument>(); | ||
documentList.add(document); | ||
|
||
List<Entity> entityList = null; | ||
try { | ||
entityList = metaMapLiteInst.processDocumentList(documentList); | ||
} catch (Exception e) { | ||
// TODO Auto-generated catch block | ||
e.printStackTrace(); | ||
} | ||
|
||
long end = System.currentTimeMillis(); | ||
|
||
float duration = (end - start + 1) / 1000f; | ||
|
||
LOG.debug("Processed {} chars in {} sec ({} chars/sec).", length, duration, length / duration); | ||
|
||
return entityList; | ||
} | ||
|
||
/* | ||
* (non-Javadoc) | ||
* @see at.medunigraz.imi.reassess.conceptmapper.ConceptMapper#annotate(java.lang.String) | ||
*/ | ||
public String annotate(String text) { | ||
List<Entity> entityList = process(text); | ||
|
||
int length = text.length(); | ||
|
||
StringBuilder sb = new StringBuilder(length); | ||
|
||
int i = 0; | ||
for (Entity entity : entityList) { | ||
int start = entity.getStart(); | ||
|
||
// Skip submatches | ||
if (start < i) { | ||
continue; | ||
} | ||
|
||
String matched = entity.getMatchedText(); | ||
|
||
sb.append(text, i, start); | ||
sb.append("<"); | ||
sb.append(matched); | ||
sb.append("|"); | ||
|
||
for (Ev ev : entity.getEvSet()) { | ||
ConceptInfo conceptInfo = ev.getConceptInfo(); | ||
sb.append(conceptInfo.getCUI()); | ||
sb.append(":"); | ||
sb.append(conceptInfo.getPreferredName()); | ||
sb.append("|"); | ||
} | ||
sb.append(">"); | ||
|
||
i = entity.getStart() + entity.getLength(); | ||
} | ||
|
||
sb.append(text, i, length); | ||
|
||
return sb.toString(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
opennlp.models.directory=data/models | ||
metamaplite.index.directory=data/ivf/strict | ||
metamaplite.excluded.termsfile=data/specialterms.txt | ||
metamaplite.segmentation.method=BLANKLINES | ||
metamaplite.sourceset=all | ||
metamaplite.semanticgroup=all | ||
#metamaplite.sourceset = SNOMEDCT_US | ||
#metamaplite.semanticgroup = neop |
79 changes: 79 additions & 0 deletions
79
src/test/java/at/medunigraz/imi/bst/n2c2/preprocess/conceptmapper/MetaMapLiteFacadeTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
package at.medunigraz.imi.bst.n2c2.preprocess.conceptmapper; | ||
|
||
import org.junit.Assume; | ||
import org.junit.Before; | ||
import org.junit.Test; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
public class MetaMapLiteFacadeTest { | ||
|
||
private static final String BREAST_CANCER = "The patient has breast cancer."; | ||
|
||
@Before | ||
public void setUp() { | ||
Assume.assumeTrue(MetaMapLiteFacade.isModelsDirValid()); | ||
} | ||
|
||
@Test | ||
public void testMap() { | ||
MetaMapLiteFacade mm = MetaMapLiteFacade.getInstance(); | ||
|
||
List<String> expected = new ArrayList<String>(); | ||
expected.add("C0030705"); // Patients | ||
expected.add("C0006142"); // Malignant neoplasm of breast | ||
expected.add("C0678222"); // Breast Carcinoma | ||
List<String> actual = mm.map(BREAST_CANCER); | ||
|
||
assertEquals(expected, actual); | ||
} | ||
|
||
@Test | ||
public void testUniqueMap() { | ||
final String doubledText = BREAST_CANCER + ". " + BREAST_CANCER; | ||
MetaMapLiteFacade mm = MetaMapLiteFacade.getInstance(); | ||
|
||
List<String> expectedList = new ArrayList<String>(); | ||
// Expects doubled CUIs | ||
expectedList.add("C0030705"); // Patients | ||
expectedList.add("C0006142"); // Malignant neoplasm of breast | ||
expectedList.add("C0678222"); // Breast Carcinoma | ||
expectedList.add("C0030705"); // Patients | ||
expectedList.add("C0006142"); // Malignant neoplasm of breast | ||
expectedList.add("C0678222"); // Breast Carcinoma | ||
List<String> actualList = mm.map(doubledText); | ||
assertEquals(expectedList, actualList); | ||
|
||
Set<String> expectedSet = new HashSet<String>(); | ||
expectedSet.add("C0006142"); // Malignant neoplasm of breast | ||
expectedSet.add("C0678222"); // Breast Carcinoma | ||
expectedSet.add("C0030705"); // Patients | ||
Set<String> actualSet = mm.uniqueMap(doubledText); | ||
assertEquals(expectedSet, actualSet); | ||
} | ||
|
||
@Test | ||
public void testAnnotate() { | ||
MetaMapLiteFacade mm = MetaMapLiteFacade.getInstance(); | ||
|
||
// Basic test | ||
String actual = mm.annotate(BREAST_CANCER); | ||
String expected = "The <patient|C0030705:Patients|> has <breast cancer|C0006142:Malignant neoplasm of breast|C0678222:Breast Carcinoma|>."; | ||
assertEquals(expected, actual); | ||
|
||
// Submatches | ||
actual = mm.annotate("History of present illness"); | ||
expected = "<History of present illness|C0262512:History of present illness|C0488508:History of present illness:Finding:Point in time:^Patient:Nominal:Reported|>"; | ||
assertEquals(expected, actual); | ||
|
||
// Double spacing | ||
actual = mm.annotate("headache. headache."); | ||
expected = "<headache|C0018681:Headache|C2096315:ENT surgical result nose headache|>. <headache|C2096315:ENT surgical result nose headache|C0018681:Headache|>."; | ||
assertEquals(expected, actual); | ||
} | ||
} |