Skip to content

Commit

Permalink
Merge pull request #120 from michelole/issue-105
Browse files Browse the repository at this point in the history
Refactor the code and add tests for cleaning
  • Loading branch information
michelole authored Jun 5, 2019
2 parents 104790c + 31dacb6 commit f5cc6e4
Show file tree
Hide file tree
Showing 6 changed files with 650 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public String getText() {
}

/**
* Returns the text with each run of whitespace characters replaced by a single space.
* The normalization itself is delegated to {@code DataUtilities}.
*
* @return the whitespace-normalized text
*/
public String getCleanedText() {
return DataUtilities.cleanText(text);
return DataUtilities.removeWhitespaces(text);
}

public Eligibility getEligibility(Criterion criterion) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public String getVisit_text() {
}

/**
* Returns the visit text with each run of whitespace characters replaced by a single space.
* The normalization itself is delegated to {@code DataUtilities}.
*
* @return the whitespace-normalized visit text
*/
public String getCleanedVisitText() {
return DataUtilities.cleanText(visit_text);
return DataUtilities.removeWhitespaces(visit_text);
}

public void setVisit_text(String visit_text) {
Expand Down
71 changes: 44 additions & 27 deletions src/main/java/at/medunigraz/imi/bst/n2c2/nn/DataUtilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
*/
public abstract class DataUtilities {

private static final Pattern CLEANER_REGEX = Pattern.compile("\\p{javaWhitespace}+");
private static final Pattern WHITESPACES = Pattern.compile("\\p{javaWhitespace}+");

private static String[] tokenStreamToArray(TokenStream stream) throws IOException {
CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
Expand Down Expand Up @@ -114,51 +114,68 @@ public static String getChar3GramRepresentation(String toProcess) throws IOExcep
return charNGramRepresentation.trim();
}

/**
* Cleans the text and detects sentences using a rule-based algorithm.
* <p>
* The cleaned text is first split at line breaks that are followed by an uppercase letter or a digit.
* Each resulting chunk is then split at period marks followed by whitespace, unless the period is
* immediately preceded by a digit or by one of the listed abbreviations. The period and whitespace
* used as a split point are not part of the returned sentences.
*
* @param narrative the raw text to be split
* @return the detected sentences, in order of appearance
*/
public static List<String> getSentences(String narrative) {
String cleanedNarrative = clean(narrative);

// Alternatives used in a negative lookbehind: a period preceded by any of these does not end a sentence.
// NOTE(review): "[sM][sS]" looks like a typo for "[mM][sS]" (Ms.) - as written it also matches "ss" - confirm.
String abbreviations = "\\d|[mM][rR]|[dD][rR]|[dD][rR][sS]|[sM][sS]|[cC]";
String cleanPatternA = "[\t\\*_\\%=#]+";
String cleanPatternB = "&nbsp;|<BR>|\\s+|--";

String cleanedNarrative = "";
String tempString = "";

// Clean the input line by line, dropping lines that are empty after cleaning.
try {
List<String> lines = IOUtils.readLines(new StringReader(narrative));
for (String line : lines) {
if (line.length() > 0) {
tempString = line.replaceAll(cleanPatternA, " ");
tempString = tempString.replaceAll(cleanPatternB, " ");
tempString = tempString.replaceAll("\\.+", ".").trim();
if (tempString.length() > 0)
cleanedNarrative += tempString + "\n";
}
}
} catch (IOException e) {
e.printStackTrace();
}

// Line break split logic.
// The positive lookahead ensures the next character is an uppercase letter or a digit;
// line breaks followed by anything else stay inside the chunk and are normalized below.
String[] newLineSplits = cleanedNarrative.split("\n(?=[A-Z]|[0-9])");
ArrayList<String> sentences = new ArrayList<String>();

// Period character split logic.
for (String newLineSplit : newLineSplits) {
// Replace line breaks and whitespace runs left inside the chunk (see comment above) with a single space.
newLineSplit = newLineSplit.replaceAll("[\r\n\\s]+", " ").trim();
if (newLineSplit.length() > 0) {
// Split at period marks (not preceded by abbreviations) followed by one or more whitespace characters.
sentences.addAll(Arrays.asList(newLineSplit.split("(?<!" + abbreviations + ")(\\.)(\\s+)")));
}
}

// post cleansing
// sentences.forEach(sentence -> System.out.println(sentence));

return sentences;
}

public static String cleanText(String text) {
return CLEANER_REGEX.matcher(text).replaceAll(" ");
/** Runs of formatting symbols (tab, {@code * _ % = #}); each run is replaced by a single space. */
private static final Pattern CLEAN_SYMBOLS = Pattern.compile("[\t\\*_\\%=#]+");

/** HTML remnants, whitespace runs and double dashes; each match is replaced by a single space. */
private static final Pattern CLEAN_MARKUP = Pattern.compile("&nbsp;|<BR>|\\s+|--");

/** Runs of period marks; each run is collapsed into one period. */
private static final Pattern REPEATED_PERIODS = Pattern.compile("\\.+");

/** Line terminators, matching what a line-oriented reader accepts: CRLF, CR or LF. */
private static final Pattern LINE_BREAK = Pattern.compile("\r\n|\r|\n");

/**
 * Cleans a text line by line by replacing symbols and special characters with spaces and
 * deduplicating period marks.
 * <p>
 * Each non-empty input line is processed in three steps (symbols, then markup/whitespace, then
 * repeated periods) and trimmed. Lines that end up empty are dropped. Every kept line is
 * terminated by {@code '\n'}, so a non-empty result always ends with a line break; callers
 * such as sentence splitting rely on that separator.
 *
 * @param text the raw text to clean; must not be {@code null}
 * @return the cleaned lines, each followed by {@code '\n'}, or an empty string if nothing is left
 * @throws NullPointerException if {@code text} is {@code null}
 */
public static String clean(String text) {
    // A builder avoids the quadratic cost of repeated String concatenation on long narratives.
    StringBuilder cleaned = new StringBuilder(text.length());

    // Splitting the string directly needs no Reader, so there is no IOException to swallow.
    for (String line : LINE_BREAK.split(text)) {
        if (line.isEmpty()) {
            continue;
        }

        String cleanedLine = CLEAN_SYMBOLS.matcher(line).replaceAll(" ");
        cleanedLine = CLEAN_MARKUP.matcher(cleanedLine).replaceAll(" ");
        cleanedLine = REPEATED_PERIODS.matcher(cleanedLine).replaceAll(".").trim();

        if (!cleanedLine.isEmpty()) {
            cleaned.append(cleanedLine).append('\n');
        }
    }

    return cleaned.toString();
}

/**
 * Collapses every run of whitespace characters into a single space.
 * Despite the name, whitespace is normalized rather than removed entirely.
 *
 * @param text the text to normalize
 * @return the text with each whitespace run replaced by one space character
 */
public static String removeWhitespaces(String text) {
    final String singleSpace = " ";
    return WHITESPACES.matcher(text).replaceAll(singleSpace);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

public class DataUtilitiesTest {

private static final File SAMPLE = new File(DataUtilitiesTest.class.getResource("/gold-standard/sample.xml").getPath());

@Test
public void processTextReduced() throws IOException {
String normalized = DataUtilities.processTextReduced("This is a, test sentence: test_sentence.");
Expand All @@ -32,7 +34,6 @@ public void getChar3GramRepresentation() throws IOException {

@Test
public void sample() throws IOException, SAXException {
final File SAMPLE = new File(getClass().getResource("/gold-standard/sample.xml").getPath());
Patient p = new PatientDAO().fromXML(SAMPLE);

StringBuilder normalizedText = new StringBuilder();
Expand All @@ -57,6 +58,32 @@ public void sample() throws IOException, SAXException {
assertEquals(FileUtils.readFileToString(expectedTrigrams, "UTF-8"), textTrigrams.toString());
}

@Test
public void getSentences() throws IOException, SAXException {
    // Sentences detected in the sample patient must match the stored fixture, one sentence per line.
    final File fixture = new File(getClass().getResource("/preprocessing/sample-sentences.txt").getFile());
    final List<String> fixtureSentences = FileUtils.readLines(fixture, "UTF-8");
    final List<String> sampleSentences = DataUtilities.getSentences(new PatientDAO().fromXML(SAMPLE).getText());
    assertEquals(fixtureSentences, sampleSentences);

    // TODO First period mark is dropped if followed by two whitespaces
    // Desired result: Arrays.asList("One sentence.", "Second sentence.")
    final List<String> inlineExpected = Arrays.asList("One sentence", "Second sentence.");
    final List<String> inlineActual = DataUtilities.getSentences("One sentence. Second sentence.");
    assertEquals(inlineExpected, inlineActual);
}

@Test
public void clean() {
    final String dirty = "This is a ***%%%===--\tsentence..... ";

    // TODO newline is unexpected.
    // TODO Extra whitespaces
    // Desired result: "This is a sentence."
    assertEquals("This is a sentence.\n", DataUtilities.clean(dirty));
}

@Test
public void tokenize() {
// Example from https://nlp.stanford.edu/software/tokenizer.shtml
Expand All @@ -74,6 +101,16 @@ public void tokenize() {
assertEquals("hi my name can't hello", String.join(" ", actual));
}

@Test
public void getTokens() throws IOException, SAXException {
    // Tokens extracted from the sample patient must match the stored fixture, one token per line.
    final File fixture = new File(getClass().getResource("/preprocessing/sample-tokens.txt").getFile());
    final List<String> fixtureTokens = FileUtils.readLines(fixture, "UTF-8");

    final String sampleText = new PatientDAO().fromXML(SAMPLE).getText();
    assertEquals(fixtureTokens, DataUtilities.getTokens(sampleText));
}

@Test
public void getVocabulary() {
Set<String> actual = DataUtilities.getVocabulary("This is a test sentence. Can't you think of a better sentence?");
Expand Down
73 changes: 73 additions & 0 deletions src/test/resources/preprocessing/sample-sentences.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
Record date:
Record date: 2067-05-22
FISHKILL MEDICAL CENTER
Internal Medicine Associates
33 Mercy Plaza
Spencer, AK 72985
Robinson, John
87496736
05/22/67
HISTORY OF PRESENT ILLNESS: Mr. Robinson is returning for follow up for arm fracture after falling off stage directing school production of Midsummer Night's Dream
Arm remains in cast and sling; patient reports minor discomfort but is in good spirits
x-ray showed fracture is healing; 2 more weeks with cast and sling recommended.
Patient is concerned about weight gain as he is less active now with the cast
Recommended walking; watching food intake
Patient has history of diabetes and blood sugar control is slipping with lack of movement due to fracture
HBA1c at last checkup was
7.2
Recommend more increased monitoring of blood sugar while less active.
PHYSICAL EXAMINATION: BP 136/80. Weight increased from 250 to 260 pounds
Pulse is 71.
ASSESSMENT AND PLAN
1. Fracture healing well; 2 more weeks with cast and sling
2. Weight
Advised to increase amount of time walking.
Avoid stages.
3. HBA1c. Monitor more closely during next 2 weeks.
Ann Stephenson, M.D.
Record date: 2068-12-18
HPI:
59 yo male with history of DM, family history of CAD presented with new chest pain and shortness of breath
was found to have lateral STEMI
Pain began 2 hours ago; patient called EMS when began to feel lightheaded
Intubated on arrival for resiratory distress.
ECG showed lateral ST elevations and a CT chest scan was negative for aortic dissection
Blood sugars extremely elevated: 500s in the ED.
On immediate LHC, a 100 occlusion of his OM1 was found and recannalized as well as stented with a Vision BMS, resulting in
TIMI 2 flow
An IABP was placed with initial augmented diastolic pressures recorded in the 80s; started on Levophed, Dobutamine and Dopamine.
Record date: 2069-11-02
Mr. Robinson reported to the emergency department today with difficulty breathing and numbness in hands and feet
Goes away after 15 minutes, but has been happening more frequently
Patient has history of asthma, but says these symptoms do not match.
PHYSICAL EXAMINATION: GENERAL APPEARANCE: No acute distress, pain-free
VITAL SIGNS: Afebrile
Pulse 95. Respirations 20.
Blood pressure 135/85. Pulse oximetry is 90 on room air.
CARDIAC: Regular rate and rhythm
Normal S1 and S2. possible murmur; sent for evalution
NECK: JVP 5 cm
LUNGS: Labored breathing.
ABDOMEN: Soft, nontender, and nondistended.
Bood tests showed normal HBA1c levels.
REVIEW OF SYSTEMS: As indicated.
PAST MEDICAL HISTORY: insulin-dependent diabetes since 25yo, retinal neuropathy, asthma
Obesity.
MI treated here previously
Ischemia.
SOCIAL/FAMILY HISTORY: No alcohol
Smoked in the past, quit 10 years ago
Family history of ischemia and CAD
MEDICATIONS:
1. Provigil
2. Atenolol
3. Ativan.
4. Glucophage 850 mg t.i.d.
5. Humulin 15 units at night.
6. Folate.
7. Metoprolol.
8. Cardia.
9. Vitamin E.
10. Coated aspirin.
Recommended full cardiac evalution; possible need for stent
Patient opted to return following day.
Loading

0 comments on commit f5cc6e4

Please sign in to comment.