package at.medunigraz.imi.bst.n2c2.nn;

import static org.junit.Assert.assertEquals;

import java.util.ArrayList;
import java.util.List;

import org.junit.Ignore;
import org.xml.sax.SAXException;

import at.medunigraz.imi.bst.n2c2.dao.PatientDAO;
import at.medunigraz.imi.bst.n2c2.model.Patient;

/**
 * Test class exercising {@code DataUtilities} sentence splitting, text normalization and
 * character-trigram generation.
 *
 * NOTE(review): this copy of the class appears incomplete in this view (closing braces,
 * annotations and part of one statement are not visible) - confirm against the real file.
 */
public class DataUtilitiesTest {

/**
 * Exploratory run over patient samples stored in a hard-coded local directory: each file is
 * parsed into a {@code Patient}, the text of the patient at index 200 is split into
 * sentences, and every sentence is printed together with its normalized form and its
 * character-trigram representation (tab-separated).
 *
 * The only assertion visible here is {@code assertEquals(true, true)}, which cannot fail, so
 * this method does not verify any behavior of {@code DataUtilities}.
 */
public void train() {

// read in patients
// The directory is an absolute path on a mapped drive, so this only works on a machine
// that has this location available.
File sampleDirectory = new File("Z:/n2c2/data/samplesTraining");
// NOTE(review): the listFiles(...) call below is cut off in this view; its remaining
// argument(s) and closing parenthesis are not visible.
List<File> sampleFiles = (List<File>) FileUtils.listFiles(sampleDirectory, TrueFileFilter.INSTANCE,

DataUtilities utilities = new DataUtilities();

List<Patient> patients;
List<String> sentences;
try {
patients = new ArrayList<Patient>();
// Parse every sample file into a Patient object.
for (File patientSample : sampleFiles) {
patients.add(new PatientDAO().fromXML(patientSample));

// Uses the patient at fixed index 200.
// NOTE(review): presumably fails if fewer than 201 patients were loaded - confirm.
sentences = DataUtilities.getSentences(patients.get(200).getText());

// Print original sentence, normalized sentence and trigram string, tab-separated.
for (String sentence : sentences) {
String normalized = utilities.processTextReduced(sentence);
String char3Grams001 = utilities.getChar3GramRepresentation(normalized);
System.out.println(sentence + "\t" + normalized + "\t" + char3Grams001);

// NOTE(review): no handling is visible for IOException in this view.
} catch (IOException e) {
} catch (SAXException e) {
// Always-true assertion; it checks nothing.
// NOTE(review): braces are missing in this view, so it is unclear whether this statement
// sits inside the catch block or after the try statement.
assertEquals(true, true);
package at.medunigraz.imi.bst.n2c2.nn;

import static org.junit.Assert.assertEquals;

import java.util.ArrayList;
import java.util.List;

import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.SAXException;

import at.medunigraz.imi.bst.n2c2.dao.PatientDAO;
import at.medunigraz.imi.bst.n2c2.model.Patient;

/**
 * Test class for {@code DataUtilities}: checks text normalization and character-trigram
 * generation on fixed strings and on a sample patient record, and contains an exploratory
 * {@code train()} run over local training data.
 */
public class DataUtilitiesTest {

// Single DataUtilities instance shared by the test methods that call
// processTextReduced(...) and getChar3GramRepresentation(...).
private static final DataUtilities UTILITIES = new DataUtilities();

/**
 * Checks {@code processTextReduced} on one fixed sentence.
 *
 * The expected value shows the input lower-cased, with the comma, colon, underscore and final
 * period no longer present, and with shortened word forms ("sentenc", "sent").
 * NOTE(review): the shortened forms look like stemmer output - confirm in DataUtilities.
 * NOTE(review): no test annotation is visible on this method in this view.
 *
 * @throws IOException declared by this method; which call raises it is not visible here
 */
public void processTextReduced() throws IOException {
String normalized = UTILITIES.processTextReduced("This is a, test sentence: test_sentence.");
assertEquals("this is a test sentenc test sent", normalized);

/**
 * Checks {@code getChar3GramRepresentation} on one fixed, already normalized sentence.
 *
 * In the expected value every word is wrapped in underscores and listed as its overlapping
 * three-character windows, separated by spaces (for example "this" gives
 * "_th thi his is_", and the one-letter word "a" gives "_a_").
 * NOTE(review): no test annotation is visible on this method in this view.
 *
 * @throws IOException declared by this method; which call raises it is not visible here
 */
public void getChar3GramRepresentation() throws IOException {
// Despite its name, this local holds the trigram string returned by the call.
String normalized = UTILITIES.getChar3GramRepresentation("this is a test sentence");
assertEquals("_th thi his is_ _is is_ _a_ _te tes est st_ _se sen ent nte ten enc nce ce_", normalized);

/**
 * Runs the normalization and trigram steps over the sample patient record loaded from the
 * classpath resource {@code /gold-standard/sample.xml} and compares the accumulated output
 * with the expected files {@code /nn/sample-normalized.txt} and
 * {@code /nn/sample-trigrams.txt}, both read as UTF-8.
 *
 * NOTE(review): the statements that append to the two StringBuilders are not visible in this
 * view, so the exact output format (separators, line endings) cannot be confirmed from here.
 *
 * @throws IOException  declared by this method (file reading is involved)
 * @throws SAXException declared by this method (XML parsing is involved)
 */
public void sample() throws IOException, SAXException {
// Resolve the sample record from the classpath and parse it into a Patient.
final File SAMPLE = new File(getClass().getResource("/gold-standard/sample.xml").getPath());
Patient p = new PatientDAO().fromXML(SAMPLE);

// Buffers later compared with the expected normalized text and expected trigrams.
StringBuilder normalizedText = new StringBuilder();
StringBuilder textTrigrams = new StringBuilder();

// Split the patient text into sentences, then normalize each sentence and derive its
// trigram representation from the normalized form.
List<String> sentences = DataUtilities.getSentences(p.getText());
for (String sentence : sentences) {
String normalized = UTILITIES.processTextReduced(sentence);
String charTrigrams = UTILITIES.getChar3GramRepresentation(normalized);



// Expected outputs are stored as test resources.
final File expectedNormalized = new File(getClass().getResource("/nn/sample-normalized.txt").getFile());
final File expectedTrigrams = new File(getClass().getResource("/nn/sample-trigrams.txt").getFile());

// Compare the full expected file contents with the accumulated strings.
assertEquals(FileUtils.readFileToString(expectedNormalized, "UTF-8"), normalizedText.toString());
assertEquals(FileUtils.readFileToString(expectedTrigrams, "UTF-8"), textTrigrams.toString());

/**
 * Exploratory run over patient samples stored in a hard-coded local directory: each file is
 * parsed into a {@code Patient}, the text of the patient at index 200 is split into
 * sentences, and every sentence is printed together with its normalized form and its
 * character-trigram representation (tab-separated).
 *
 * The only assertion visible here is {@code assertEquals(true, true)}, which cannot fail, so
 * this method does not verify any behavior of {@code DataUtilities}.
 * NOTE(review): it creates its own DataUtilities instance instead of using the shared
 * UTILITIES constant used by the other methods of this class.
 */
public void train() {

// read in patients
// The directory is an absolute path on a mapped drive, so this only works on a machine
// that has this location available.
File sampleDirectory = new File("Z:/n2c2/data/samplesTraining");
// NOTE(review): the listFiles(...) call below is cut off in this view; its remaining
// argument(s) and closing parenthesis are not visible.
List<File> sampleFiles = (List<File>) FileUtils.listFiles(sampleDirectory, TrueFileFilter.INSTANCE,

DataUtilities utilities = new DataUtilities();

List<Patient> patients;
List<String> sentences;
try {
patients = new ArrayList<Patient>();
// Parse every sample file into a Patient object.
for (File patientSample : sampleFiles) {
patients.add(new PatientDAO().fromXML(patientSample));

// Uses the patient at fixed index 200.
// NOTE(review): presumably fails if fewer than 201 patients were loaded - confirm.
sentences = DataUtilities.getSentences(patients.get(200).getText());

// Print original sentence, normalized sentence and trigram string, tab-separated.
for (String sentence : sentences) {
String normalized = utilities.processTextReduced(sentence);
String char3Grams001 = utilities.getChar3GramRepresentation(normalized);
System.out.println(sentence + "\t" + normalized + "\t" + char3Grams001);

// NOTE(review): no handling is visible for IOException in this view.
} catch (IOException e) {
} catch (SAXException e) {
// Always-true assertion; it checks nothing.
// NOTE(review): braces are missing in this view, so it is unclear whether this statement
// sits inside the catch block or after the try statement.
assertEquals(true, true);
73 changes: 73 additions & 0 deletions src/test/resources/nn/sample-normalized.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
record date
record date 2067 05 22
fishkil medic center
intern medicin associ
33 merci plaza
spencer ak 72985
robinson john
05 22 67
histori of present ill mr robinson is return for follow up for arm fractur after fall off stage direct school product of midsumm night dream
arm remain in cast and sling patient report minor discomfort but is in good spirit
x ray show fractur is heal 2 more week with cast and sling recommend
patient is concern about weight gain as he is less activ now with the cast
recommend walk watch food intak
patient has histori of diabet and blood sugar control is slip with lack of movement due to fractur
hba1c at last checkup was
7 2
recommend more increas monitor of blood sugar while less activ
physic examin bp 136 80 weight increas from 250 to 260 pound
puls is 71
assess and plan
1 fractur heal well 2 more week with cast and sling
2 weight
advis to increas amount of time walk
avoid stage
3 hba1c monitor more close dure next 2 week
ann stephenson m d
record date 2068 12 18
59 yo male with histori of dm famili histori of cad present with new chest pain and short of breath
was found to have later stemi
pain began 2 hour ago patient call em when began to feel lighthead
intub on arriv for resiratori distress
ecg show later st elev and a ct chest scan was negat for aortic dissect
blood sugar extrem elev 500s in the ed
on immedi lhc a 100 occlus of his om1 was found and recann as well as stent with a vision bms result in
timi 2 flow
an iabp was place with initi augment diastol pressur record in the 80s start on levoph dobutamin and dopamin
record date 2069 11 02
mr robinson report to the emerg depart today with difficulti breath and numb in hand and feet
goe away after 15 minut but has been happen more frequent
patient has histori of asthma but say these symptom do not match
physic examin general appear no acut distress pain free
vital sign afebril
puls 95 respir 20
blood pressur 135 85 puls oximetri is 90 on room air
cardiac regular rate and rhythm
normal s1 and s2 possibl murmur sent for evalut
neck jvp 5 cm
lung labor breath
abdomen soft nontend and nondistend
bood test show normal hba1c level
review of system as indic
past medic histori insulin depend diabet sinc 25yo retin neuropathi asthma
mi treat here previous
social famili histori no alcohol
smoke in the past quit 10 year ago
famili histori of ischemia and cad
1 provigil
2 atenolol
3 ativan
4 glucophag 850 mg t i d
5 humulin 15 unit at night
6 folat
7 metoprolol
8 cardia
9 vitamin e
10 coat aspirin
recommend full cardiac evalut possibl need for stent
patient opt to return follow day
73 changes: 73 additions & 0 deletions src/test/resources/nn/sample-trigrams.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
_re rec eco cor ord rd_ _da dat ate te_
_re rec eco cor ord rd_ _da dat ate te_ _20 206 067 67_ _05 05_ _22 22_
_fi fis ish shk hki kil il_ _me med edi dic ic_ _ce cen ent nte ter er_
_in int nte ter ern rn_ _me med edi dic ici cin in_ _as ass sso soc oci ci_
_33 33_ _me mer erc rci ci_ _pl pla laz aza za_
_sp spe pen enc nce cer er_ _ak ak_ _72 729 298 985 85_
_ro rob obi bin ins nso son on_ _jo joh ohn hn_
_87 874 749 496 967 673 736 36_
_05 05_ _22 22_ _67 67_
_hi his ist sto tor ori ri_ _of of_ _pr pre res ese sen ent nt_ _il ill ll_ _mr mr_ _ro rob obi bin ins nso son on_ _is is_ _re ret etu tur urn rn_ _fo for or_ _fo fol oll llo low ow_ _up up_ _fo for or_ _ar arm rm_ _fr fra rac act ctu tur ur_ _af aft fte ter er_ _fa fal all ll_ _of off ff_ _st sta tag age ge_ _di dir ire rec ect ct_ _sc sch cho hoo ool ol_ _pr pro rod odu duc uct ct_ _of of_ _mi mid ids dsu sum umm mm_ _ni nig igh ght ht_ _dr dre rea eam am_
_ar arm rm_ _re rem ema mai ain in_ _in in_ _ca cas ast st_ _an and nd_ _sl sli lin ing ng_ _pa pat ati tie ien ent nt_ _re rep epo por ort rt_ _mi min ino nor or_ _di dis isc sco com omf mfo for ort rt_ _bu but ut_ _is is_ _in in_ _go goo ood od_ _sp spi pir iri rit it_
_x_ _ra ray ay_ _sh sho how ow_ _fr fra rac act ctu tur ur_ _is is_ _he hea eal al_ _2_ _mo mor ore re_ _we wee eek ek_ _wi wit ith th_ _ca cas ast st_ _an and nd_ _sl sli lin ing ng_ _re rec eco com omm mme men end nd_
_pa pat ati tie ien ent nt_ _is is_ _co con onc nce cer ern rn_ _ab abo bou out ut_ _we wei eig igh ght ht_ _ga gai ain in_ _as as_ _he he_ _is is_ _le les ess ss_ _ac act cti tiv iv_ _no now ow_ _wi wit ith th_ _th the he_ _ca cas ast st_
_re rec eco com omm mme men end nd_ _wa wal alk lk_ _wa wat atc tch ch_ _fo foo ood od_ _in int nta tak ak_
_pa pat ati tie ien ent nt_ _ha has as_ _hi his ist sto tor ori ri_ _of of_ _di dia iab abe bet et_ _an and nd_ _bl blo loo ood od_ _su sug uga gar ar_ _co con ont ntr tro rol ol_ _is is_ _sl sli lip ip_ _wi wit ith th_ _la lac ack ck_ _of of_ _mo mov ove vem eme men ent nt_ _du due ue_ _to to_ _fr fra rac act ctu tur ur_
_hb hba ba1 a1c 1c_ _at at_ _la las ast st_ _ch che hec eck cku kup up_ _wa was as_
_7_ _2_
_re rec eco com omm mme men end nd_ _mo mor ore re_ _in inc ncr cre rea eas as_ _mo mon oni nit ito tor or_ _of of_ _bl blo loo ood od_ _su sug uga gar ar_ _wh whi hil ile le_ _le les ess ss_ _ac act cti tiv iv_
_ph phy hys ysi sic ic_ _ex exa xam ami min in_ _bp bp_ _13 136 36_ _80 80_ _we wei eig igh ght ht_ _in inc ncr cre rea eas as_ _fr fro rom om_ _25 250 50_ _to to_ _26 260 60_ _po pou oun und nd_
_pu pul uls ls_ _is is_ _71 71_
_as ass sse ses ess ss_ _an and nd_ _pl pla lan an_
_1_ _fr fra rac act ctu tur ur_ _he hea eal al_ _we wel ell ll_ _2_ _mo mor ore re_ _we wee eek ek_ _wi wit ith th_ _ca cas ast st_ _an and nd_ _sl sli lin ing ng_
_2_ _we wei eig igh ght ht_
_ad adv dvi vis is_ _to to_ _in inc ncr cre rea eas as_ _am amo mou oun unt nt_ _of of_ _ti tim ime me_ _wa wal alk lk_
_av avo voi oid id_ _st sta tag age ge_
_3_ _hb hba ba1 a1c 1c_ _mo mon oni nit ito tor or_ _mo mor ore re_ _cl clo los ose se_ _du dur ure re_ _ne nex ext xt_ _2_ _we wee eek ek_
_an ann nn_ _st ste tep eph phe hen ens nso son on_ _m_ _d_
_re rec eco cor ord rd_ _da dat ate te_ _20 206 068 68_ _12 12_ _18 18_
_hp hpi pi_
_59 59_ _yo yo_ _ma mal ale le_ _wi wit ith th_ _hi his ist sto tor ori ri_ _of of_ _dm dm_ _fa fam ami mil ili li_ _hi his ist sto tor ori ri_ _of of_ _ca cad ad_ _pr pre res ese sen ent nt_ _wi wit ith th_ _ne new ew_ _ch che hes est st_ _pa pai ain in_ _an and nd_ _sh sho hor ort rt_ _of of_ _br bre rea eat ath th_
_wa was as_ _fo fou oun und nd_ _to to_ _ha hav ave ve_ _la lat ate ter er_ _st ste tem emi mi_
_pa pai ain in_ _be beg ega gan an_ _2_ _ho hou our ur_ _ag ago go_ _pa pat ati tie ien ent nt_ _ca cal all ll_ _em em_ _wh whe hen en_ _be beg ega gan an_ _to to_ _fe fee eel el_ _li lig igh ght hth the hea ead ad_
_in int ntu tub ub_ _on on_ _ar arr rri riv iv_ _fo for or_ _re res esi sir ira rat ato tor ori ri_ _di dis ist str tre res ess ss_
_ec ecg cg_ _sh sho how ow_ _la lat ate ter er_ _st st_ _el ele lev ev_ _an and nd_ _a_ _ct ct_ _ch che hes est st_ _sc sca can an_ _wa was as_ _ne neg ega gat at_ _fo for or_ _ao aor ort rti tic ic_ _di dis iss sse sec ect ct_
_bl blo loo ood od_ _su sug uga gar ar_ _ex ext xtr tre rem em_ _el ele lev ev_ _50 500 00s 0s_ _in in_ _th the he_ _ed ed_
_on on_ _im imm mme med edi di_ _lh lhc hc_ _a_ _10 100 00_ _oc occ ccl clu lus us_ _of of_ _hi his is_ _om om1 m1_ _wa was as_ _fo fou oun und nd_ _an and nd_ _re rec eca can ann nn_ _as as_ _we wel ell ll_ _as as_ _st ste ten ent nt_ _wi wit ith th_ _a_ _vi vis isi sio ion on_ _bm bms ms_ _re res esu sul ult lt_ _in in_
_ti tim imi mi_ _2_ _fl flo low ow_
_an an_ _ia iab abp bp_ _wa was as_ _pl pla lac ace ce_ _wi wit ith th_ _in ini nit iti ti_ _au aug ugm gme men ent nt_ _di dia ias ast sto tol ol_ _pr pre res ess ssu sur ur_ _re rec eco cor ord rd_ _in in_ _th the he_ _80 80s 0s_ _st sta tar art rt_ _on on_ _le lev evo vop oph ph_ _do dob obu but uta tam ami min in_ _an and nd_ _do dop opa pam ami min in_
_re rec eco cor ord rd_ _da dat ate te_ _20 206 069 69_ _11 11_ _02 02_
_mr mr_ _ro rob obi bin ins nso son on_ _re rep epo por ort rt_ _to to_ _th the he_ _em eme mer erg rg_ _de dep epa par art rt_ _to tod oda day ay_ _wi wit ith th_ _di dif iff ffi fic icu cul ult lti ti_ _br bre rea eat ath th_ _an and nd_ _nu num umb mb_ _in in_ _ha han and nd_ _an and nd_ _fe fee eet et_
_go goe oe_ _aw awa way ay_ _af aft fte ter er_ _15 15_ _mi min inu nut ut_ _bu but ut_ _ha has as_ _be bee een en_ _ha hap app ppe pen en_ _mo mor ore re_ _fr fre req equ que uen ent nt_
_pa pat ati tie ien ent nt_ _ha has as_ _hi his ist sto tor ori ri_ _of of_ _as ast sth thm hma ma_ _bu but ut_ _sa say ay_ _th the hes ese se_ _sy sym ymp mpt pto tom om_ _do do_ _no not ot_ _ma mat atc tch ch_
_ph phy hys ysi sic ic_ _ex exa xam ami min in_ _ge gen ene ner era ral al_ _ap app ppe pea ear ar_ _no no_ _ac acu cut ut_ _di dis ist str tre res ess ss_ _pa pai ain in_ _fr fre ree ee_
_vi vit ita tal al_ _si sig ign gn_ _af afe feb ebr bri ril il_
_pu pul uls ls_ _95 95_ _re res esp spi pir ir_ _20 20_
_bl blo loo ood od_ _pr pre res ess ssu sur ur_ _13 135 35_ _85 85_ _pu pul uls ls_ _ox oxi xim ime met etr tri ri_ _is is_ _90 90_ _on on_ _ro roo oom om_ _ai air ir_
_ca car ard rdi dia iac ac_ _re reg egu gul ula lar ar_ _ra rat ate te_ _an and nd_ _rh rhy hyt yth thm hm_
_no nor orm rma mal al_ _s1 s1_ _an and nd_ _s2 s2_ _po pos oss ssi sib ibl bl_ _mu mur urm rmu mur ur_ _se sen ent nt_ _fo for or_ _ev eva val alu lut ut_
_ne nec eck ck_ _jv jvp vp_ _5_ _cm cm_
_lu lun ung ng_ _la lab abo bor or_ _br bre rea eat ath th_
_ab abd bdo dom ome men en_ _so sof oft ft_ _no non ont nte ten end nd_ _an and nd_ _no non ond ndi dis ist ste ten end nd_
_bo boo ood od_ _te tes est st_ _sh sho how ow_ _no nor orm rma mal al_ _hb hba ba1 a1c 1c_ _le lev eve vel el_
_re rev evi vie iew ew_ _of of_ _sy sys yst ste tem em_ _as as_ _in ind ndi dic ic_
_pa pas ast st_ _me med edi dic ic_ _hi his ist sto tor ori ri_ _in ins nsu sul uli lin in_ _de dep epe pen end nd_ _di dia iab abe bet et_ _si sin inc nc_ _25 25y 5yo yo_ _re ret eti tin in_ _ne neu eur uro rop opa pat ath thi hi_ _as ast sth thm hma ma_
_ob obe bes es_
_mi mi_ _tr tre rea eat at_ _he her ere re_ _pr pre rev evi vio iou ous us_
_is isc sch che hem emi mia ia_
_so soc oci cia ial al_ _fa fam ami mil ili li_ _hi his ist sto tor ori ri_ _no no_ _al alc lco coh oho hol ol_
_sm smo mok oke ke_ _in in_ _th the he_ _pa pas ast st_ _qu qui uit it_ _10 10_ _ye yea ear ar_ _ag ago go_
_fa fam ami mil ili li_ _hi his ist sto tor ori ri_ _of of_ _is isc sch che hem emi mia ia_ _an and nd_ _ca cad ad_
_me med edi dic ic_
_1_ _pr pro rov ovi vig igi gil il_
_2_ _at ate ten eno nol olo lol ol_
_3_ _at ati tiv iva van an_
_4_ _gl glu luc uco cop oph pha hag ag_ _85 850 50_ _mg mg_ _t_ _i_ _d_
_5_ _hu hum umu mul uli lin in_ _15 15_ _un uni nit it_ _at at_ _ni nig igh ght ht_
_6_ _fo fol ola lat at_
_7_ _me met eto top opr pro rol olo lol ol_
_8_ _ca car ard rdi dia ia_
_9_ _vi vit ita tam ami min in_ _e_
_10 10_ _co coa oat at_ _as asp spi pir iri rin in_
_re rec eco com omm mme men end nd_ _fu ful ull ll_ _ca car ard rdi dia iac ac_ _ev eva val alu lut ut_ _po pos oss ssi sib ibl bl_ _ne nee eed ed_ _fo for or_ _st ste ten ent nt_
_pa pat ati tie ien ent nt_ _op opt pt_ _to to_ _re ret etu tur urn rn_ _fo fol oll llo low ow_ _da day ay_

