From fe0f9c64c4df2736e6b2b60d976f2b6f2363876e Mon Sep 17 00:00:00 2001 From: Mohammed Saqib <msaqib3@gatech.edu> Date: Sat, 28 Oct 2017 20:34:46 -0400 Subject: [PATCH 01/10] explicitly declare code sections in readme.md, added requirements.txt, added .gitignore for possible config --- .gitignore | 3 ++ README.md | 93 ++++++++++++++++++++++++++---------------------- requirements.txt | 5 +++ 3 files changed, 58 insertions(+), 43 deletions(-) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index b65cab32..f2459449 100644 --- a/.gitignore +++ b/.gitignore @@ -96,3 +96,6 @@ ENV/ # States of models keras_states/ states/ + +# config for connection to DB +scripts/config.json diff --git a/README.md b/README.md index 26558705..e766f8f0 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,12 @@ In [Harutyunyan, Khachatrian, Kale, and Galstyan 2017](https://arxiv.org/abs/170 ## Requirements +Use the following command if you have pip installed + +``` +pip install -r requirements.txt +``` + We do not provide the MIMIC-III data itself. You must acquire the data yourself from https://mimic.physionet.org/. Specifically, download the CSVs. Otherwise, generally we make liberal use of the following packages: - numpy @@ -42,37 +48,38 @@ For logistic regression baselines [sklearn](http://scikit-learn.org/) is requir Here are the required steps to build the benchmark. It assumes that you already have MIMIC-III dataset (lots of CSV files) on the disk. 1. Clone the repo. +``` git clone https://github.com/YerevaNN/mimic3-benchmarks/ cd mimic3-benchmarks/ - +``` 2. Add the path to the `PYTHONPATH` (sorry for this). - +``` export PYTHONPATH=$PYTHONPATH:[PATH TO THIS REPOSITORY] - +``` 3. The following command takes MIMIC-III CSVs, generates one directory per `SUBJECT_ID` and writes ICU stay information to `data/[SUBJECT_ID/stays.csv`, diagnoses to `data/[SUBJECT_ID]/diagnoses.csv`, and events to `data/[SUBJECT_ID]/events.csv`. This step might take around an hour. 
- +``` python scripts/extract_subjects.py [PATH TO MIMIC-III CSVs] data/root/ - +``` 4. The following command attempts to fix some issues (ICU stay ID is missing) and removes the events that have missing information. 4741761 events (80%) remain after removing all suspicious rows. - +``` python scripts/validate_events.py data/root/ - +``` 5. The next command breaks up per-subject data into separate episodes (pertaining to ICU stays). Time series of events are stored in ```[SUBJECT_ID]/episode{#}_timeseries.csv``` (where # counts distinct episodes) while episode-level information (patient age, gender, ethnicity, height, weight) and outcomes (mortality, length of stay, diagnoses) are stores in ```[SUBJECT_ID]/episode{#}.csv```. This script requires two files, one that maps event ITEMIDs to clinical variables and another that defines valid ranges for clinical variables (for detecting outliers, etc.). - +``` python scripts/extract_episodes_from_subjects.py data/root/ - +``` 6. The next command splits the whole dataset into training and testing sets. Note that all benchmarks use the same split: - +``` python scripts/split_train_and_test.py data/root/ - +``` 7. The following commands will generate task-specific datasets, which can later be used in models. These commands are independent, if you are going to work only on one benchmark task, you can run only the corresponding command. - +``` python scripts/create_in_hospital_mortality.py data/root/ data/in-hospital-mortality/ python scripts/create_decompensation.py data/root/ data/decompensation/ python scripts/create_length_of_stay.py data/root/ data/length-of-stay/ python scripts/create_phenotyping.py data/root/ data/phenotyping/ python scripts/create_multitask.py data/root/ data/multitask/ - +``` ## Working with baseline models For each of the 4 main tasks we provide logistic regression and LSTM baselines. 
@@ -81,87 +88,87 @@ Please note that running linear models can take hours because of extensive grid ### Train / validation split Use the following command to extract validation set from the traning set. This step is required for running the baseline models. - +``` python mimic3models/split_train_val.py [TASK] - +``` `[TASK]` is either `in-hospital-mortality`, `decompensation`, `length-of-stay`, `phenotyping` or `multitask`. ### In-hospital mortality prediction Run the following command to train the neural network which gives the best result. We got the best performance on validation set after 8 epochs. - +``` cd mimic3models/in_hospital_mortality/ python -u main.py --network lstm --dim 256 --timestep 2.0 --mode train --batch_size 8 --log_every 30 - +``` To test the model use the following: - +``` python -u main.py --network lstm --dim 256 --timestep 2.0 --mode test --batch_size 8 --log_every 30 --load_state best_model.state - +``` Use the following command to train logistic regression. The best model we got used L2 regularization with `C=0.001`: - +``` cd mimic3models/in_hospital_mortality/logistic/ python -u main.py --l2 --C 0.001 - +``` ### Decompensation prediction The best model we got for this task was trained for 110 chunks (that's less than one epoch; it overfits before reaching one epoch because there are many training samples for the same patient with different lengths). - +``` cd mimic3models/decompensation/ python -u main.py --network lstm --dim 256 --mode train --batch_size 8 --log_every 30 - +``` Here is the command to test: - +``` python -u main.py --network lstm --dim 256 --mode test --batch_size 8 --log_every 30 --load_state best_model.state - +``` Use the following command to train a logistic regression. It will do a grid search over a small space of hyperparameters and will report the scores for every case. 
- +``` cd mimic3models/decompensation/logistic/ python -u main.py - +``` ### Length of stay prediction The best model we got for this task was trained for 15 chunks. - +``` cd mimic3models/length_of_stay/ python -u main.py --network lstm_cf_custom --dim 256 --mode train --batch_size 8 --log_every 30 - +``` Run the following command to test the best pretrained neural network. - +``` python -u main.py --network lstm_cf_custom --dim 256 --mode test --batch_size 8 --log_every 30 --load_state best_model.state - +``` Use the following command to train a logistic regression. It will do a grid search over a small space of hyperparameters and will report the scores for every case. - +``` cd mimic3models/length_of_stay/logistic/ python -u main_cf.py - +``` ### Phenotype classification The best model we got for this task was trained for 30 epochs. - +``` cd mimic3models/phenotyping/ python -u main.py --network lstm_2layer --dim 512 --mode train --batch_size 8 --log_every 30 - +``` Use the following command for testing: - +``` python -u main.py --network lstm_2layer --dim 512 --mode test --batch_size 8 --log_every 30 --load_state best_model.state - +``` Use the following command for logistic regression. It will do a grid search over a small space of hyperparameters and will report the scores for every case. - +``` cd mimic3models/phenotyping/logistic/ python -u main.py - +``` ### Multitask learning `ihm_C`, `decomp_C`, `los_C` and `ph_C` coefficients control the relative weight of the tasks in the multitask model. Default is `1.0`. The best model we got was trained for 12 epochs. 
- +``` cd mimic3models/multitask/ python -u main.py --network lstm --dim 1024 --mode train --batch_size 8 --log_every 30 --ihm_C 0.02 --decomp_C 0.1 --los_C 0.5 - +``` Use the following command for testing: - +``` python -u main.py --network lstm --dim 1024 --mode test --batch_size 8 --log_every 30 --load_state best_model.state - +``` ## General todos: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..d9e9913e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +numpy +pandas +pyyaml +psycopg2 +addict \ No newline at end of file From 3766a53439e052d2d64aca2865432bf3c788a5aa Mon Sep 17 00:00:00 2001 From: Mohammed Saqib <msaqib3@gatech.edu> Date: Sat, 28 Oct 2017 20:35:18 -0400 Subject: [PATCH 02/10] added some preliminary code for sql support --- mimic3benchmark/mimic3csv.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/mimic3benchmark/mimic3csv.py b/mimic3benchmark/mimic3csv.py index aa045227..08bc077b 100644 --- a/mimic3benchmark/mimic3csv.py +++ b/mimic3benchmark/mimic3csv.py @@ -3,9 +3,43 @@ import os import pandas as pd import sys +import psycopg2 from pandas import DataFrame +def convertListToSQL(listItems): + ''' + Transform a list of items, (usually ids) + from type int to format "(itemId1, itemId2)" + for sql + :param listItems a python list of stuff + :return string in sql format for "WHERE var IN" would work + ''' + toRet = "" + for item in listItems: + toRet += str(item) + ", " + toRet = "(" + toRet[0:-2] + ")" + return toRet + +def query(sql, config): + """ + :param sql Specific string query to run on the MIMIC3 sql database + :param config a dict/object containing fields dbname, user, host, password, and port + to create the connection to the database. 
+ :return: connection to database + """ + try: + config = Dict(config) + conn = psycopg2.connect("dbname='" + str(config.dbname) + + "' user='" + str(config.user) + + "' host='" + str(config.host) + + "' password='" + str(config.password) + + "' port='" + str(config.port) + "' ") + except: + raise + cur = conn.cursor() + cur.execute("SET search_path TO mimiciii") + return pd.read_sql(sql, conn) def read_patients_table(mimic3_path): pats = DataFrame.from_csv(os.path.join(mimic3_path, 'PATIENTS.csv')) From 75f0b7926517755c7253a5704ecbd3c6b90e35bb Mon Sep 17 00:00:00 2001 From: Mohammed Saqib <msaqib3@gatech.edu> Date: Sun, 29 Oct 2017 13:56:18 -0400 Subject: [PATCH 03/10] added options for sql to mimic3csv, without disrupting existing functions --- mimic3benchmark/mimic3csv.py | 58 ++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/mimic3benchmark/mimic3csv.py b/mimic3benchmark/mimic3csv.py index 08bc077b..0b256353 100644 --- a/mimic3benchmark/mimic3csv.py +++ b/mimic3benchmark/mimic3csv.py @@ -4,6 +4,7 @@ import pandas as pd import sys import psycopg2 +import json from pandas import DataFrame @@ -40,39 +41,70 @@ def query(sql, config): cur = conn.cursor() cur.execute("SET search_path TO mimiciii") return pd.read_sql(sql, conn) - -def read_patients_table(mimic3_path): - pats = DataFrame.from_csv(os.path.join(mimic3_path, 'PATIENTS.csv')) +def get_config(path): + ''' + Gets the config to connect to database (stored in json file) + :param path to the json + :return an object with fields to key info about connection to database + ''' + try: + config = json.load(open(path, "r")) + except: + print "could not open path: " + path + return config +def read_patients_table(mimic3_path, use_db = False): + if (use_db): + pats = query("SELECT * FROM patients", get_config(mimic3_path)) + pats.columns = pats.columns.str.upper() + else: + pats = DataFrame.from_csv(os.path.join(mimic3_path, 'PATIENTS.csv')) pats = pats[['SUBJECT_ID', 
'GENDER', 'DOB', 'DOD']] pats.DOB = pd.to_datetime(pats.DOB) pats.DOD = pd.to_datetime(pats.DOD) return pats -def read_admissions_table(mimic3_path): - admits = DataFrame.from_csv(os.path.join(mimic3_path, 'ADMISSIONS.csv')) +def read_admissions_table(mimic3_path, use_db = False): + if (use_db): + admits = query("SELECT * FROM admissions", get_config(mimic3_path)) + admits.columns = admits.columns.str.upper() + else: + admits = DataFrame.from_csv(os.path.join(mimic3_path, 'ADMISSIONS.csv')) admits = admits[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'DIAGNOSIS']] admits.ADMITTIME = pd.to_datetime(admits.ADMITTIME) admits.DISCHTIME = pd.to_datetime(admits.DISCHTIME) admits.DEATHTIME = pd.to_datetime(admits.DEATHTIME) return admits -def read_icustays_table(mimic3_path): - stays = DataFrame.from_csv(os.path.join(mimic3_path, 'ICUSTAYS.csv')) +def read_icustays_table(mimic3_path, use_db = False): + if (use_db): + stays = query("SELECT * FROM ICUSTAYS", get_config(mimic3_path)) + stays.columns = stays.columns.str.upper() + else: + stays = DataFrame.from_csv(os.path.join(mimic3_path, 'ICUSTAYS.csv')) stays.INTIME = pd.to_datetime(stays.INTIME) stays.OUTTIME = pd.to_datetime(stays.OUTTIME) return stays -def read_icd_diagnoses_table(mimic3_path): - codes = DataFrame.from_csv(os.path.join(mimic3_path, 'D_ICD_DIAGNOSES.csv')) +def read_icd_diagnoses_table(mimic3_path, use_db = False): + if (use_db): + codes = query("SELECT * FROM D_ICD_DIAGNOSES", get_config(mimic3_path)) + codes.columns = codes.columns.str.upper() + else: + codes = DataFrame.from_csv(os.path.join(mimic3_path, 'D_ICD_DIAGNOSES.csv')) codes = codes[['ICD9_CODE','SHORT_TITLE','LONG_TITLE']] diagnoses = DataFrame.from_csv(os.path.join(mimic3_path, 'DIAGNOSES_ICD.csv')) diagnoses = diagnoses.merge(codes, how='inner', left_on='ICD9_CODE', right_on='ICD9_CODE') diagnoses[['SUBJECT_ID','HADM_ID','SEQ_NUM']] = diagnoses[['SUBJECT_ID','HADM_ID','SEQ_NUM']].astype(int) return 
diagnoses -def read_events_table_by_row(mimic3_path, table): +def read_events_table_by_row(mimic3_path, table, use_db = False, items_to_keep = None, subjects_to_keep = None): nb_rows = { 'chartevents': 263201376, 'labevents': 27872576, 'outputevents': 4349340 } - reader = csv.DictReader(open(os.path.join(mimic3_path, table.upper() + '.csv'), 'r')) + if (use_db): + events = query("SELECT * FROM table WHERE subject_id in " + convertListToSQL(subjects_to_keep) " AND itemid in " + convertListToSQL(items_to_keep), get_config(mimic3_path)) + events.columns = events.columns.str.upper() + reader = events.iterrows() + else: + reader = csv.DictReader(open(os.path.join(mimic3_path, table.upper() + '.csv'), 'r')) for i,row in enumerate(reader): if 'ICUSTAY_ID' not in row: row['ICUSTAY_ID'] = '' @@ -161,7 +193,7 @@ def break_up_diagnoses_by_subject(diagnoses, output_path, subjects=None, verbose if verbose: sys.stdout.write('DONE!\n') -def read_events_table_and_break_up_by_subject(mimic3_path, table, output_path, items_to_keep=None, subjects_to_keep=None, verbose=1): +def read_events_table_and_break_up_by_subject(mimic3_path, table, output_path, items_to_keep=None, subjects_to_keep=None, verbose=1, use_db = False): obs_header = [ 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID', 'VALUE', 'VALUEUOM' ] if items_to_keep is not None: items_to_keep = set([ str(s) for s in items_to_keep ]) @@ -192,7 +224,7 @@ def write_current_observations(): w.writerows(nonlocal.curr_obs) nonlocal.curr_obs = [] - for row, row_no, nb_rows in read_events_table_by_row(mimic3_path, table): + for row, row_no, nb_rows in read_events_table_by_row(mimic3_path, table, items_to_keep, subjects_to_keep): if verbose and (row_no % 100000 == 0): if nonlocal.last_write_no != '': sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...last write ' From 82c30116c73de5f431d30bb17a034e43020f5f35 Mon Sep 17 00:00:00 2001 From: Mohammed Saqib <msaqib3@gatech.edu> Date: Sun, 29 Oct 2017 14:01:59 -0400 Subject: 
[PATCH 04/10] some debugging in places where args weren't updated, updated extract_subjects to have new use_db flag --- mimic3benchmark/mimic3csv.py | 2 +- scripts/extract_subjects.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/mimic3benchmark/mimic3csv.py b/mimic3benchmark/mimic3csv.py index 0b256353..8f4f535c 100644 --- a/mimic3benchmark/mimic3csv.py +++ b/mimic3benchmark/mimic3csv.py @@ -224,7 +224,7 @@ def write_current_observations(): w.writerows(nonlocal.curr_obs) nonlocal.curr_obs = [] - for row, row_no, nb_rows in read_events_table_by_row(mimic3_path, table, items_to_keep, subjects_to_keep): + for row, row_no, nb_rows in read_events_table_by_row(mimic3_path, table, use_db=use_db, items_to_keep=items_to_keep, subjects_to_keep=subjects_to_keep): if verbose and (row_no % 100000 == 0): if nonlocal.last_write_no != '': sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...last write ' diff --git a/scripts/extract_subjects.py b/scripts/extract_subjects.py index 30be6b20..8db6775c 100644 --- a/scripts/extract_subjects.py +++ b/scripts/extract_subjects.py @@ -6,12 +6,14 @@ from mimic3benchmark.preprocessing import add_hcup_ccs_2015_groups, make_phenotype_label_matrix parser = argparse.ArgumentParser(description='Extract per-subject data from MIMIC-III CSV files.') -parser.add_argument('mimic3_path', type=str, help='Directory containing MIMIC-III CSV files.') +parser.add_argument('mimic3_path', type=str, help='Directory containing MIMIC-III CSV files.' 
\ + + ' (if use_db is used, instead file path where config for connection is provided)') parser.add_argument('output_path', type=unicode, help='Directory where per-subject data should be written.') parser.add_argument('--event_tables', '-e', type=unicode, nargs='+', help='Tables from which to read events.', default=['CHARTEVENTS', 'LABEVENTS', 'OUTPUTEVENTS']) parser.add_argument('--phenotype_definitions', '-p', type=unicode, default='resources/hcup_ccs_2015_definitions.yaml', help='YAML file with phenotype definitions.') +parser.add_argument('--use_db', type=bool, action='store_true', default=False) parser.add_argument('--itemids_file', '-i', type=unicode, help='CSV containing list of ITEMIDs to keep.') parser.add_argument('--verbose', '-v', type=int, help='Level of verbosity in output.', default=1) parser.add_argument('--test', action='store_true', help='TEST MODE: process only 1000 subjects, 1000000 events.') @@ -22,9 +24,9 @@ except: pass -patients = read_patients_table(args.mimic3_path) -admits = read_admissions_table(args.mimic3_path) -stays = read_icustays_table(args.mimic3_path) +patients = read_patients_table(args.mimic3_path, use_db=args.use_db) +admits = read_admissions_table(args.mimic3_path, use_db=args.use_db) +stays = read_icustays_table(args.mimic3_path, use_db=args.use_db) if args.verbose: print 'START:', stays.ICUSTAY_ID.unique().shape[0], stays.HADM_ID.unique().shape[0], \ stays.SUBJECT_ID.unique().shape[0] @@ -52,7 +54,7 @@ stays.SUBJECT_ID.unique().shape[0] stays.to_csv(os.path.join(args.output_path, 'all_stays.csv'), index=False) -diagnoses = read_icd_diagnoses_table(args.mimic3_path) +diagnoses = read_icd_diagnoses_table(args.mimic3_path, use_db=args.use_db) diagnoses = filter_diagnoses_on_stays(diagnoses, stays) diagnoses.to_csv(os.path.join(args.output_path, 'all_diagnoses.csv'), index=False) count_icd_codes(diagnoses, output_path=os.path.join(args.output_path, 'diagnosis_counts.csv')) @@ -74,4 +76,4 @@ [int(itemid) for itemid in 
DataFrame.from_csv(args.itemids_file)['ITEMID'].unique()]) if args.itemids_file else None for table in args.event_tables: read_events_table_and_break_up_by_subject(args.mimic3_path, table, args.output_path, items_to_keep=items_to_keep, - subjects_to_keep=subjects, verbose=args.verbose) + subjects_to_keep=subjects, verbose=args.verbose, use_db=args.use_db) From 074b8c705acdb00c3f1223523261cd8db2e57cca Mon Sep 17 00:00:00 2001 From: Mohammed Saqib <msaqib3@gatech.edu> Date: Sun, 29 Oct 2017 19:40:44 -0400 Subject: [PATCH 05/10] added requirements in standardized txt, changed readme --- README.md | 6 ++++++ requirements.txt | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e766f8f0..a13db584 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,12 @@ Use the following command if you have pip installed pip install -r requirements.txt ``` +If you have Anaconda + +``` +conda install --f requirements.txt +``` + We do not provide the MIMIC-III data itself. You must acquire the data yourself from https://mimic.physionet.org/. Specifically, download the CSVs. 
Otherwise, generally we make liberal use of the following packages: - numpy diff --git a/requirements.txt b/requirements.txt index d9e9913e..03f84465 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ numpy pandas pyyaml -psycopg2 -addict \ No newline at end of file +psycopg2 \ No newline at end of file From 3742b52ed950ba338712b77e59d117b799e0d200 Mon Sep 17 00:00:00 2001 From: Mohammed Saqib <msaqib3@gatech.edu> Date: Sun, 29 Oct 2017 19:41:15 -0400 Subject: [PATCH 06/10] debugging with sql support --- mimic3benchmark/mimic3csv.py | 68 ++++++++++++++++++++++-------------- scripts/extract_subjects.py | 2 +- 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/mimic3benchmark/mimic3csv.py b/mimic3benchmark/mimic3csv.py index 8f4f535c..6f0700d9 100644 --- a/mimic3benchmark/mimic3csv.py +++ b/mimic3benchmark/mimic3csv.py @@ -30,17 +30,18 @@ def query(sql, config): :return: connection to database """ try: - config = Dict(config) - conn = psycopg2.connect("dbname='" + str(config.dbname) - + "' user='" + str(config.user) - + "' host='" + str(config.host) - + "' password='" + str(config.password) - + "' port='" + str(config.port) + "' ") + conn = psycopg2.connect("dbname='" + str(config["dbname"]) \ + + "' user='" + str(config["user"]) \ + + "' host='" + str(config["host"]) \ + + "' password='" + str(config["password"]) \ + + "' port='" + str(config["port"]) + "' ") except: raise cur = conn.cursor() cur.execute("SET search_path TO mimiciii") - return pd.read_sql(sql, conn) + df = pd.read_sql(sql, conn) + df.columns = df.columns.str.upper() + return df def get_config(path): ''' Gets the config to connect to database (stored in json file) @@ -55,7 +56,6 @@ def get_config(path): def read_patients_table(mimic3_path, use_db = False): if (use_db): pats = query("SELECT * FROM patients", get_config(mimic3_path)) - pats.columns = pats.columns.str.upper() else: pats = DataFrame.from_csv(os.path.join(mimic3_path, 'PATIENTS.csv')) pats = 
pats[['SUBJECT_ID', 'GENDER', 'DOB', 'DOD']] @@ -66,7 +66,6 @@ def read_patients_table(mimic3_path, use_db = False): def read_admissions_table(mimic3_path, use_db = False): if (use_db): admits = query("SELECT * FROM admissions", get_config(mimic3_path)) - admits.columns = admits.columns.str.upper() else: admits = DataFrame.from_csv(os.path.join(mimic3_path, 'ADMISSIONS.csv')) admits = admits[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'DIAGNOSIS']] @@ -78,7 +77,6 @@ def read_admissions_table(mimic3_path, use_db = False): def read_icustays_table(mimic3_path, use_db = False): if (use_db): stays = query("SELECT * FROM ICUSTAYS", get_config(mimic3_path)) - stays.columns = stays.columns.str.upper() else: stays = DataFrame.from_csv(os.path.join(mimic3_path, 'ICUSTAYS.csv')) stays.INTIME = pd.to_datetime(stays.INTIME) @@ -88,11 +86,13 @@ def read_icustays_table(mimic3_path, use_db = False): def read_icd_diagnoses_table(mimic3_path, use_db = False): if (use_db): codes = query("SELECT * FROM D_ICD_DIAGNOSES", get_config(mimic3_path)) - codes.columns = codes.columns.str.upper() else: codes = DataFrame.from_csv(os.path.join(mimic3_path, 'D_ICD_DIAGNOSES.csv')) codes = codes[['ICD9_CODE','SHORT_TITLE','LONG_TITLE']] - diagnoses = DataFrame.from_csv(os.path.join(mimic3_path, 'DIAGNOSES_ICD.csv')) + if (use_db): + diagnoses = query("SELECT * FROM DIAGNOSES_ICD", get_config(mimic3_path)) + else: + diagnoses = DataFrame.from_csv(os.path.join(mimic3_path, 'DIAGNOSES_ICD.csv')) diagnoses = diagnoses.merge(codes, how='inner', left_on='ICD9_CODE', right_on='ICD9_CODE') diagnoses[['SUBJECT_ID','HADM_ID','SEQ_NUM']] = diagnoses[['SUBJECT_ID','HADM_ID','SEQ_NUM']].astype(int) return diagnoses @@ -100,15 +100,29 @@ def read_icd_diagnoses_table(mimic3_path, use_db = False): def read_events_table_by_row(mimic3_path, table, use_db = False, items_to_keep = None, subjects_to_keep = None): nb_rows = { 'chartevents': 263201376, 'labevents': 27872576, 
'outputevents': 4349340 } if (use_db): - events = query("SELECT * FROM table WHERE subject_id in " + convertListToSQL(subjects_to_keep) " AND itemid in " + convertListToSQL(items_to_keep), get_config(mimic3_path)) - events.columns = events.columns.str.upper() - reader = events.iterrows() + if items_to_keep is None and subjects_to_keep is None: + events = query("SELECT * FROM " + table, get_config(mimic3_path)) + else: + queryString = "SELECT * FROM " + table + " WHERE " + if subjects_to_keep is not None: + queryString = queryString + "subject_id in " + convertListToSQL(subjects_to_keep) + if items_to_keep is not None: + queryString = queryString + " AND " + if items_to_keep is not None: + queryString = queryString + " itemid in " + convertListToSQL(items_to_keep) + events = query(queryString, get_config(mimic3_path)) + if 'ICUSTAY_ID' not in events.columns: + events['ICUSTAY_ID'] = pd.Series(dtype=str) + events['ICUSTAY_ID'].fillna(""); + for i, row in events.iterrows(): + yield row, i, nb_rows[table.lower()] + else: reader = csv.DictReader(open(os.path.join(mimic3_path, table.upper() + '.csv'), 'r')) - for i,row in enumerate(reader): - if 'ICUSTAY_ID' not in row: - row['ICUSTAY_ID'] = '' - yield row, i, nb_rows[table.lower()] + for i,row in enumerate(reader): + if 'ICUSTAY_ID' not in row: + row['ICUSTAY_ID'] = '' + yield row, i, nb_rows[table.lower()] def count_icd_codes(diagnoses, output_path=None): codes = diagnoses[['ICD9_CODE','SHORT_TITLE','LONG_TITLE']].drop_duplicates().set_index('ICD9_CODE') @@ -252,12 +266,12 @@ def write_current_observations(): nonlocal.curr_obs.append(row_out) nonlocal.curr_subject_id = row['SUBJECT_ID'] - if nonlocal.curr_subject_id != '': - write_current_observations() + if nonlocal.curr_subject_id != '': + write_current_observations() - if verbose and (row_no % 100000 == 0): - sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...last write ' - '({3}) {4} rows for subject {5}...DONE!\n'.format(table, row_no, nb_rows, - 
nonlocal.last_write_no, - nonlocal.last_write_nb_rows, - nonlocal.last_write_subject_id)) + if verbose and (row_no % 100000 == 0): + sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...last write ' + '({3}) {4} rows for subject {5}...DONE!\n'.format(table, row_no, nb_rows, + nonlocal.last_write_no, + nonlocal.last_write_nb_rows, + nonlocal.last_write_subject_id)) diff --git a/scripts/extract_subjects.py b/scripts/extract_subjects.py index 8db6775c..e4b698c7 100644 --- a/scripts/extract_subjects.py +++ b/scripts/extract_subjects.py @@ -13,7 +13,7 @@ default=['CHARTEVENTS', 'LABEVENTS', 'OUTPUTEVENTS']) parser.add_argument('--phenotype_definitions', '-p', type=unicode, default='resources/hcup_ccs_2015_definitions.yaml', help='YAML file with phenotype definitions.') -parser.add_argument('--use_db', type=bool, action='store_true', default=False) +parser.add_argument('--use_db', action='store_true', default=False) parser.add_argument('--itemids_file', '-i', type=unicode, help='CSV containing list of ITEMIDs to keep.') parser.add_argument('--verbose', '-v', type=int, help='Level of verbosity in output.', default=1) parser.add_argument('--test', action='store_true', help='TEST MODE: process only 1000 subjects, 1000000 events.') From a7f49ae17d0534ca7c860ede54a07c5ff9e06426 Mon Sep 17 00:00:00 2001 From: Mohammed Saqib <msaqib3@gatech.edu> Date: Sun, 29 Oct 2017 19:55:56 -0400 Subject: [PATCH 07/10] fixed issue in mimic3csv.py with rowno reading --- README.md | 4 +++- mimic3benchmark/mimic3csv.py | 13 +++---------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index a13db584..0fa48c9d 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ If you have Anaconda ``` conda install --f requirements.txt ``` - +### Details on Requirements We do not provide the MIMIC-III data itself. You must acquire the data yourself from https://mimic.physionet.org/. Specifically, download the CSVs. 
Otherwise, generally we make liberal use of the following packages: - numpy @@ -62,6 +62,8 @@ Here are the required steps to build the benchmark. It assumes that you already ``` export PYTHONPATH=$PYTHONPATH:[PATH TO THIS REPOSITORY] ``` +If you are using Anaconda, setting the PYTHONPATH may cause problems, so you may need to copy the scripts to the base directory of the repo. + 3. The following command takes MIMIC-III CSVs, generates one directory per `SUBJECT_ID` and writes ICU stay information to `data/[SUBJECT_ID/stays.csv`, diagnoses to `data/[SUBJECT_ID]/diagnoses.csv`, and events to `data/[SUBJECT_ID]/events.csv`. This step might take around an hour. ``` python scripts/extract_subjects.py [PATH TO MIMIC-III CSVs] data/root/ diff --git a/mimic3benchmark/mimic3csv.py b/mimic3benchmark/mimic3csv.py index 6f0700d9..a86f47d7 100644 --- a/mimic3benchmark/mimic3csv.py +++ b/mimic3benchmark/mimic3csv.py @@ -115,7 +115,7 @@ def read_events_table_by_row(mimic3_path, table, use_db = False, items_to_keep = events['ICUSTAY_ID'] = pd.Series(dtype=str) events['ICUSTAY_ID'].fillna(""); for i, row in events.iterrows(): - yield row, i, nb_rows[table.lower()] + yield row, i, events.shape[0] else: reader = csv.DictReader(open(os.path.join(mimic3_path, table.upper() + '.csv'), 'r')) @@ -266,12 +266,5 @@ def write_current_observations(): nonlocal.curr_obs.append(row_out) nonlocal.curr_subject_id = row['SUBJECT_ID'] - if nonlocal.curr_subject_id != '': - write_current_observations() - - if verbose and (row_no % 100000 == 0): - sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...last write ' - '({3}) {4} rows for subject {5}...DONE!\n'.format(table, row_no, nb_rows, - nonlocal.last_write_no, - nonlocal.last_write_nb_rows, - nonlocal.last_write_subject_id)) + if nonlocal.curr_subject_id != '': + write_current_observations() From 38a980fa4aa8ad97b0f651bf3881e5d235e73d80 Mon Sep 17 00:00:00 2001 From: Mohammed Saqib <msaqib3@gatech.edu> Date: Sun, 29 Oct 2017 20:09:13 -0400 Subject: 
[PATCH 08/10] added documentation on how to use extract_subjects in case of need for SQL --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 0fa48c9d..e1c1b8f0 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,24 @@ If you are using Anaconda, setting the PYTHONPATH may cause problems, so you may ``` python scripts/extract_subjects.py [PATH TO MIMIC-III CSVs] data/root/ ``` + +If you are using a SQL database (currently only PostgreSQL is supported), create the following JSON file: +``` +{ + "dbname": "mimic", + "user": "username", + "host": "hostname", + "password": "password", + "port": "port" +} +``` +replacing the placeholder values with your own connection settings. + +Then run this: +``` + python scripts/extract_subjects.py [PATH TO JSON FILE] data/root/ --use_db +``` + 4. The following command attempts to fix some issues (ICU stay ID is missing) and removes the events that have missing information. 4741761 events (80%) remain after removing all suspicious rows. 
``` python scripts/validate_events.py data/root/ From 166a54e54ad2df0073debfa2dafdf35bde0c6628 Mon Sep 17 00:00:00 2001 From: Mohammed Saqib <msaqib3@gatech.edu> Date: Sat, 4 Nov 2017 15:04:18 -0400 Subject: [PATCH 09/10] made changes to make mimic3csv match correctly with df --- mimic3benchmark/mimic3csv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mimic3benchmark/mimic3csv.py b/mimic3benchmark/mimic3csv.py index a86f47d7..c8978c6a 100644 --- a/mimic3benchmark/mimic3csv.py +++ b/mimic3benchmark/mimic3csv.py @@ -210,9 +210,9 @@ def break_up_diagnoses_by_subject(diagnoses, output_path, subjects=None, verbose def read_events_table_and_break_up_by_subject(mimic3_path, table, output_path, items_to_keep=None, subjects_to_keep=None, verbose=1, use_db = False): obs_header = [ 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID', 'VALUE', 'VALUEUOM' ] if items_to_keep is not None: - items_to_keep = set([ str(s) for s in items_to_keep ]) + items_to_keep = set([ int(s) for s in items_to_keep ]) if subjects_to_keep is not None: - subjects_to_keep = set([ str(s) for s in subjects_to_keep ]) + subjects_to_keep = set([ int(s) for s in subjects_to_keep ]) class nonlocal: pass nonlocal.curr_subject_id = '' @@ -249,9 +249,9 @@ def write_current_observations(): else: sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...'.format(table, row_no, nb_rows)) - if (subjects_to_keep is not None and row['SUBJECT_ID'] not in subjects_to_keep): + if (subjects_to_keep is not None and int(row['SUBJECT_ID']) not in subjects_to_keep): continue - if (items_to_keep is not None and row['ITEMID'] not in items_to_keep): + if (items_to_keep is not None and int(row['ITEMID']) not in items_to_keep): continue row_out = { 'SUBJECT_ID': row['SUBJECT_ID'], From 78770560631f7ca404bed48c49c5996fe755d19b Mon Sep 17 00:00:00 2001 From: Saqib <saqibm128@gmail.com> Date: Mon, 6 Nov 2017 22:52:18 -0500 Subject: [PATCH 10/10] print problem --- mimic3benchmark/mimic3csv.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mimic3benchmark/mimic3csv.py b/mimic3benchmark/mimic3csv.py index e78fde7a..3e152816 100644 --- a/mimic3benchmark/mimic3csv.py +++ b/mimic3benchmark/mimic3csv.py @@ -51,7 +51,7 @@ def get_config(path): try: config = json.load(open(path, "r")) except: - print "could not open path: " + path + print("could not open path: " + path) return config def read_patients_table(mimic3_path, use_db = False): if (use_db):