|
| 1 | +""" |
| 2 | +This script ingests both clinical and vocabulary OMOP CSV exports into a single DuckDB database for |
| 3 | +downstream use of the core BiasAnalyzer Python library. |
| 4 | +Example for running this script: |
| 5 | + python scripts/ingest_csvs_to_omop_duckdb.py \ |
| 6 | + --clinical data/clinical \ |
| 7 | + --vocab data/omop_vocabs \ |
| 8 | + --output data/omop.duckdb |
| 9 | +""" |
| 10 | + |
| 11 | +import duckdb |
| 12 | +import time |
| 13 | +import argparse |
| 14 | +import sys |
| 15 | +from pathlib import Path |
| 16 | + |
| 17 | + |
def load_csv_to_duckdb(con, csv_path: Path, table_name: str):
    """Load a single CSV file into a DuckDB table, replacing any existing one.

    Args:
        con: Open database connection; only its ``execute`` method is used.
        csv_path: Path of the CSV file to read.
        table_name: Name of the table to create or replace.

    Returns:
        A ``(row_count, elapsed_seconds)`` tuple for the loaded table.
    """
    # Both values originate from file names on disk, so escape them before
    # splicing them into SQL: double any single quote inside the string
    # literal, and double-quote the identifier so names containing dashes,
    # spaces or reserved words remain valid.
    path_literal = "'" + str(csv_path).replace("'", "''") + "'"
    table_ident = '"' + table_name.replace('"', '""') + '"'

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    started = time.perf_counter()
    print(f'loading {table_name} from {csv_path}')
    # NOTE(review): quote='' is kept from the original; it presumably turns
    # off quote processing for exports containing raw quote characters --
    # confirm against the data.
    con.execute(f"""
        CREATE OR REPLACE TABLE {table_ident} AS
        SELECT * FROM read_csv_auto({path_literal}, header=True, quote='', parallel=True)
    """)
    row_count = con.execute(f"SELECT COUNT(*) FROM {table_ident}").fetchone()[0]
    elapsed = time.perf_counter() - started
    print(f"Loaded {table_name} ({row_count} rows) in {elapsed:5.2f}s")
    return row_count, elapsed
| 30 | + |
| 31 | + |
def ingest_directory(con, csv_dir: Path):
    """Load every ``*.csv`` file found directly inside ``csv_dir``.

    Files are processed in sorted order and each one becomes a table named
    after the lower-cased file stem.

    Args:
        con: Open database connection handed through to the loader.
        csv_dir: Directory to scan for CSV files.

    Returns:
        A list of ``(table_name, row_count, elapsed_seconds)`` tuples, one per
        loaded file; empty when the directory is missing or has no CSVs.
    """
    loaded = []
    if csv_dir.exists():
        for source_file in sorted(csv_dir.glob("*.csv")):
            name = source_file.stem.lower()
            row_count, seconds = load_csv_to_duckdb(con, source_file, name)
            loaded.append((name, row_count, seconds))
    else:
        # A missing directory is reported but is not an error at this level.
        print(f"directory not found: {csv_dir}")
    return loaded
| 44 | + |
| 45 | + |
def main():
    """Parse command-line arguments and ingest OMOP CSV directories into DuckDB.

    At least one of ``--clinical`` or ``--vocab`` must be supplied together
    with ``--output``. Every supplied input directory is checked before the
    output database is created, so a bad path never leaves behind an empty
    database file or a half-finished ingestion.

    Exits with status 1 when no input directory is given or when a given
    directory does not exist.
    """
    parser = argparse.ArgumentParser(description="Ingest OMOP CSVs into DuckDB")
    parser.add_argument("--clinical", type=Path, required=False,
                        help="Directory containing OMOP clinical CSVs (person, condition_occurrence, etc.)")
    parser.add_argument("--vocab", type=Path, required=False,
                        help="Directory containing OMOP vocabulary CSVs (concept, concept_relationship, etc.)")
    parser.add_argument("--output", type=Path, required=True,
                        help="Output DuckDB file path")

    args = parser.parse_args()

    input_clinical = args.clinical
    input_vocab = args.vocab
    db_path = args.output

    if input_clinical is None and input_vocab is None:
        print("Error: You must provide at least one of --clinical or --vocab for data ingestion.",
              file=sys.stderr)
        sys.exit(1)

    # Validate all inputs up front, before any output is created or any
    # connection is opened. Clinical data is still ingested before vocabulary.
    sources = []
    for label, directory in (("Clinical", input_clinical), ("Vocabulary", input_vocab)):
        if directory is None:
            continue
        if not directory.exists():
            print(f"{label} directory does not exist: {directory}", file=sys.stderr)
            sys.exit(1)
        sources.append(directory)

    print(f"Creating DuckDB at: {db_path}")
    db_path.parent.mkdir(parents=True, exist_ok=True)

    all_results = []
    con = duckdb.connect(str(db_path))
    try:
        for directory in sources:
            all_results += ingest_directory(con, directory)
    finally:
        # Always release the database file, even if a load fails midway.
        con.close()

    print(f"Ingestion complete with {len(all_results)} tables loaded. Details shown below:")
    print(f"\n{all_results}")
| 87 | + |
if __name__ == "__main__":
    # Run the ingestion only when executed as a script, not when imported.
    main()
0 commit comments