From 73ad8c20f034c809b180df8c85ab2806ae34f330 Mon Sep 17 00:00:00 2001 From: hyi Date: Sun, 15 Jun 2025 16:29:24 -0400 Subject: [PATCH 01/10] update pyproject.toml to have it pypi-release ready --- pyproject.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c3e1c21..5f63a93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,8 @@ version = "0.1.0" description = "A python package for health data bias quantification to support visual analytics techniques for tracking and communicating bias in cohort selection" authors = ["Hong Yi "] readme = "README.md" +license="MIT" +keywords=["bias", "healthcare", "cohort", "OMOP", "analytics", "observational research"] include = [ {path = "biasanalyzer/sql_templates/*.sql", format=["sdist", "wheel"]} ] @@ -29,9 +31,6 @@ pytest = "^8.3.3" [tool.poetry.group.dev.dependencies] pytest-cov = "5.0.0" -[tool.setuptools.package-data] -biasanalyzer = ["biasanalyzer/sql_templates/*.sql"] - [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" From 2fe47a7b24da35a560740db4ae1f7c54f85f7272 Mon Sep 17 00:00:00 2001 From: hyi Date: Mon, 16 Jun 2025 18:21:26 -0400 Subject: [PATCH 02/10] updated cohorts notebook from developer testing based to user tutorial --- notebooks/BiasAnalyzerCohortsTutorial.ipynb | 548 +++++++++++++++++++ notebooks/BiasAnalyzerTestingCohorts.ipynb | 577 -------------------- 2 files changed, 548 insertions(+), 577 deletions(-) create mode 100644 notebooks/BiasAnalyzerCohortsTutorial.ipynb delete mode 100644 notebooks/BiasAnalyzerTestingCohorts.ipynb diff --git a/notebooks/BiasAnalyzerCohortsTutorial.ipynb b/notebooks/BiasAnalyzerCohortsTutorial.ipynb new file mode 100644 index 0000000..8957503 --- /dev/null +++ b/notebooks/BiasAnalyzerCohortsTutorial.ipynb @@ -0,0 +1,548 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3bf748e7", + "metadata": {}, + "source": [ + "# BiasAnalyzer Tutorial: Exploring 
Cohorts\n", + "\n", + "This tutorial demonstrates how to use the `BiasAnalyzer` package to create and analyze cohorts by connecting to an [OMOP (Observational Medical Outcomes Partnership) CDM (Common Data Model)](https://www.ohdsi.org/data-standardization/) database. The currently supported database types include postgreSQL and duckDB. \n", + "\n", + "---\n", + "\n", + "### Overview\n", + "\n", + "**Objective**: \n", + "Guide users through the creation, exploration, and comparison of a baseline and a study cohort using `BiasAnalyzer`, illustrating how to define, explore, and compare them.\n", + "\n", + "**Before You Begin**:\n", + "The `BiasAnalyzer` package is currently in active development and has not yet been officially released on PyPI.\n", + "You can install it in one of the two ways:\n", + "\n", + "- **Install from GitHub (recommended during development)**:\n", + "```bash\n", + "pip install git+https://github.com/vaclab/BiasAnalyzer.git\n", + "```\n", + "- **Install from PyPI (once the package is officially released)**:\n", + "```bash\n", + "pip install biasanalyzer\n", + "```\n", + "For full setup and usage instructions, refer to the [README](https://github.com/VACLab/BiasAnalyzer/blob/main/README.md).\n", + "\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "c5618746", + "metadata": {}, + "source": [ + "### Preparation for cohort creation\n", + "**Preparation step 1**: Import the `BIAS` class from the `api` module of the `BiasAnalyzer` package" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4de3a621", + "metadata": {}, + "outputs": [], + "source": [ + "from biasanalyzer.api import BIAS" + ] + }, + { + "cell_type": "markdown", + "id": "46559918", + "metadata": {}, + "source": [ + "**Preparation step 2**: Create an object of the `BIAS` class" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "86862060", + "metadata": {}, + "outputs": [], + "source": [ + "bias = BIAS()" + ] + }, + { + "cell_type": 
"markdown", + "id": "ff3bdfd8", + "metadata": {}, + "source": [ + "**Preparation step 3**: Specify OMOP Common Data Model (CDM) database configurations on the `bias` object to allow connection to the OMOP CDM database for cohort creation and selection bias analysis. A configuration file must include the root_omop_cdm_database key. An example of the configuration file is shown below:\n", + "```\n", + "root_omop_cdm_database:\n", + " database_type: duckdb # set it to one of the two supported types: postgresql or duckdb\n", + " username: test_username\n", + " password: test_password\n", + " hostname: test_db_hostname\n", + " database: \"shared_test_db.duckdb\" # use a shared name for an in-memory duckdb or database name for postgresql\n", + " port: 5432\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "83e992d1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "configuration specified in ../config.yaml loaded successfully\n" + ] + } + ], + "source": [ + "bias.set_config('../config.yaml')" + ] + }, + { + "cell_type": "markdown", + "id": "6d9c7881-0029-470c-ae84-6eb420c10ae9", + "metadata": {}, + "source": [ + "**Preparation step 4**: Set OMOP CDM database as specified in the configuration on the `bias` object to connect to the OMOP CDM database and create Cohort Definition metadata table and Cohort data table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "283156f8-63da-42a5-bbd7-ee2b7719652c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected to the OMOP CDM database (read-only).\n", + "Cohort Definition table created.\n", + "Cohort table created.\n" + ] + } + ], + "source": [ + "bias.set_root_omop()" + ] + }, + { + "cell_type": "markdown", + "id": "c7219629-1a30-44af-9ec5-5eb9b4a52c5a", + "metadata": {}, + "source": [ + "---\n", + "\n", + "**Now that you have connected to your OMOP CDM database, you can start to use the APIs to explore your data. The rest of this notebook illustrates how to create and explore a baseline and a study cohort, and then compare them using the BiasAnalyzer APIs.**\n", + "\n", + "### Baseline cohort creation and exploration\n", + "**Baseline cohort creation**: Create a baseline cohort of young female patients on the `bias` object by calling the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function and passing the name of the cohort (first argument), the description of the cohort (second argument), a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query (third argument), and the cohort owner's name indicating who owns or creates this cohort (fourth argument). The function will show a progress bar to indicate cohort creation progress over three stages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a68f3eaf-92fd-49a2-9768-d685d826fd57", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "template_path: /home/hongyi/BiasAnalyzer/biasanalyzer/sql_templates\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d51f3cdd95894de3ae541cae8ec581da", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Cohort creation: 0%| | 0/3 [00:00= 2000 AND p.year_of_birth <= 2020 GROUP BY c.person_id ) SELECT * FROM filtered_cohort f', 'created_by': 'system'}\n", + "The total number of patients in the baseline cohort: 12360\n", + "The first five patients in the baseline cohort: [{'subject_id': 42583, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 4, 26), 'cohort_end_date': datetime.date(2020, 5, 12)}, {'subject_id': 33685, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2017, 12, 8), 'cohort_end_date': datetime.date(2020, 5, 10)}, {'subject_id': 74383, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2019, 1, 31), 'cohort_end_date': datetime.date(2020, 3, 25)}, {'subject_id': 23986, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2019, 6, 15), 'cohort_end_date': datetime.date(2020, 3, 28)}, {'subject_id': 93962, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2019, 7, 1), 'cohort_end_date': datetime.date(2020, 5, 15)}]\n" + ] + } + ], + "source": [ + "baseline_cohort_def = baseline_cohort.metadata\n", + "print(f'Baseline cohort definition metadata: {baseline_cohort_def}')\n", + "baseline_cohort_data = baseline_cohort.data\n", + "print(f'The total number of patients in the baseline cohort: {len(baseline_cohort_data)}')\n", + "print(f'The first five patients in the baseline cohort: {baseline_cohort_data[:5]}')" + ] + }, + { + "cell_type": "markdown", + "id": "e25fea43-d14e-42cc-8072-063455336fae", + "metadata": {}, + 
"source": [ + "———————————————\n", + "\n", + "**Baseline cohort deeper exploration**: you can get statistics on age, gender, race, and ethnicity of the baseline cohort by calling `get_stats()` method on the created baseline cohort object. You can also get cohort distributions on age and gender by calling `get_distributions()` method on the created baseline cohort object." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b1415805-b065-40b8-b2cd-6db6f5f9ca41", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the baseline cohort stats: [{'total_count': 12360, 'earliest_start_date': datetime.date(2000, 2, 19), 'latest_start_date': datetime.date(2020, 5, 26), 'earliest_end_date': datetime.date(2002, 7, 20), 'latest_end_date': datetime.date(2020, 5, 27), 'min_duration_days': 0, 'max_duration_days': 7379, 'avg_duration_days': 1192.32, 'median_duration': 296, 'stddev_duration': 1779.19}]\n", + "the baseline cohort age stats: [{'total_count': 12360, 'min_age': 0, 'max_age': 25, 'avg_age': 7.24, 'median_age': 6, 'stddev_age': 6.01}]\n", + "the baseline cohort gender stats: [{'gender': 'female', 'gender_count': 12360, 'probability': 1.0}]\n", + "the baseline cohort race stats: [{'race': 'Other', 'race_count': 66, 'probability': 0.01}, {'race': 'Asian', 'race_count': 878, 'probability': 0.07}, {'race': 'Black or African American', 'race_count': 1056, 'probability': 0.09}, {'race': 'White', 'race_count': 10360, 'probability': 0.84}]\n", + "the baseline cohort ethnicity stats: [{'ethnicity': 'other', 'ethnicity_count': 12360, 'probability': 1.0}]\n" + ] + } + ], + "source": [ + "# get stats of the baseline cohort\n", + "cohort_stats = baseline_cohort.get_stats()\n", + "print(f'the baseline cohort stats: {cohort_stats}')\n", + "cohort_age_stats = baseline_cohort.get_stats(\"age\")\n", + "print(f'the baseline cohort age stats: {cohort_age_stats}')\n", + "cohort_gender_stats = 
baseline_cohort.get_stats(\"gender\")\n", + "print(f'the baseline cohort gender stats: {cohort_gender_stats}')\n", + "cohort_race_stats = baseline_cohort.get_stats(\"race\")\n", + "print(f'the baseline cohort race stats: {cohort_race_stats}')\n", + "cohort_ethnicity_stats = baseline_cohort.get_stats(\"ethnicity\")\n", + "print(f'the baseline cohort ethnicity stats: {cohort_ethnicity_stats}')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d54e39da-6f78-4dc1-91ae-a8c26852582a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the baseline cohort age discrete probability distribution: [{'age_bin': '0-10', 'bin_count': 8230, 'probability': 0.6659}, {'age_bin': '11-20', 'bin_count': 4129, 'probability': 0.3341}, {'age_bin': '21-30', 'bin_count': 1, 'probability': 0.0001}, {'age_bin': '31-40', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '41-50', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '51-60', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '61-70', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '71-80', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '81-90', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '91+', 'bin_count': 0, 'probability': 0.0}]\n" + ] + } + ], + "source": [ + "# get discrete probability distribution of the age variable in the baseline cohort\n", + "cohort_age_distr = baseline_cohort.get_distributions('age')\n", + "print(f'the baseline cohort age discrete probability distribution: {cohort_age_distr}')" + ] + }, + { + "cell_type": "markdown", + "id": "5d92f81a-99f8-4534-bcb1-29369262c17e", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Study cohort creation and exploration\n", + "**Study cohort creation**: Create a study cohort of young female COVID patients on the bias object by calling the create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by) function and passing the name of the cohort (first argument), the 
description of the cohort (second argument), a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query (third argument), and the cohort owner's name indicating who owns or creates this cohort (fourth argument). The function will show a progress bar to indicate cohort creation progress over three stages." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e3f5ace2-6cc4-4940-a067-e1a3fc14e1ce", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8afad29563224f62b1d76a5f9f201490", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Cohort creation: 0%| | 0/3 [00:00= 2000 AND p.year_of_birth <= 2020 GROUP BY c.person_id ) SELECT * FROM filtered_cohort f', 'created_by': 'system'}\n", + "The total number of patients in the study cohort: 10208\n", + "The first five patients in the young female COVID-19 patient cohort: [{'subject_id': 22344, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 13)}, {'subject_id': 53949, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 2, 28), 'cohort_end_date': datetime.date(2020, 3, 11)}, {'subject_id': 80198, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 5), 'cohort_end_date': datetime.date(2020, 4, 9)}, {'subject_id': 30052, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 6), 'cohort_end_date': datetime.date(2020, 4, 8)}, {'subject_id': 88837, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 2, 24), 'cohort_end_date': datetime.date(2020, 3, 12)}]\n" + ] + } + ], + "source": [ + "study_cohort_def = study_cohort.metadata\n", + "print(f'Young female COVID-19 patient cohort definition: {study_cohort_def}')\n", + "study_cohort_data = study_cohort.data\n", + "print(f'The total number of patients in the study cohort: 
{len(study_cohort_data)}')\n", + "print(f'The first five patients in the young female COVID-19 patient cohort: {study_cohort_data[:5]}')" + ] + }, + { + "cell_type": "markdown", + "id": "0cac81eb-4006-494e-956c-5b4f5015ab20", + "metadata": {}, + "source": [ + "———————————————\n", + "\n", + "**Study cohort deeper exploration**: you can get statistics on age, gender, race, and ethnicity of the study cohort by \n", + "calling `get_stats()` method on the created study cohort object. You can also get cohort distributions on age and gender by \n", + "calling `get_distributions()` method on the created study cohort object." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "8be5061b-cfdf-4dc0-9ef8-f18277ab9fbe", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the user study cohort stats: [{'total_count': 10208, 'earliest_start_date': datetime.date(2020, 1, 18), 'latest_start_date': datetime.date(2020, 3, 30), 'earliest_end_date': datetime.date(2020, 2, 7), 'latest_end_date': datetime.date(2020, 5, 3), 'min_duration_days': 8, 'max_duration_days': 37, 'avg_duration_days': 24.25, 'median_duration': 24, 'stddev_duration': 7.2}]\n", + "the user study cohort age stats: [{'total_count': 10208, 'min_age': 0, 'max_age': 20, 'avg_age': 10.94, 'median_age': 11, 'stddev_age': 5.92}]\n", + "the user study gender stats: [{'gender': 'female', 'gender_count': 10208, 'probability': 1.0}]\n", + "the user study cohort race stats: [{'race': 'Other', 'race_count': 53, 'probability': 0.01}, {'race': 'Asian', 'race_count': 723, 'probability': 0.07}, {'race': 'Black or African American', 'race_count': 866, 'probability': 0.08}, {'race': 'White', 'race_count': 8566, 'probability': 0.84}]\n", + "the user study ethnicity stats: [{'ethnicity': 'other', 'ethnicity_count': 10208, 'probability': 1.0}]\n" + ] + } + ], + "source": [ + "# get stats and distributions of the user study cohort\n", + "study_cohort_stats = 
study_cohort.get_stats()\n", + "print(f'the user study cohort stats: {study_cohort_stats}')\n", + "study_cohort_age_stats = study_cohort.get_stats(\"age\")\n", + "print(f'the user study cohort age stats: {study_cohort_age_stats}')\n", + "study_cohort_gender_stats = study_cohort.get_stats(\"gender\")\n", + "print(f'the user study gender stats: {study_cohort_gender_stats}')\n", + "study_cohort_race_stats = study_cohort.get_stats(\"race\")\n", + "print(f'the user study cohort race stats: {study_cohort_race_stats}')\n", + "study_cohort_ethnicity_stats = study_cohort.get_stats(\"ethnicity\")\n", + "print(f'the user study ethnicity stats: {study_cohort_ethnicity_stats}')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c7ad0b7b-21dc-4572-af21-fe1580361999", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the user study cohort age discrete probability distribution: [{'age_bin': '0-10', 'bin_count': 4744, 'probability': 0.4647}, {'age_bin': '11-20', 'bin_count': 5464, 'probability': 0.5353}, {'age_bin': '21-30', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '31-40', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '41-50', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '51-60', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '61-70', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '71-80', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '81-90', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '91+', 'bin_count': 0, 'probability': 0.0}]\n" + ] + } + ], + "source": [ + "# get discrete probability distribution of the age variable in the baseline cohort\n", + "study_cohort_age_distr = study_cohort.get_distributions('age')\n", + "print(f'the user study cohort age discrete probability distribution: {study_cohort_age_distr}')" + ] + }, + { + "cell_type": "markdown", + "id": "d300e804-69da-4d30-80ad-a5239acba562", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Baseline and 
study cohort comparison\n", + "You can compare the baseline and study cohorts by calling the method `compare_cohorts(id1, id2)` on the `bias` object. Note that currently only Hellinger distances between age and gender distributions of two cohorts are computed as comparison metrics. More comparative metrics will be added in the future." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0d03cf95-3c68-4eee-be41-5482dea68b84", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'age_hellinger_distance': 0.14447523081257604}, {'gender_hellinger_distance': 0.0}]\n" + ] + } + ], + "source": [ + "# compare the baseline and user study cohorts\n", + "result = bias.compare_cohorts(baseline_cohort_def['id'], study_cohort_def['id'])\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "22984b7e-0001-4add-aacb-ecf1252f2b7a", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Final cleanup to ensure database connections are closed" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "128c7b02-faef-4a35-97a6-c92baa5a37dd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection to BiasDatabase closed.\n", + "Connection to the OMOP CDM database closed.\n" + ] + } + ], + "source": [ + "bias.cleanup()" + ] + }, + { + "cell_type": "markdown", + "id": "1eddbdd7", + "metadata": {}, + "source": [ + "### ✅ Summary\n", + "\n", + "In this tutorial, you learned how to connect to an OMOP CDM database, create a baseline and a study cohort, explore each created cohort, and compare two created cohorts using the BiasAnalyzer python package.\n", + "\n", + "For more information, refer to the [BiasAnalyzer GitHub repo](https://github.com/VACLab/BiasAnalyzer) and the [README file](https://github.com/VACLab/BiasAnalyzer/blob/main/README.md).\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 
(biasanalyzer)", + "language": "python", + "name": "biasanalyzer-py3.8" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/BiasAnalyzerTestingCohorts.ipynb b/notebooks/BiasAnalyzerTestingCohorts.ipynb deleted file mode 100644 index 7810c49..0000000 --- a/notebooks/BiasAnalyzerTestingCohorts.ipynb +++ /dev/null @@ -1,577 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "a25ba48a-9e2c-4e1d-9e93-80f7ea3ff3e3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting git+https://github.com/vaclab/BiasAnalyzer.git\n", - " Cloning https://github.com/vaclab/BiasAnalyzer.git to ./temp/pip-req-build-sqm_zvhy\n", - " Running command git clone --filter=blob:none --quiet https://github.com/vaclab/BiasAnalyzer.git /home/hyi/temp/pip-req-build-sqm_zvhy\n", - " Resolved https://github.com/vaclab/BiasAnalyzer.git to commit 8d821839e93b1d9a208c5c66352ee66db60d1e53\n", - " Installing build dependencies ... \u001B[?25ldone\n", - "\u001B[?25h Getting requirements to build wheel ... \u001B[?25ldone\n", - "\u001B[?25h Preparing metadata (pyproject.toml) ... 
\u001B[?25ldone\n", - "\u001B[?25hCollecting duckdb<2.0.0,>=1.1.1 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for duckdb<2.0.0,>=1.1.1 from https://files.pythonhosted.org/packages/50/52/6e6f5b5b07841cec334ca6b98f2e02b7bb54ab3b99c49aa3a161cc0b4b37/duckdb-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached duckdb-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (966 bytes)\n", - "Collecting duckdb-engine<0.14.0,>=0.13.2 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for duckdb-engine<0.14.0,>=0.13.2 from https://files.pythonhosted.org/packages/ef/5d/81a0d67483d0767e4fbf7444b079b3f21574a184b0888782ced1c2172777/duckdb_engine-0.13.6-py3-none-any.whl.metadata\n", - " Using cached duckdb_engine-0.13.6-py3-none-any.whl.metadata (8.0 kB)\n", - "Collecting ipytree<0.3.0,>=0.2.2 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ipytree<0.3.0,>=0.2.2 from https://files.pythonhosted.org/packages/e4/03/35cf1742598d784e96153175233318a2332f71863e55ad1007c9264c1a7a/ipytree-0.2.2-py2.py3-none-any.whl.metadata\n", - " Using cached ipytree-0.2.2-py2.py3-none-any.whl.metadata (849 bytes)\n", - "Collecting ipywidgets<9.0.0,>=8.1.5 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ipywidgets<9.0.0,>=8.1.5 from https://files.pythonhosted.org/packages/22/2d/9c0b76f2f9cc0ebede1b9371b6f317243028ed60b90705863d493bae622e/ipywidgets-8.1.5-py3-none-any.whl.metadata\n", - " Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)\n", - "Collecting jinja2==3.1.5 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for jinja2==3.1.5 from https://files.pythonhosted.org/packages/bd/0f/2ba5fbcd631e3e88689309dbe978c5769e883e4b84ebfe7da30b43275c5a/jinja2-3.1.5-py3-none-any.whl.metadata\n", - " Using cached jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)\n", - "Collecting numpy==1.24.4 (from biasanalyzer==0.1.0)\n", - " 
Obtaining dependency information for numpy==1.24.4 from https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n", - "Collecting pandas==2.0.3 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pandas==2.0.3 from https://files.pythonhosted.org/packages/d0/28/88b81881c056376254618fad622a5e94b5126db8c61157ea1910cd1c040a/pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n", - "Collecting psycopg2<3.0.0,>=2.9.1 (from biasanalyzer==0.1.0)\n", - " Using cached psycopg2-2.9.10-cp311-cp311-linux_x86_64.whl\n", - "Collecting pydantic<3.0.0,>=2.9.2 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pydantic<3.0.0,>=2.9.2 from https://files.pythonhosted.org/packages/f4/3c/8cc1cc84deffa6e25d2d0c688ebb80635dfdbf1dbea3e30c541c8cf4d860/pydantic-2.10.6-py3-none-any.whl.metadata\n", - " Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)\n", - "Collecting pyyaml<7.0.0,>=6.0.2 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pyyaml<7.0.0,>=6.0.2 from https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)\n", - "Collecting scipy==1.10.1 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for scipy==1.10.1 from 
https://files.pythonhosted.org/packages/21/cd/fe2d4af234b80dc08c911ce63fdaee5badcdde3e9bcd9a68884580652ef0/scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n", - "Collecting sqlalchemy<3.0.0,>=2.0.35 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for sqlalchemy<3.0.0,>=2.0.35 from https://files.pythonhosted.org/packages/ff/0a/46f3171f564a19a1daf6e7e0e6c8afc6ecd792f947c6de435519d4d16af3/sqlalchemy-2.0.39-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached sqlalchemy-2.0.39-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)\n", - "Collecting MarkupSafe>=2.0 (from jinja2==3.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for MarkupSafe>=2.0 from https://files.pythonhosted.org/packages/f1/a4/aefb044a2cd8d7334c8a47d3fb2c9f328ac48cb349468cc31c20b539305f/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)\n", - "Collecting python-dateutil>=2.8.2 (from pandas==2.0.3->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for python-dateutil>=2.8.2 from https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata\n", - " Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)\n", - "Collecting pytz>=2020.1 (from pandas==2.0.3->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz-2025.1-py2.py3-none-any.whl.metadata\n", - " Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)\n", - "Collecting 
tzdata>=2022.1 (from pandas==2.0.3->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for tzdata>=2022.1 from https://files.pythonhosted.org/packages/0f/dd/84f10e23edd882c6f968c21c2434fe67bd4a528967067515feca9e611e5e/tzdata-2025.1-py2.py3-none-any.whl.metadata\n", - " Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)\n", - "Collecting packaging>=21 (from duckdb-engine<0.14.0,>=0.13.2->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for packaging>=21 from https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl.metadata\n", - " Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)\n", - "Collecting comm>=0.1.3 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for comm>=0.1.3 from https://files.pythonhosted.org/packages/e6/75/49e5bfe642f71f272236b5b2d2691cf915a7283cc0ceda56357b61daa538/comm-0.2.2-py3-none-any.whl.metadata\n", - " Using cached comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)\n", - "Collecting ipython>=6.1.0 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ipython>=6.1.0 from https://files.pythonhosted.org/packages/20/3a/917cb9e72f4e1a4ea13c862533205ae1319bd664119189ee5cc9e4e95ebf/ipython-9.0.2-py3-none-any.whl.metadata\n", - " Using cached ipython-9.0.2-py3-none-any.whl.metadata (4.3 kB)\n", - "Collecting traitlets>=4.3.1 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for traitlets>=4.3.1 from https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl.metadata\n", - " Using cached traitlets-5.14.3-py3-none-any.whl.metadata (10 kB)\n", - "Collecting widgetsnbextension~=4.0.12 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for widgetsnbextension~=4.0.12 from 
https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl.metadata\n", - " Using cached widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)\n", - "Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for jupyterlab-widgets~=3.0.12 from https://files.pythonhosted.org/packages/a9/93/858e87edc634d628e5d752ba944c2833133a28fa87bb093e6832ced36a3e/jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata\n", - " Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)\n", - "Collecting annotated-types>=0.6.0 (from pydantic<3.0.0,>=2.9.2->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for annotated-types>=0.6.0 from https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl.metadata\n", - " Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n", - "Collecting pydantic-core==2.27.2 (from pydantic<3.0.0,>=2.9.2->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pydantic-core==2.27.2 from https://files.pythonhosted.org/packages/a8/7c/b860618c25678bbd6d1d99dbdfdf0510ccb50790099b963ff78a124b754f/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", - "Collecting typing-extensions>=4.12.2 (from pydantic<3.0.0,>=2.9.2->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for typing-extensions>=4.12.2 from https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl.metadata\n", - " Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)\n", - "Collecting greenlet!=0.4.17 (from 
sqlalchemy<3.0.0,>=2.0.35->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for greenlet!=0.4.17 from https://files.pythonhosted.org/packages/f7/4b/1c9695aa24f808e156c8f4813f685d975ca73c000c2a5056c514c64980f6/greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata\n", - " Using cached greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)\n", - "Collecting decorator (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for decorator from https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl.metadata\n", - " Using cached decorator-5.2.1-py3-none-any.whl.metadata (3.9 kB)\n", - "Collecting ipython-pygments-lexers (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ipython-pygments-lexers from https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl.metadata\n", - " Using cached ipython_pygments_lexers-1.1.1-py3-none-any.whl.metadata (1.1 kB)\n", - "Collecting jedi>=0.16 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for jedi>=0.16 from https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl.metadata\n", - " Using cached jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)\n", - "Collecting matplotlib-inline (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for matplotlib-inline from https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl.metadata\n", - " Using cached 
matplotlib_inline-0.1.7-py3-none-any.whl.metadata (3.9 kB)\n", - "Collecting pexpect>4.3 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pexpect>4.3 from https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl.metadata\n", - " Using cached pexpect-4.9.0-py2.py3-none-any.whl.metadata (2.5 kB)\n", - "Collecting prompt_toolkit<3.1.0,>=3.0.41 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for prompt_toolkit<3.1.0,>=3.0.41 from https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl.metadata\n", - " Using cached prompt_toolkit-3.0.50-py3-none-any.whl.metadata (6.6 kB)\n", - "Collecting pygments>=2.4.0 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pygments>=2.4.0 from https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl.metadata\n", - " Using cached pygments-2.19.1-py3-none-any.whl.metadata (2.5 kB)\n", - "Collecting stack_data (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for stack_data from https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl.metadata\n", - " Using cached stack_data-0.6.3-py3-none-any.whl.metadata (18 kB)\n", - "Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas==2.0.3->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for six>=1.5 from https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl.metadata\n", - " Using cached 
six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)\n", - "Collecting parso<0.9.0,>=0.8.4 (from jedi>=0.16->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for parso<0.9.0,>=0.8.4 from https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl.metadata\n", - " Using cached parso-0.8.4-py2.py3-none-any.whl.metadata (7.7 kB)\n", - "Collecting ptyprocess>=0.5 (from pexpect>4.3->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ptyprocess>=0.5 from https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl.metadata\n", - " Using cached ptyprocess-0.7.0-py2.py3-none-any.whl.metadata (1.3 kB)\n", - "Collecting wcwidth (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for wcwidth from https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl.metadata\n", - " Using cached wcwidth-0.2.13-py2.py3-none-any.whl.metadata (14 kB)\n", - "Collecting executing>=1.2.0 (from stack_data->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for executing>=1.2.0 from https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl.metadata\n", - " Using cached executing-2.2.0-py2.py3-none-any.whl.metadata (8.9 kB)\n", - "Collecting asttokens>=2.1.0 (from stack_data->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for asttokens>=2.1.0 from 
https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl.metadata\n", - " Using cached asttokens-3.0.0-py3-none-any.whl.metadata (4.7 kB)\n", - "Collecting pure-eval (from stack_data->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pure-eval from https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl.metadata\n", - " Using cached pure_eval-0.2.3-py3-none-any.whl.metadata (6.3 kB)\n", - "Using cached jinja2-3.1.5-py3-none-any.whl (134 kB)\n", - "Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n", - "Using cached pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)\n", - "Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.1 MB)\n", - "Using cached duckdb-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.2 MB)\n", - "Using cached duckdb_engine-0.13.6-py3-none-any.whl (48 kB)\n", - "Using cached ipytree-0.2.2-py2.py3-none-any.whl (1.3 MB)\n", - "Using cached ipywidgets-8.1.5-py3-none-any.whl (139 kB)\n", - "Using cached pydantic-2.10.6-py3-none-any.whl (431 kB)\n", - "Using cached pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n", - "Using cached PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (762 kB)\n", - "Using cached sqlalchemy-2.0.39-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)\n", - "Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)\n", - "Using cached comm-0.2.2-py3-none-any.whl (7.2 kB)\n", - "Using cached greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (602 kB)\n", - "Using cached ipython-9.0.2-py3-none-any.whl (600 kB)\n", - "Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl 
(214 kB)\n", - "Using cached MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23 kB)\n", - "Using cached packaging-24.2-py3-none-any.whl (65 kB)\n", - "Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)\n", - "Using cached pytz-2025.1-py2.py3-none-any.whl (507 kB)\n", - "Using cached traitlets-5.14.3-py3-none-any.whl (85 kB)\n", - "Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)\n", - "Using cached tzdata-2025.1-py2.py3-none-any.whl (346 kB)\n", - "Using cached widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)\n", - "Using cached jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)\n", - "Using cached pexpect-4.9.0-py2.py3-none-any.whl (63 kB)\n", - "Using cached prompt_toolkit-3.0.50-py3-none-any.whl (387 kB)\n", - "Using cached pygments-2.19.1-py3-none-any.whl (1.2 MB)\n", - "Using cached six-1.17.0-py2.py3-none-any.whl (11 kB)\n", - "Using cached decorator-5.2.1-py3-none-any.whl (9.2 kB)\n", - "Using cached ipython_pygments_lexers-1.1.1-py3-none-any.whl (8.1 kB)\n", - "Using cached matplotlib_inline-0.1.7-py3-none-any.whl (9.9 kB)\n", - "Using cached stack_data-0.6.3-py3-none-any.whl (24 kB)\n", - "Using cached asttokens-3.0.0-py3-none-any.whl (26 kB)\n", - "Using cached executing-2.2.0-py2.py3-none-any.whl (26 kB)\n", - "Using cached parso-0.8.4-py2.py3-none-any.whl (103 kB)\n", - "Using cached ptyprocess-0.7.0-py2.py3-none-any.whl (13 kB)\n", - "Using cached pure_eval-0.2.3-py3-none-any.whl (11 kB)\n", - "Using cached wcwidth-0.2.13-py2.py3-none-any.whl (34 kB)\n", - "Building wheels for collected packages: biasanalyzer\n", - " Building wheel for biasanalyzer (pyproject.toml) ... 
\u001B[?25ldone\n", - "\u001B[?25h Created wheel for biasanalyzer: filename=biasanalyzer-0.1.0-py3-none-any.whl size=25475 sha256=1982c82749337f81db1a730b8cc25c049d0c0788cd6b782f69ce8be1d92a397c\n", - " Stored in directory: /home/hyi/temp/pip-ephem-wheel-cache-7pwouolk/wheels/25/75/4e/079d96d69cc58148ce31d3d44f858e4db5f689604112dcb7c3\n", - "Successfully built biasanalyzer\n", - "Installing collected packages: wcwidth, pytz, pure-eval, ptyprocess, widgetsnbextension, tzdata, typing-extensions, traitlets, six, pyyaml, pygments, psycopg2, prompt_toolkit, pexpect, parso, packaging, numpy, MarkupSafe, jupyterlab-widgets, greenlet, executing, duckdb, decorator, asttokens, annotated-types, stack_data, sqlalchemy, scipy, python-dateutil, pydantic-core, matplotlib-inline, jinja2, jedi, ipython-pygments-lexers, comm, pydantic, pandas, ipython, duckdb-engine, ipywidgets, ipytree, biasanalyzer\n", - "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "ipympl 0.9.3 requires ipython<9, but you have ipython 9.0.2 which is incompatible.\u001B[0m\u001B[31m\n", - "\u001B[0mSuccessfully installed MarkupSafe-3.0.2 annotated-types-0.7.0 asttokens-3.0.0 biasanalyzer-0.1.0 comm-0.2.2 decorator-5.2.1 duckdb-1.2.1 duckdb-engine-0.13.6 executing-2.2.0 greenlet-3.1.1 ipython-9.0.2 ipython-pygments-lexers-1.1.1 ipytree-0.2.2 ipywidgets-8.1.5 jedi-0.19.2 jinja2-3.1.5 jupyterlab-widgets-3.0.13 matplotlib-inline-0.1.7 numpy-1.24.4 packaging-24.2 pandas-2.0.3 parso-0.8.4 pexpect-4.9.0 prompt_toolkit-3.0.50 psycopg2-2.9.10 ptyprocess-0.7.0 pure-eval-0.2.3 pydantic-2.10.6 pydantic-core-2.27.2 pygments-2.19.1 python-dateutil-2.9.0.post0 pytz-2025.1 pyyaml-6.0.2 scipy-1.10.1 six-1.17.0 sqlalchemy-2.0.39 stack_data-0.6.3 traitlets-5.14.3 typing-extensions-4.12.2 tzdata-2025.1 wcwidth-0.2.13 widgetsnbextension-4.0.13\n", - "Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.11/site-packages (4.12.2)\n" - ] - } - ], - "source": [ - "# Have to specify TMPDIR and target in pip install command to work around the kernel crash issue due to \n", - "# the small ephemeral local storage quota allocated to /tmp which is used by default by pip install\n", - "!TMPDIR=/home/hyi/temp pip install git+https://github.com/vaclab/BiasAnalyzer.git --target /home/hyi/target --upgrade\n", - "!pip install --upgrade typing-extensions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9ce3b87c-0754-4eae-9f85-8210104e2b0b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# append the target folder where HealthDataBias module was installed to PYTHONPATH\n", - "import sys\n", - "sys.path.append('/home/hyi/target')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "729e8803-74f8-4180-aa8b-0e44567f8aeb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from biasanalyzer.api import BIAS" - ] - }, 
- { - "cell_type": "code", - "execution_count": 4, - "id": "548223ed-8948-461e-b9d6-40a0ec7fc89f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "no configuration file specified. Call set_config(config_file_path) next to specify configurations\n" - ] - } - ], - "source": [ - "# create an object of BIAS class\n", - "bias = BIAS()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "7d440d9f-c7fa-4ef1-ad66-31274ebef4ea", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "configuration specified in /home/hyi/bias/config/config.yaml loaded successfully\n" - ] - } - ], - "source": [ - "bias.set_config('/home/hyi/bias/config/config.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "283156f8-63da-42a5-bbd7-ee2b7719652c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connected to the OMOP CDM database (read-only).\n", - "Cohort Definition table created.\n", - "Cohort table created.\n" - ] - } - ], - "source": [ - "# the configuration file includes root_omop_cdm_database configuration info with an example shown \n", - "# in https://github.com/hyi/HealthDataBias/blob/main/tests/assets/config/test_config.yaml\n", - "bias.set_root_omop()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a68f3eaf-92fd-49a2-9768-d685d826fd57", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "template_path: /home/hyi/target/biasanalyzer/sql_templates\n", - "configuration specified in /home/hyi/bias/config/test_cohort_creation_condition_occurrence_config_baseline.yaml loaded successfully\n", - "Cohort definition inserted successfully.\n", - "Cohort Young female patients successfully created.\n", - "cohort created successfully\n", - "young female patient cohort definition: 
{'id': 1, 'name': 'Young female patients', 'description': 'Young female patients', 'created_date': datetime.date(2025, 3, 12), 'creation_info': 'WITH ranked_events AS ( SELECT person_id, condition_concept_id, condition_start_date AS event_start_date, condition_end_date AS event_end_date, ROW_NUMBER() OVER ( PARTITION BY person_id, condition_concept_id ORDER BY condition_start_date ASC ) AS event_instance FROM condition_occurrence ), ranked_visits AS ( SELECT person_id, visit_concept_id, visit_start_date AS event_start_date, visit_end_date AS event_end_date, ROW_NUMBER() OVER ( PARTITION BY person_id, visit_concept_id ORDER BY visit_start_date ASC ) AS event_instance FROM visit_occurrence ), condition_qualifying_events AS ( SELECT person_id, condition_start_date as event_start_date, condition_end_date as event_end_date FROM condition_occurrence ), filtered_cohort AS ( SELECT c.person_id, MIN(c.event_start_date) AS cohort_start_date, MAX(c.event_end_date) AS cohort_end_date FROM condition_qualifying_events c JOIN person p ON c.person_id = p.person_id WHERE 1=1 AND p.gender_concept_id = 8532 AND p.year_of_birth >= 2000 AND p.year_of_birth <= 2020 GROUP BY c.person_id ) SELECT * FROM filtered_cohort f', 'created_by': 'system'}\n", - "The first five patients in the young female patient cohort: [{'subject_id': 8, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': None}, {'subject_id': 13, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2011, 11, 13), 'cohort_end_date': datetime.date(2020, 3, 22)}, {'subject_id': 14, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2015, 4, 20), 'cohort_end_date': datetime.date(2020, 3, 19)}, {'subject_id': 21, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2017, 8, 25), 'cohort_end_date': None}, {'subject_id': 25, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2007, 4, 3), 'cohort_end_date': None}]\n" - ] - } - ], - "source": [ - 
"baseline_cohort = bias.create_cohort('Young female patients', 'Young female patients', '/home/hyi/bias/config/test_cohort_creation_condition_occurrence_config_baseline.yaml', 'system')\n", - "baseline_cohort_def = baseline_cohort.metadata\n", - "print(f'young female patient cohort definition: {baseline_cohort_def}')\n", - "baseline_cohort_data = baseline_cohort.data\n", - "print(f'The first five patients in the young female patient cohort: {baseline_cohort_data[:5]}')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9a52ab5f-57a8-4942-8a03-ec86651e919e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cohort definition inserted successfully.\n", - "Cohort COVID-19 patients successfully created.\n", - "cohort created successfully\n", - "all COVID-19 patient cohort definition: {'id': 2, 'name': 'COVID-19 patients', 'description': 'Patients with COVID-19 condition', 'created_date': datetime.date(2025, 3, 12), 'creation_info': 'SELECT person_id, condition_start_date as cohort_start_date, condition_end_date as cohort_end_date FROM condition_occurrence WHERE condition_concept_id = 37311061', 'created_by': 'system'}\n", - "The first five patients in the COVID-19 patient cohort: [{'subject_id': 20342, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 3)}, {'subject_id': 20343, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 9), 'cohort_end_date': datetime.date(2020, 4, 7)}, {'subject_id': 20344, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 8)}, {'subject_id': 20345, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 2), 'cohort_end_date': datetime.date(2020, 3, 19)}, {'subject_id': 20347, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 5), 'cohort_end_date': datetime.date(2020, 3, 
25)}]\n" - ] - } - ], - "source": [ - "# create a baseline cohort with all COVID-19 patients\n", - "baseline_cohort_query = ('SELECT person_id, condition_start_date as cohort_start_date, '\n", - " 'condition_end_date as cohort_end_date '\n", - " 'FROM condition_occurrence '\n", - " 'WHERE condition_concept_id = 37311061')\n", - "\n", - "baseline_cohort = bias.create_cohort('COVID-19 patients', 'Patients with COVID-19 condition', baseline_cohort_query, 'system')\n", - "baseline_cohort_def = baseline_cohort.metadata\n", - "print(f'all COVID-19 patient cohort definition: {baseline_cohort_def}')\n", - "baseline_cohort_data = baseline_cohort.data\n", - "print(f'The first five patients in the COVID-19 patient cohort: {baseline_cohort_data[:5]}')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "b1415805-b065-40b8-b2cd-6db6f5f9ca41", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "the baseline cohort stats: [{'total_count': 88166, 'earliest_start_date': datetime.date(2020, 1, 14), 'latest_start_date': datetime.date(2020, 3, 31), 'earliest_end_date': datetime.date(2020, 1, 30), 'latest_end_date': datetime.date(2020, 5, 3), 'min_duration_days': 8, 'max_duration_days': 37, 'avg_duration_days': 22.63, 'median_duration': 23, 'stddev_duration': 8.06}]\n", - "the baseline cohort age stats: [{'total_count': 88166, 'min_age': 0, 'max_age': 111, 'avg_age': 41.6, 'median_age': 41, 'stddev_age': 23.71}]\n", - "the baseline cohort gender stats: [{'gender': 'male', 'gender_count': 42961, 'probability': 0.49}, {'gender': 'female', 'gender_count': 45205, 'probability': 0.51}]\n", - "the baseline cohort race stats: [{'race': 'Asian', 'race_count': 6165, 'probability': 0.07}, {'race': 'Other', 'race_count': 511, 'probability': 0.01}, {'race': 'White', 'race_count': 74065, 'probability': 0.84}, {'race': 'Black or African American', 'race_count': 7425, 'probability': 0.08}]\n", - "the baseline cohort 
ethnicity stats: [{'ethnicity': 'other', 'ethnicity_count': 88166, 'probability': 1.0}]\n" - ] - } - ], - "source": [ - "# get stats of the baseline cohort\n", - "cohort_stats = baseline_cohort.get_stats()\n", - "print(f'the baseline cohort stats: {cohort_stats}')\n", - "cohort_age_stats = baseline_cohort.get_stats(\"age\")\n", - "print(f'the baseline cohort age stats: {cohort_age_stats}')\n", - "cohort_gender_stats = baseline_cohort.get_stats(\"gender\")\n", - "print(f'the baseline cohort gender stats: {cohort_gender_stats}')\n", - "cohort_race_stats = baseline_cohort.get_stats(\"race\")\n", - "print(f'the baseline cohort race stats: {cohort_race_stats}')\n", - "cohort_ethnicity_stats = baseline_cohort.get_stats(\"ethnicity\")\n", - "print(f'the baseline cohort ethnicity stats: {cohort_ethnicity_stats}')" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "d54e39da-6f78-4dc1-91ae-a8c26852582a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "the baseline cohort age discrete probability distribution: [{'age_bin': '0-10', 'bin_count': 9231, 'probability': 0.1047}, {'age_bin': '11-20', 'bin_count': 10746, 'probability': 0.1219}, {'age_bin': '21-30', 'bin_count': 12377, 'probability': 0.1404}, {'age_bin': '31-40', 'bin_count': 10896, 'probability': 0.1236}, {'age_bin': '41-50', 'bin_count': 11450, 'probability': 0.1299}, {'age_bin': '51-60', 'bin_count': 13081, 'probability': 0.1484}, {'age_bin': '61-70', 'bin_count': 9985, 'probability': 0.1133}, {'age_bin': '71-80', 'bin_count': 5865, 'probability': 0.0665}, {'age_bin': '81-90', 'bin_count': 2810, 'probability': 0.0319}, {'age_bin': '91+', 'bin_count': 1725, 'probability': 0.0196}]\n" - ] - } - ], - "source": [ - "# get discrete probability distribution of the age variable in the baseline cohort\n", - "cohort_age_distr = baseline_cohort.get_distributions('age')\n", - "print(f'the baseline cohort age discrete probability 
distribution: {cohort_age_distr}')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e3f5ace2-6cc4-4940-a067-e1a3fc14e1ce", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cohort definition inserted successfully.\n", - "Cohort Older COVID-19 patients successfully created.\n", - "cohort created successfully\n", - "Older COVID-19 patient cohort definition: {'id': 3, 'name': 'Older COVID-19 patients', 'description': 'Patients with COVID-19 condition who are older than 65', 'created_date': datetime.date(2025, 3, 12), 'creation_info': 'SELECT c.person_id, c.condition_start_date as cohort_start_date, c.condition_end_date as cohort_end_date FROM condition_occurrence c JOIN person p ON c.person_id = p.person_id WHERE c.condition_concept_id = 37311061 AND p.year_of_birth < 1955', 'created_by': 'system'}\n", - "The first five patients in the older COVID-19 patient cohort: [{'subject_id': 20344, 'cohort_definition_id': 3, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 8)}, {'subject_id': 20352, 'cohort_definition_id': 3, 'cohort_start_date': datetime.date(2020, 3, 5), 'cohort_end_date': datetime.date(2020, 3, 31)}, {'subject_id': 20361, 'cohort_definition_id': 3, 'cohort_start_date': datetime.date(2020, 3, 9), 'cohort_end_date': datetime.date(2020, 4, 2)}, {'subject_id': 20378, 'cohort_definition_id': 3, 'cohort_start_date': datetime.date(2020, 2, 28), 'cohort_end_date': datetime.date(2020, 3, 11)}, {'subject_id': 20381, 'cohort_definition_id': 3, 'cohort_start_date': datetime.date(2020, 3, 12), 'cohort_end_date': datetime.date(2020, 4, 15)}]\n" - ] - } - ], - "source": [ - "# create a user study cohort with all COVID patients above the age of 65\n", - "study_cohort_query = ('SELECT c.person_id, c.condition_start_date as cohort_start_date, '\n", - " 'c.condition_end_date as cohort_end_date '\n", - " 'FROM condition_occurrence c JOIN '\n", - " 
'person p ON c.person_id = p.person_id '\n", - " 'WHERE c.condition_concept_id = 37311061 AND p.year_of_birth < 1955')\n", - "\n", - "study_cohort = bias.create_cohort('Older COVID-19 patients', 'Patients with COVID-19 condition who are older than 65', study_cohort_query, 'system')\n", - "study_cohort_def = study_cohort.metadata\n", - "print(f'Older COVID-19 patient cohort definition: {study_cohort_def}')\n", - "study_cohort_data = study_cohort.data\n", - "print(f'The first five patients in the older COVID-19 patient cohort: {study_cohort_data[:5]}')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8be5061b-cfdf-4dc0-9ef8-f18277ab9fbe", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "the user study cohort stats: [{'total_count': 14786, 'earliest_start_date': datetime.date(2020, 1, 20), 'latest_start_date': datetime.date(2020, 3, 29), 'earliest_end_date': datetime.date(2020, 2, 5), 'latest_end_date': datetime.date(2020, 4, 28), 'min_duration_days': 8, 'max_duration_days': 37, 'avg_duration_days': 22.05, 'median_duration': 22, 'stddev_duration': 8.36}]\n", - "the user study cohort age stats: [{'total_count': 14786, 'min_age': 66, 'max_age': 111, 'avg_age': 77.64, 'median_age': 75, 'stddev_age': 10.4}]\n", - "the user study gender stats: [{'gender': 'male', 'gender_count': 7321, 'probability': 0.5}, {'gender': 'female', 'gender_count': 7465, 'probability': 0.5}]\n", - "the user study cohort race stats: [{'race': 'Other', 'race_count': 115, 'probability': 0.01}, {'race': 'Asian', 'race_count': 992, 'probability': 0.07}, {'race': 'White', 'race_count': 12474, 'probability': 0.84}, {'race': 'Black or African American', 'race_count': 1205, 'probability': 0.08}]\n", - "the user study ethnicity stats: [{'ethnicity': 'other', 'ethnicity_count': 14786, 'probability': 1.0}]\n" - ] - } - ], - "source": [ - "# get stats and distributions of the user study cohort\n", - "study_cohort_stats = 
study_cohort.get_stats()\n", - "print(f'the user study cohort stats: {study_cohort_stats}')\n", - "study_cohort_age_stats = study_cohort.get_stats(\"age\")\n", - "print(f'the user study cohort age stats: {study_cohort_age_stats}')\n", - "study_cohort_gender_stats = study_cohort.get_stats(\"gender\")\n", - "print(f'the user study gender stats: {study_cohort_gender_stats}')\n", - "study_cohort_race_stats = study_cohort.get_stats(\"race\")\n", - "print(f'the user study cohort race stats: {study_cohort_race_stats}')\n", - "study_cohort_ethnicity_stats = study_cohort.get_stats(\"ethnicity\")\n", - "print(f'the user study ethnicity stats: {study_cohort_ethnicity_stats}')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "c7ad0b7b-21dc-4572-af21-fe1580361999", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "the user study cohort age discrete probability distribution: [{'age_bin': '0-10', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '11-20', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '21-30', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '31-40', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '41-50', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '51-60', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '61-70', 'bin_count': 4386, 'probability': 0.2966}, {'age_bin': '71-80', 'bin_count': 5865, 'probability': 0.3967}, {'age_bin': '81-90', 'bin_count': 2810, 'probability': 0.19}, {'age_bin': '91+', 'bin_count': 1725, 'probability': 0.1167}]\n" - ] - } - ], - "source": [ - "# get discrete probability distribution of the age variable in the baseline cohort\n", - "study_cohort_age_distr = study_cohort.get_distributions('age')\n", - "print(f'the user study cohort age discrete probability distribution: {study_cohort_age_distr}')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "0d03cf95-3c68-4eee-be41-5482dea68b84", - "metadata": { - "tags": [] - }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'age_hellinger_distance': 0.728150848822386}, {'gender_hellinger_distance': 0.5328876752208462}]\n" - ] - } - ], - "source": [ - "# compare the baseline and user study cohorts\n", - "result = bias.compare_cohorts(1, 2)\n", - "print(result)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "128c7b02-faef-4a35-97a6-c92baa5a37dd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connection to BiasDatabase closed.\n", - "Connection to the OMOP CDM database closed.\n" - ] - } - ], - "source": [ - "bias.cleanup()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e2bf375-b4fb-4c50-aab9-fff4c1a02a95", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 8b692e3625b628b20e75ed5d966258ff86d8701e Mon Sep 17 00:00:00 2001 From: hyi Date: Tue, 17 Jun 2025 18:22:39 -0400 Subject: [PATCH 03/10] updated async cohort creation developer-testing jupyter notebook to user-friendly tutorial --- .../BiasAnalyzerAsyncCohortsTutorial.ipynb | 431 ++++++++++++++++++ notebooks/BiasAnalyzerCohortsTutorial.ipynb | 9 +- ...asAnalyzerTestingAsyncCohortCreation.ipynb | 352 -------------- 3 files changed, 436 insertions(+), 356 deletions(-) create mode 100644 notebooks/BiasAnalyzerAsyncCohortsTutorial.ipynb delete mode 100644 notebooks/BiasAnalyzerTestingAsyncCohortCreation.ipynb diff --git a/notebooks/BiasAnalyzerAsyncCohortsTutorial.ipynb b/notebooks/BiasAnalyzerAsyncCohortsTutorial.ipynb new 
file mode 100644 index 0000000..7e25e9b --- /dev/null +++ b/notebooks/BiasAnalyzerAsyncCohortsTutorial.ipynb @@ -0,0 +1,431 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fdc0d263", + "metadata": {}, + "source": [ + "# Using BiasAnalyzer for Asynchronous Cohort Creation and Exploration\n", + "\n", + "This tutorial demonstrates how to use the `BiasAnalyzer` package to create multiple cohorts asynchronously for exploration, which can improve performance and responsiveness when working with large datasets or complex cohort definitions. It complements the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb), following a similar workflow but optimized for performance by introducing asynchronous processing.\n", + "\n", + "---\n", + "\n", + "### Overview\n", + "\n", + "**Objective**: \n", + "Show how to define and create multiple cohorts using asynchronous execution to improve responsiveness and performance when working with large or complex datasets.\n", + "\n", + "**Before You Begin**: \n", + "The `BiasAnalyzer` package is currently in active development and has not yet been officially released on PyPI.\n", + "You can install it in one of the two ways:\n", + "\n", + "- **Install from GitHub (recommended during development)**:\n", + "```bash\n", + "pip install git+https://github.com/vaclab/BiasAnalyzer.git\n", + "```\n", + "- **Install from PyPI (once the package is officially released)**:\n", + "```bash\n", + "pip install biasanalyzer\n", + "```\n", + "\n", + "For full setup and usage instructions, refer to the [README](https://github.com/VACLab/BiasAnalyzer/blob/main/README.md).\n", + "\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "bb028875", + "metadata": {}, + "source": [ + "### Preparation for asynchronous cohort creation\n", + "**Preparation step 1**: Import the `BIAS` class from the `api` module of the `BiasAnalyzer` package, create an object `bias` of the `BIAS` class, specify OMOP CDM database configurations on the
`bias` object, and set OMOP CDM database to enable connection to the database. Refer to the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6dc76f46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "configuration specified in ../config.yaml loaded successfully\n", + "Connected to the OMOP CDM database (read-only).\n", + "Cohort Definition table created.\n", + "Cohort table created.\n" + ] + } + ], + "source": [ + "from biasanalyzer.api import BIAS\n", + "\n", + "bias = BIAS()\n", + "\n", + "bias.set_config('../config.yaml')\n", + "\n", + "bias.set_root_omop()" + ] + }, + { + "cell_type": "markdown", + "id": "8731e481", + "metadata": {}, + "source": [ + "**Preparation step 2**: Import `BackgroundResult` class and the `run_in_background` function from the `background.threading_utils` module of the `BiasAnalyzer` package to support asynchronous cohort creation." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "31cac333", + "metadata": {}, + "outputs": [], + "source": [ + "from biasanalyzer.background.threading_utils import BackgroundResult, run_in_background" + ] + }, + { + "cell_type": "markdown", + "id": "22edda35", + "metadata": {}, + "source": [ + "**Now that you have connected to your OMOP CDM database and imported the necessary utilities for asynchronous processing, you are ready to create cohorts asynchronously using the `BiasAnalyzer` APIs.** This rest of this notebook illustrates how to create both a baseline and a study cohort asynchronously, and explore and compare them once they are ready. 
With asynchronous execution, you don't need to wait for cohort creation to finish - you can continue running the subsequent cells and explore the data as it becomes available.\n", + "\n", + "---\n", + "\n", + "### Asynchronous cohort creation\n", + "**Baseline cohort creation**: To create a baseline cohort of young female patients asynchronously, use the `run_in_background()` function on the `bias` object to run `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function in a background thread. You'll pass the target function as the first argument, the cohort creation target function input arguments as the next four arguments, a `BackgroundResult` object via the `result_holder` optional parameter to store the created baseline cohort result, and a `delay` value (e.g., 120 seconds) to simulate asynchronous execution of long-running process for testing purposes. The created baseline cohort will be identical to the one created in the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb), except that the cohort creation now runs asychronously in a background thread." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c9c9c7c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[*] Background task started...\n", + "Baseline cohort creation running in background...\n", + "template_path: /home/hongyi/BiasAnalyzer/biasanalyzer/sql_templates\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9975fb06d4994afa80e7bc7aef956450", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Cohort creation: 0%| | 0/3 [00:00= 2000 AND p.year_of_birth <= 2020 GROUP BY c.person_id ) SELECT * FROM filtered_cohort f', 'created_by': 'system'}\n", + "Baseline cohort created with stats: [{'total_count': 12360, 'earliest_start_date': datetime.date(2000, 2, 19), 'latest_start_date': datetime.date(2020, 5, 26), 'earliest_end_date': datetime.date(2002, 7, 20), 'latest_end_date': datetime.date(2020, 5, 27), 'min_duration_days': 0, 'max_duration_days': 7379, 'avg_duration_days': 1192.32, 'median_duration': 296, 'stddev_duration': 1779.19}]\n" + ] + } + ], + "source": [ + "if baseline_result.ready:\n", + " if baseline_result.error:\n", + " print(f\"Baseline cohort creation failed: {baseline_result.error}\")\n", + " else:\n", + " baseline_cohort = baseline_result.value\n", + " baseline_cohort_def = baseline_cohort.metadata\n", + " print(f\"Baseline cohort created with metadata: {baseline_cohort_def}\")\n", + " baseline_cohort_data = baseline_cohort.data\n", + " baseline_cohort_stats = baseline_cohort.get_stats()\n", + " print(f\"Baseline cohort created with stats: {baseline_cohort_stats}\")\n", + "else:\n", + " print(\"Still creating baseline cohort...\")" + ] + }, + { + "cell_type": "markdown", + "id": "e06df5e3-6cb9-4bbb-842c-c8e987657edb", + "metadata": {}, + "source": [ + "———————————————\n", + "\n", + "**Exploring the study cohort**: To explore the study cohort once it's available, check the `ready` property of the `study_result` - the 
`BackgroundResult` object provided as the `result_holder` during asynchronous cohort creation. If the result is ready, verify whether the background process completed successfully by checking the `error` property of the `study_result`. If no error occurred, you can retrieve the created study cohort object and explore it, just as demonstrated in the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b1415805-b065-40b8-b2cd-6db6f5f9ca41", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Study cohort created with metadata: {'id': 2, 'name': 'Young COVID female patients', 'description': 'Young COVID female patients', 'created_date': datetime.date(2025, 6, 17), 'creation_info': 'WITH ranked_events_condition_occurrence AS ( SELECT person_id, condition_concept_id AS concept_id, condition_start_date AS event_start_date, condition_end_date AS event_end_date, ROW_NUMBER() OVER ( PARTITION BY person_id, condition_concept_id ORDER BY condition_start_date ASC ) AS event_instance FROM condition_occurrence ), domain_qualifying_events AS ( (SELECT person_id, event_start_date, event_end_date FROM ranked_events_condition_occurrence WHERE concept_id = 37311061) ), filtered_cohort AS ( SELECT c.person_id, MIN(c.event_start_date) AS cohort_start_date, MAX(c.event_end_date) AS cohort_end_date FROM domain_qualifying_events c JOIN person p ON c.person_id = p.person_id WHERE 1=1 AND p.gender_concept_id = 8532 AND p.year_of_birth >= 2000 AND p.year_of_birth <= 2020 GROUP BY c.person_id ) SELECT * FROM filtered_cohort f', 'created_by': 'system'}\n", + "Study cohort created with stats: [{'total_count': 10208, 'earliest_start_date': datetime.date(2020, 1, 18), 'latest_start_date': datetime.date(2020, 3, 30), 'earliest_end_date': datetime.date(2020, 2, 7), 'latest_end_date': datetime.date(2020, 5, 3), 'min_duration_days': 8, 
'max_duration_days': 37, 'avg_duration_days': 24.25, 'median_duration': 24, 'stddev_duration': 7.2}]\n" + ] + } + ], + "source": [ + "if study_result.ready:\n", + " if study_result.error:\n", + " print(f\"Study cohort creation failed: {study_result.error}\")\n", + " else:\n", + " study_cohort = study_result.value\n", + " study_cohort_def = study_cohort.metadata\n", + " print(f\"Study cohort created with metadata: {study_cohort_def}\")\n", + " study_cohort_data = study_cohort.data\n", + " study_cohort_stats = study_cohort.get_stats()\n", + " print(f\"Study cohort created with stats: {study_cohort_stats}\")\n", + "else:\n", + " print(\"Still creating study cohort...\")" + ] + }, + { + "cell_type": "markdown", + "id": "56de0456-104f-4d4b-9f8e-1a65a07a6a2e", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Cohort comparison when available\n", + "To compare the baseline and study cohorts once they are available, check the `ready` property of both `baseline_result` and `study_result` - the `BackgroundResult` objects passed as `result_holder` during asynchronous cohort creation. If both results are ready, you can retrieve and compare the cohorts using the same approach demonstrated in the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0d03cf95-3c68-4eee-be41-5482dea68b84", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "first 5 patient in baseline cohort data: [{'subject_id': 42583, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 4, 26), 'cohort_end_date': datetime.date(2020, 5, 12)}, {'subject_id': 33685, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2017, 12, 8), 'cohort_end_date': datetime.date(2020, 5, 10)}, {'subject_id': 74383, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2019, 1, 31), 'cohort_end_date': datetime.date(2020, 3, 25)}, {'subject_id': 23986, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2019, 6, 15), 'cohort_end_date': datetime.date(2020, 3, 28)}, {'subject_id': 93962, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2019, 7, 1), 'cohort_end_date': datetime.date(2020, 5, 15)}]\n", + "first 5 patient in study cohort data: [{'subject_id': 22344, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 13)}, {'subject_id': 53949, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 2, 28), 'cohort_end_date': datetime.date(2020, 3, 11)}, {'subject_id': 80198, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 5), 'cohort_end_date': datetime.date(2020, 4, 9)}, {'subject_id': 30052, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 3, 6), 'cohort_end_date': datetime.date(2020, 4, 8)}, {'subject_id': 88837, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 2, 24), 'cohort_end_date': datetime.date(2020, 3, 12)}]\n", + "the baseline cohort age stats: [{'total_count': 12360, 'min_age': 0, 'max_age': 25, 'avg_age': 7.24, 'median_age': 6, 'stddev_age': 6.01}]\n", + "the baseline cohort gender stats: [{'gender': 'female', 'gender_count': 12360, 
'probability': 1.0}]\n", + "the study cohort age stats: [{'total_count': 10208, 'min_age': 0, 'max_age': 20, 'avg_age': 10.94, 'median_age': 11, 'stddev_age': 5.92}]\n", + "the study cohort gender stats: [{'gender': 'female', 'gender_count': 10208, 'probability': 1.0}]\n", + "[{'age_hellinger_distance': 0.14447523081257604}, {'gender_hellinger_distance': 0.0}]\n" + ] + } + ], + "source": [ + "# compare the baseline and user study cohorts\n", + "if baseline_result.ready and study_result.ready:\n", + " print(f\"first 5 patient in baseline cohort data: {baseline_cohort_data[:5]}\")\n", + " print(f\"first 5 patient in study cohort data: {study_cohort_data[:5]}\")\n", + " baseline_cohort_age_stats = baseline_cohort.get_stats(\"age\")\n", + " print(f'the baseline cohort age stats: {baseline_cohort_age_stats}')\n", + " baseline_cohort_gender_stats = baseline_cohort.get_stats(\"gender\")\n", + " print(f'the baseline cohort gender stats: {baseline_cohort_gender_stats}')\n", + " study_cohort_age_stats = study_cohort.get_stats(\"age\")\n", + " print(f'the study cohort age stats: {study_cohort_age_stats}')\n", + " study_cohort_gender_stats = study_cohort.get_stats(\"gender\")\n", + " print(f'the study cohort gender stats: {study_cohort_gender_stats}')\n", + " result = bias.compare_cohorts(baseline_cohort_def['id'], study_cohort_def['id'])\n", + " print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "d8e53808-cac2-41c7-9d60-f7a3b661ff6f", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Final cleanup to ensure database connections are closed" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "128c7b02-faef-4a35-97a6-c92baa5a37dd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection to BiasDatabase closed.\n", + "Connection to the OMOP CDM database closed.\n" + ] + } + ], + "source": [ + "bias.cleanup()" + ] + }, + { + "cell_type": "markdown", + "id": "e3ea28f8", + "metadata": {}, + 
"source": [ + "### ✅ Summary\n", + "\n", + "In this tutorial, you learned how to use the BiasAnalyzer package to create a baseline and a study cohort asynchronously for improved performance and responsiveness when working with large datasets or complex cohort definitions. For testing purposes, a `delay` optional parameter is introduced in the `run_in_background()` function to simulate asynchronous execution of long-running process. This tutorial complements the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb), following a similar workflow but optimized for performance by introducing asynchronous processing.\n", + " \n", + "For more information, refer to the [BiasAnalyzer GitHub repo](https://github.com/VACLab/BiasAnalyzer) and the [README file](https://github.com/VACLab/BiasAnalyzer/blob/main/README.md).\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (biasanalyzer)", + "language": "python", + "name": "biasanalyzer-py3.8" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/BiasAnalyzerCohortsTutorial.ipynb b/notebooks/BiasAnalyzerCohortsTutorial.ipynb index 8957503..d196ccf 100644 --- a/notebooks/BiasAnalyzerCohortsTutorial.ipynb +++ b/notebooks/BiasAnalyzerCohortsTutorial.ipynb @@ -28,6 +28,7 @@ "```bash\n", "pip install biasanalyzer\n", "```\n", + "\n", "For full setup and usage instructions, refer to the [README](https://github.com/VACLab/BiasAnalyzer/blob/main/README.md).\n", "\n", "---\n" @@ -140,12 +141,12 @@ "id": "c7219629-1a30-44af-9ec5-5eb9b4a52c5a", "metadata": {}, "source": [ - "---\n", + "**Now that you have connected to your OMOP CDM database, you can start to use the APIs to explore your data.** The rest of this notebook illustrates 
how to create and explore a baseline and a study cohort, and then compare them using the BiasAnalyzer APIs.\n", "\n", - "**Now that you have connected to your OMOP CDM database, you can start to use the APIs to explore your data. The rest of this notebook illustrates how to create and explore a baseline and a study cohort, and then compare them using the BiasAnalyzer APIs.**\n", + "---\n", "\n", "### Baseline cohort creation and exploration\n", - "**Baseline cohort creation**: Create a baseline cohort of young female patients on the `bias` object by calling the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function and passing the name of the cohort (first argument), the description of the cohort (second argument), a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query (third argument), and the cohort owner's name indicating who owns or creates this cohort (fourth argument). The function will show a progress bar to indicate cohort creation progress over three stages." + "**Baseline cohort creation**: To create a baseline cohort of young female patients, use the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function on the `bias` object. You'll pass the name of the cohort as the first argument, the description of the cohort as the second argument, a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query as the third argument, and the cohort owner's name indicating who owns or creates this cohort as the fourth argument. The function will show a progress bar to indicate cohort creation progress over three stages." 
] }, { @@ -303,7 +304,7 @@ "---\n", "\n", "### Study cohort creation and exploration\n", - "**Study cohort creation**: Create a study cohort of young female COVID patients on the bias object by calling the create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by) function and passing the name of the cohort (first argument), the description of the cohort (second argument), a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query (third argument), and the cohort owner's name indicating who owns or creates this cohort (fourth argument). The function will show a progress bar to indicate cohort creation progress over three stages." + "**Study cohort creation**: To create a study cohort of young female COVID patients, use the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function on the `bias` object. You'll pass the name of the cohort as the first argument, the description of the cohort as the second argument, a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query as the third argument, and the cohort owner's name indicating who owns or creates this cohort as the fourth argument. 
The function will show a progress bar to indicate cohort creation progress over three stages.\n" ] }, { diff --git a/notebooks/BiasAnalyzerTestingAsyncCohortCreation.ipynb b/notebooks/BiasAnalyzerTestingAsyncCohortCreation.ipynb deleted file mode 100644 index 763bbc1..0000000 --- a/notebooks/BiasAnalyzerTestingAsyncCohortCreation.ipynb +++ /dev/null @@ -1,352 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "10a97b84-7514-4bba-aaf2-0a46a44cc5fd", - "metadata": {}, - "outputs": [], - "source": [ - "from biasanalyzer.api import BIAS\n", - "from biasanalyzer.background.threading_utils import BackgroundResult, run_in_background" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "548223ed-8948-461e-b9d6-40a0ec7fc89f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "no configuration file specified. Call set_config(config_file_path) next to specify configurations\n" - ] - } - ], - "source": [ - "# create an object of BIAS class\n", - "bias = BIAS()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "7d440d9f-c7fa-4ef1-ad66-31274ebef4ea", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "configuration specified in /home/hongyi/BiasAnalyzer/config.yaml loaded successfully\n" - ] - } - ], - "source": [ - "bias.set_config('/home/hongyi/BiasAnalyzer/config.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "283156f8-63da-42a5-bbd7-ee2b7719652c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connected to the OMOP CDM database (read-only).\n", - "Cohort Definition table created.\n", - "Cohort table created.\n" - ] - } - ], - "source": [ - "# the configuration file includes root_omop_cdm_database configuration info with an example shown \n", - "# in 
https://github.com/hyi/HealthDataBias/blob/main/tests/assets/config/test_config.yaml\n", - "bias.set_root_omop()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "a68f3eaf-92fd-49a2-9768-d685d826fd57", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[*] Background task started...\n", - "Baseline cohort creation running in background...\n", - "template_path: /home/hongyi/BiasAnalyzer/biasanalyzer/sql_templates\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b6db954d48fe41ab9e53ff6b6e358fcd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Cohort creation: 0%| | 0/3 [00:00= 2000 AND p.year_of_birth <= 2020 GROUP BY c.person_id ) SELECT * FROM filtered_cohort f', 'created_by': 'system'}\n", - "Baseline cohort created with stats: [{'total_count': 12360, 'earliest_start_date': datetime.date(2000, 2, 19), 'latest_start_date': datetime.date(2020, 5, 26), 'earliest_end_date': datetime.date(2002, 7, 20), 'latest_end_date': datetime.date(2020, 5, 27), 'min_duration_days': 0, 'max_duration_days': 7379, 'avg_duration_days': 1192.32, 'median_duration': 296, 'stddev_duration': 1779.19}]\n" - ] - } - ], - "source": [ - "if baseline_result.ready:\n", - " if baseline_result.error:\n", - " print(f\"Baseline cohort creation failed: {baseline_result.error}\")\n", - " else:\n", - " baseline_cohort = baseline_result.value\n", - " baseline_cohort_def = baseline_cohort.metadata\n", - " print(f\"Baseline cohort created with metadata: {baseline_cohort_def}\")\n", - " baseline_cohort_data = baseline_cohort.data\n", - " baseline_cohort_stats = baseline_cohort.get_stats()\n", - " print(f\"Baseline cohort created with stats: {baseline_cohort_stats}\")\n", - "else:\n", - " print(\"Still creating baseline cohort...\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "b1415805-b065-40b8-b2cd-6db6f5f9ca41", - "metadata": { - 
"tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Study cohort created with metadata: {'id': 1, 'name': 'Young COVID female patients', 'description': 'Young COVID female patients', 'created_date': datetime.date(2025, 6, 4), 'creation_info': 'WITH ranked_events_condition_occurrence AS ( SELECT person_id, condition_concept_id AS concept_id, condition_start_date AS event_start_date, condition_end_date AS event_end_date, ROW_NUMBER() OVER ( PARTITION BY person_id, condition_concept_id ORDER BY condition_start_date ASC ) AS event_instance FROM condition_occurrence ), domain_qualifying_events AS ( (SELECT person_id, event_start_date, event_end_date FROM ranked_events_condition_occurrence WHERE concept_id = 37311061) ), filtered_cohort AS ( SELECT c.person_id, MIN(c.event_start_date) AS cohort_start_date, MAX(c.event_end_date) AS cohort_end_date FROM domain_qualifying_events c JOIN person p ON c.person_id = p.person_id WHERE 1=1 AND p.gender_concept_id = 8532 AND p.year_of_birth >= 2000 AND p.year_of_birth <= 2020 GROUP BY c.person_id ) SELECT * FROM filtered_cohort f', 'created_by': 'system'}\n", - "Study cohort created with stats: [{'total_count': 10208, 'earliest_start_date': datetime.date(2020, 1, 18), 'latest_start_date': datetime.date(2020, 3, 30), 'earliest_end_date': datetime.date(2020, 2, 7), 'latest_end_date': datetime.date(2020, 5, 3), 'min_duration_days': 8, 'max_duration_days': 37, 'avg_duration_days': 24.25, 'median_duration': 24, 'stddev_duration': 7.2}]\n" - ] - } - ], - "source": [ - "if study_result.ready:\n", - " if study_result.error:\n", - " print(f\"Study cohort creation failed: {study_result.error}\")\n", - " else:\n", - " study_cohort = study_result.value\n", - " study_cohort_def = study_cohort.metadata\n", - " print(f\"Study cohort created with metadata: {study_cohort_def}\")\n", - " study_cohort_data = study_cohort.data\n", - " study_cohort_stats = study_cohort.get_stats()\n", - " print(f\"Study 
cohort created with stats: {study_cohort_stats}\")\n", - "else:\n", - " print(\"Still creating study cohort...\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "0d03cf95-3c68-4eee-be41-5482dea68b84", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "first 5 patient in baseline cohort data: [{'subject_id': 42583, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2020, 4, 26), 'cohort_end_date': datetime.date(2020, 5, 12)}, {'subject_id': 33685, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2017, 12, 8), 'cohort_end_date': datetime.date(2020, 5, 10)}, {'subject_id': 74383, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2019, 1, 31), 'cohort_end_date': datetime.date(2020, 3, 25)}, {'subject_id': 23986, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2019, 6, 15), 'cohort_end_date': datetime.date(2020, 3, 28)}, {'subject_id': 93962, 'cohort_definition_id': 2, 'cohort_start_date': datetime.date(2019, 7, 1), 'cohort_end_date': datetime.date(2020, 5, 15)}]\n", - "first 5 patient in study cohort data: [{'subject_id': 53949, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 2, 28), 'cohort_end_date': datetime.date(2020, 3, 11)}, {'subject_id': 22344, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 13)}, {'subject_id': 80198, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 5), 'cohort_end_date': datetime.date(2020, 4, 9)}, {'subject_id': 30052, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 6), 'cohort_end_date': datetime.date(2020, 4, 8)}, {'subject_id': 94887, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 2, 29), 'cohort_end_date': datetime.date(2020, 3, 24)}]\n", - "the baseline cohort age stats: [{'total_count': 12360, 'min_age': 0, 'max_age': 25, 'avg_age': 10.71, 
'median_age': 11, 'stddev_age': 5.98}]\n", - "the baseline cohort gender stats: [{'gender': 'female', 'gender_count': 12360, 'probability': 1.0}]\n", - "the study cohort age stats: [{'total_count': 10208, 'min_age': 0, 'max_age': 24, 'avg_age': 10.94, 'median_age': 11, 'stddev_age': 5.93}]\n", - "the study cohort gender stats: [{'gender': 'female', 'gender_count': 10208, 'probability': 1.0}]\n", - "[{'age_hellinger_distance': 0.010623813022853212}, {'gender_hellinger_distance': 0.0}]\n" - ] - } - ], - "source": [ - "# compare the baseline and user study cohorts\n", - "if baseline_result.ready and study_result.ready:\n", - " print(f\"first 5 patient in baseline cohort data: {baseline_cohort_data[:5]}\")\n", - " print(f\"first 5 patient in study cohort data: {study_cohort_data[:5]}\")\n", - " baseline_cohort_age_stats = baseline_cohort.get_stats(\"age\")\n", - " print(f'the baseline cohort age stats: {baseline_cohort_age_stats}')\n", - " baseline_cohort_gender_stats = baseline_cohort.get_stats(\"gender\")\n", - " print(f'the baseline cohort gender stats: {baseline_cohort_gender_stats}')\n", - " study_cohort_age_stats = study_cohort.get_stats(\"age\")\n", - " print(f'the study cohort age stats: {study_cohort_age_stats}')\n", - " study_cohort_gender_stats = study_cohort.get_stats(\"gender\")\n", - " print(f'the study cohort gender stats: {study_cohort_gender_stats}')\n", - " result = bias.compare_cohorts(1, 2)\n", - " print(result)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "128c7b02-faef-4a35-97a6-c92baa5a37dd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connection to BiasDatabase closed.\n", - "Connection to the OMOP CDM database closed.\n" - ] - } - ], - "source": [ - "bias.cleanup()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a6c2d12-91b6-4074-8565-6ff2f61f2f00", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { 
- "display_name": "Python (biasanalyzer)", - "language": "python", - "name": "biasanalyzer-py3.8" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 440372f7f0a3579e442433b151fe6fde967b8822 Mon Sep 17 00:00:00 2001 From: hyi Date: Thu, 19 Jun 2025 17:24:36 -0400 Subject: [PATCH 04/10] updated code to compute cohort concept prevalence for any domains --- biasanalyzer/cohort.py | 4 +- biasanalyzer/cohort_query_builder.py | 57 ++++++-- biasanalyzer/database.py | 38 +++--- biasanalyzer/models.py | 7 + biasanalyzer/sql.py | 124 ------------------ .../test_hierarchical_prevalence.py | 24 +++- tests/test_database.py | 8 +- 7 files changed, 94 insertions(+), 168 deletions(-) diff --git a/biasanalyzer/cohort.py b/biasanalyzer/cohort.py index 6ca1d75..83aa2e8 100644 --- a/biasanalyzer/cohort.py +++ b/biasanalyzer/cohort.py @@ -18,6 +18,7 @@ def __init__(self, cohort_id: int, bias_db: BiasDatabase, omop_db: OMOPCDMDataba self.omop_db = omop_db self._cohort_data = None # cache the cohort data self._metadata = None + self.query_builder = CohortQueryBuilder(cohort_creation=False) @property def data(self): @@ -55,6 +56,7 @@ def get_concept_stats(self, concept_type='condition_occurrence', filter_count=0, Get cohort concept statistics such as concept prevalence """ cohort_stats = self.bias_db.get_cohort_concept_stats(self.cohort_id, + self.query_builder, concept_type=concept_type, filter_count=filter_count, vocab=vocab, @@ -106,7 +108,7 @@ def create_cohort(self, cohort_name: str, description: str, query_or_yaml_file: notify_users(f'cohort creation configuration yaml file is not valid with validation error: {ex}') return None - query = self._query_builder.build_query(cohort_config) + query = 
self._query_builder.build_query_cohort_creation(cohort_config) else: query = clean_string(query_or_yaml_file) progress.update(1) diff --git a/biasanalyzer/cohort_query_builder.py b/biasanalyzer/cohort_query_builder.py index 860970a..79fda66 100644 --- a/biasanalyzer/cohort_query_builder.py +++ b/biasanalyzer/cohort_query_builder.py @@ -6,7 +6,7 @@ class CohortQueryBuilder: - def __init__(self): + def __init__(self, cohort_creation=True): """Get the path to SQL templates, whether running from source or installed.""" try: if sys.version_info >= (3, 9): # pragma: no cover @@ -19,12 +19,13 @@ def __init__(self): except ModuleNotFoundError: # pragma: no cover template_path = os.path.join(os.path.dirname(__file__), "sql_templates") - print(f'template_path: {template_path}') + print(f'template_path: {template_path}, cohort_creation: {cohort_creation}') self.env = Environment(loader=FileSystemLoader(template_path), extensions=['jinja2.ext.do']) - self.env.globals.update( - demographics_filter=self._load_macro('demographics_filter'), - temporal_event_filter=self.temporal_event_filter - ) + if cohort_creation: + self.env.globals.update( + demographics_filter=self._load_macro('demographics_filter'), + temporal_event_filter=self.temporal_event_filter + ) def _extract_domains(self, events): domains = set() @@ -42,16 +43,11 @@ def _load_macro(self, macro_name): macros_template = self.env.get_template('macros.sql.j2') return macros_template.module.__dict__[macro_name] - - def build_query(self, cohort_config: dict) -> str: + def build_query_cohort_creation(self, cohort_config: dict) -> str: """ Build a SQL query from the CohortCreationConfig object. - - Args: - cohort_config: dict object loaded from yaml file for building sql query. - - Returns: - str: The rendered SQL query. + :param cohort_config: dict object loaded from yaml file for building sql query. + :return: The rendered SQL query. 
""" inclusion_criteria = cohort_config.get('inclusion_criteria') exclusion_criteria = cohort_config.get('exclusion_criteria', {}) @@ -75,6 +71,39 @@ def build_query(self, cohort_config: dict) -> str: temporal_events=temporal_events ) + def build_concept_prevalence_query(self, concept_type: str, cid: int, filter_count: int, vocab: str, + include_hierarchy: bool) -> str: + """ + Build a SQL query for concept prevalence statistics for a given domain and cohort. + :param concept_type: Domain from DOMAIN_MAPPING (e.g., 'condition_occurrence'). + :param cid: Cohort definition ID. + :param filter_count: Minimum count threshold for concepts with 0 meaning no filtering + :param vocab: Vocabulary ID. Defaults to domain-specific vocabulary as defined in DOMAIN_MAPPING if set to None + :param include_hierarchy: Include concept hierarchy in results or not + :return: The rendered SQL query + :raises ValueError if concept_type is not invalid + """ + + # Validate concept_type + if concept_type not in DOMAIN_MAPPING or DOMAIN_MAPPING[concept_type]["table"] is None: + valid_domains = [k for k in DOMAIN_MAPPING.keys() if DOMAIN_MAPPING[k]["table"] is not None] + raise ValueError(f"Invalid concept_type: {concept_type}. Must be one of {valid_domains}") + + # The provided vocab is assumed to be already validated if it is not set to None. 
Otherwise, + # if set to None, use domain-specific default vocabulary + effective_vocab = vocab if vocab is not None else DOMAIN_MAPPING[concept_type]["default_vocab"] + # Load and render the template + template = self.env.get_template("cohort_concept_prevalence_query.sql.j2") + return template.render( + table_name=DOMAIN_MAPPING[concept_type]["table"], + concept_id_column=DOMAIN_MAPPING[concept_type]["concept_id"], + start_date_column=DOMAIN_MAPPING[concept_type]["start_date"], + cid=cid, + filter_count=filter_count, + vocab=effective_vocab, + include_hierarchy=include_hierarchy + ) + @staticmethod def render_event(event): """ diff --git a/biasanalyzer/database.py b/biasanalyzer/database.py index af951bb..0c9d21f 100644 --- a/biasanalyzer/database.py +++ b/biasanalyzer/database.py @@ -6,7 +6,7 @@ from sqlalchemy.orm import sessionmaker from sqlalchemy.exc import SQLAlchemyError from sqlalchemy import create_engine, text -from biasanalyzer.models import Cohort, CohortDefinition +from biasanalyzer.models import CohortDefinition from biasanalyzer.sql import * from biasanalyzer.utils import build_concept_hierarchy, print_hierarchy, find_roots, notify_users @@ -22,16 +22,6 @@ class BiasDatabase: "race": RACE_STATS_QUERY, "ethnicity": ETHNICITY_STATS_QUERY } - cohort_concept_queries = { - 'condition_occurrence': { - 'query': COHORT_CONCEPT_CONDITION_PREVALENCE_QUERY, - 'default_vocab': 'SNOMED' - }, - 'drug_exposure': { - 'query': COHORT_CONCEPT_DRUG_PREVALENCE_QUERY, - 'default_vocab': 'RxNorm' - } - } _instance = None # indicating a singleton with only one instance of the class ever created def __new__(cls, *args, **kwargs): if cls._instance is None: @@ -142,7 +132,7 @@ def get_cohort(self, cohort_definition_id): return [dict(zip(headers, row)) for row in rows] def _create_omop_table(self, table_name): - if self.omop_cdm_db_url is not None and not self.omop_cdm_db_url.endswith('.duckdb'): + if self.omop_cdm_db_url is not None and not 
self.omop_cdm_db_url.endswith('duckdb'): # need to create person table from OMOP CDM postgreSQL database self.conn.execute(f""" CREATE TABLE IF NOT EXISTS {table_name} AS @@ -237,25 +227,29 @@ def get_cohort_distributions(self, cohort_definition_id: int, variable: str): notify_users(f"Error computing cohort {variable} distributions: {e}", level='error') return None - def get_cohort_concept_stats(self, cohort_definition_id: int, + def get_cohort_concept_stats(self, cohort_definition_id: int, qry_builder, concept_type='condition_occurrence', filter_count=0, vocab=None, include_hierarchy=False): """ Get concept statistics for a cohort from the cohort table. """ concept_stats = {} - if concept_type not in self.__class__.cohort_concept_queries: - notify_users(f"input {concept_type} is not a valid concept type. " - f"Supported concept types are: {self.__class__.cohort_concept_queries.keys()}", level='error') - return concept_stats + try: if (self._create_omop_table('concept') and self._create_omop_table('concept_ancestor') and self._create_omop_table(concept_type)): - query_str = self.__class__.cohort_concept_queries[concept_type]['query'] - if not vocab: - vocab = self.__class__.cohort_concept_queries[concept_type]['default_vocab'] - query = query_str.format(cid=cohort_definition_id, filter_count=filter_count, - vocab=vocab, include_hierarchy=include_hierarchy) + # validate input vocab if it is not None + if vocab is not None: + valid_vocabs = self._execute_query("SELECT distinct vocabulary_id FROM concept") + valid_vocab_ids = [row['vocabulary_id'] for row in valid_vocabs] + if vocab not in valid_vocab_ids: + notify_users(f"input {vocab} is not a valid vocabulary in OMOP. 
" + f"Supported vocabulary ids are: {valid_vocab_ids}", + level='error') + return concept_stats + + query = qry_builder.build_concept_prevalence_query(concept_type, cohort_definition_id, + filter_count, vocab, include_hierarchy) concept_stats[concept_type] = self._execute_query(query) cs_df = pd.DataFrame(concept_stats[concept_type]) # Combine concept_name and prevalence into a "details" column diff --git a/biasanalyzer/models.py b/biasanalyzer/models.py index 2ec6140..72790fa 100644 --- a/biasanalyzer/models.py +++ b/biasanalyzer/models.py @@ -9,42 +9,49 @@ "concept_id": "condition_concept_id", "start_date": "condition_start_date", "end_date": "condition_end_date", + "default_vocab": "SNOMED" # for use by concept prevalence query }, "drug_exposure": { "table": "drug_exposure", "concept_id": "drug_concept_id", "start_date": "drug_exposure_start_date", "end_date": "drug_exposure_end_date", + "default_vocab": "RxNorm" # for use by concept prevalence query }, "procedure_occurrence": { "table": "procedure_occurrence", "concept_id": "procedure_concept_id", "start_date": "procedure_date", "end_date": "procedure_date", + "default_vocab": "SNOMED" # for use by concept prevalence query }, "visit_occurrence": { "table": "visit_occurrence", "concept_id": "visit_concept_id", "start_date": "visit_start_date", "end_date": "visit_end_date", + "default_vocab": "SNOMED" # for use by concept prevalence query }, "measurement": { "table": "measurement", "concept_id": "measurement_concept_id", "start_date": "measurement_date", "end_date": "measurement_date", + "default_vocab": "LOINC" # for use by concept prevalence query }, "observation": { "table": "observation", "concept_id": "observation_concept_id", "start_date": "observation_date", "end_date": "observation_date", + "default_vocab": "SNOMED" # for use by concept prevalence query }, "date": { # Special case for static timestamps "table": None, "concept_id": None, "start_date": "timestamp", "end_date": "timestamp", + 
"default_vocab": None } } diff --git a/biasanalyzer/sql.py b/biasanalyzer/sql.py index a87c665..10bbef7 100644 --- a/biasanalyzer/sql.py +++ b/biasanalyzer/sql.py @@ -146,127 +146,3 @@ WHERE c.cohort_definition_id = {} GROUP BY p.ethnicity_concept_id ''' - -COHORT_CONCEPT_CONDITION_PREVALENCE_QUERY = ''' - WITH cohort_conditions AS ( - -- Compute the counts for each condition node - SELECT - co.condition_concept_id AS concept_id, - ct.subject_id - FROM - cohort ct - JOIN - condition_occurrence co ON ct.subject_id = co.person_id - AND co.condition_start_date >= ct.cohort_start_date - AND (co.condition_end_date IS NULL OR co.condition_start_date <= ct.cohort_end_date) - WHERE ct.cohort_definition_id = {cid} - ), - aggregated_counts AS ( - -- Aggregate counts for parent nodes using the concept_ancestor table - SELECT - ca.ancestor_concept_id AS concept_id, - COUNT(DISTINCT cc.subject_id) AS count_in_cohort - FROM - cohort_conditions cc - JOIN - concept_ancestor ca - ON cc.concept_id = ca.descendant_concept_id - WHERE - ca.min_levels_of_separation >= 0 - GROUP BY - ca.ancestor_concept_id - ), - concept_hierarchy AS ( - -- Retrieve the direct parent-child hierarchy for all concepts involved - SELECT - ca.ancestor_concept_id, - ca.descendant_concept_id, - FROM - concept_ancestor ca - WHERE - ca.min_levels_of_separation <= 1 - AND ca.descendant_concept_id IN (SELECT concept_id FROM aggregated_counts where count_in_cohort > {filter_count}) - AND ca.ancestor_concept_id IN (SELECT concept_id FROM aggregated_counts where count_in_cohort > {filter_count}) - ) - -- Combine counts and hierarchy with concept details - SELECT DISTINCT - c.concept_name, - c.concept_code, - ac.count_in_cohort, - (ac.count_in_cohort * 1.0 / (SELECT COUNT(DISTINCT subject_id) FROM cohort WHERE cohort_definition_id = {cid})) AS prevalence, - ch.ancestor_concept_id, - ch.descendant_concept_id - FROM - aggregated_counts ac - JOIN - concept_hierarchy ch ON ac.concept_id = ch.descendant_concept_id - JOIN - 
concept c ON ac.concept_id = c.concept_id - WHERE ac.count_in_cohort > {filter_count} - AND ({include_hierarchy} = True OR ch.ancestor_concept_id = ch.descendant_concept_id) - ORDER BY - prevalence DESC; -''' -COHORT_CONCEPT_DRUG_PREVALENCE_QUERY = ''' - WITH cohort_drugs AS ( - -- Compute the counts for each drug node - SELECT - de.drug_concept_id AS concept_id, - ct.subject_id - FROM - cohort ct - JOIN - drug_exposure de ON ct.subject_id = de.person_id - AND de.drug_exposure_start_date >= ct.cohort_start_date - AND (de.drug_exposure_start_date IS NULL OR de.drug_exposure_start_date <= ct.cohort_end_date) - WHERE ct.cohort_definition_id = {cid} - ), - aggregated_counts AS ( - -- Aggregate counts for parent nodes using the concept_ancestor table - SELECT - ca.ancestor_concept_id AS concept_id, - COUNT(DISTINCT cd.subject_id) AS count_in_cohort - FROM - cohort_drugs cd - JOIN - concept_ancestor ca - ON cd.concept_id = ca.descendant_concept_id - JOIN - concept anc ON ca.ancestor_concept_id = anc.concept_id - WHERE - anc.vocabulary_id = '{vocab}' AND - ca.min_levels_of_separation >= 0 -- Ensure valid ancestor relationships - GROUP BY - ca.ancestor_concept_id - ), - concept_hierarchy AS ( - -- Retrieve the hierarchy for all concepts involved - SELECT - ca.ancestor_concept_id, - ca.descendant_concept_id - FROM - concept_ancestor ca - WHERE - ca.min_levels_of_separation = 1 - AND ca.descendant_concept_id IN (SELECT concept_id FROM aggregated_counts where count_in_cohort > {filter_count}) - AND ca.ancestor_concept_id IN (SELECT concept_id FROM aggregated_counts where count_in_cohort > {filter_count}) - ) - -- Combine counts and hierarchy with concept details - SELECT DISTINCT - c.concept_name, - c.concept_code, - ac.count_in_cohort, - (ac.count_in_cohort * 1.0 / (SELECT COUNT(DISTINCT subject_id) FROM cohort WHERE cohort_definition_id = {cid})) AS prevalence, - ch.ancestor_concept_id, - ch.descendant_concept_id - FROM - aggregated_counts ac - JOIN - concept_hierarchy ch 
ON ac.concept_id = ch.descendant_concept_id - JOIN - concept c ON ac.concept_id = c.concept_id - WHERE ac.count_in_cohort > {filter_count} - AND ({include_hierarchy} = True OR ch.ancestor_concept_id = ch.descendant_concept_id) - ORDER BY - prevalence DESC; -''' \ No newline at end of file diff --git a/tests/query_based/test_hierarchical_prevalence.py b/tests/query_based/test_hierarchical_prevalence.py index ba849fa..e0ad55c 100644 --- a/tests/query_based/test_hierarchical_prevalence.py +++ b/tests/query_based/test_hierarchical_prevalence.py @@ -1,4 +1,6 @@ -def test_cohort_concept_hierarchical_prevalence(test_db): +import logging + +def test_cohort_concept_hierarchical_prevalence(test_db, caplog): bias = test_db cohort_query = """ SELECT person_id, condition_concept_id, @@ -15,13 +17,27 @@ def test_cohort_concept_hierarchical_prevalence(test_db): ) # Test cohort object and methods assert cohort is not None, "Cohort creation failed" - # test cohort.get_concept_stats only supports concept stats for condition_occurrence and drug_exposures currently - concept_stats = cohort.get_concept_stats(concept_type='procedure_occurrence') + # test concept_type must be one of the supported OMOP domain name + caplog.clear() + with caplog.at_level(logging.ERROR): + concept_stats = cohort.get_concept_stats(concept_type='dummy_invalid') + assert 'Invalid concept_type' in caplog.text + assert concept_stats == {} + + # test vocab must be None to use the default vocab or one of the supported OMOP vocabulary id + caplog.clear() + with caplog.at_level(logging.ERROR): + concept_stats = cohort.get_concept_stats(vocab='dummy_invalid_vocab') + assert 'is not a valid vocabulary' in caplog.text assert concept_stats == {} + # test the cohort does not have procedure_occurrence related concepts + concept_stats = cohort.get_concept_stats(concept_type='procedure_occurrence') + assert concept_stats == {'procedure_occurrence': []} + include_hierarchy_flags = [True, False] for flag in 
include_hierarchy_flags: - concept_stats = cohort.get_concept_stats(include_hierarchy=flag) + concept_stats = cohort.get_concept_stats(vocab='ICD10CM', include_hierarchy=flag) assert concept_stats is not None, "Failed to fetch concept stats" assert len(concept_stats) > 0, "No concept stats returned" # check returned data with different include_hierarchy flag diff --git a/tests/test_database.py b/tests/test_database.py index 89652ed..d2cb1ff 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -1,6 +1,7 @@ import duckdb import pytest import logging +from biasanalyzer.cohort_query_builder import CohortQueryBuilder from biasanalyzer.database import BiasDatabase @@ -154,21 +155,22 @@ def test_get_cohort_concept_stats_handles_exception(caplog): BiasDatabase._instance = None db = BiasDatabase(":memory:") db.omop_cdm_db_url = 'duckdb' + qry_builder = CohortQueryBuilder(cohort_creation=False) caplog.clear() with caplog.at_level(logging.ERROR): - result = db.get_cohort_concept_stats(123) + result = db.get_cohort_concept_stats(123, qry_builder) assert 'Error computing cohort concept stats' in caplog.text assert result == {} def test_get_cohort_attributes_handles_exception(): BiasDatabase._instance = None db = BiasDatabase(":memory:") - + qry_builder = CohortQueryBuilder(cohort_creation=False) db.omop_cdm_db_url = None result_stats = db.get_cohort_basic_stats(123, variable='age') assert result_stats is None result = db.get_cohort_distributions(123, 'age') assert result is None - result = db.get_cohort_concept_stats(123) + result = db.get_cohort_concept_stats(123, qry_builder) assert result == {} From 17e406c44aab70e8d3096081d0f76f6881019924 Mon Sep 17 00:00:00 2001 From: hyi Date: Thu, 19 Jun 2025 17:30:41 -0400 Subject: [PATCH 05/10] add the jinja2 SQL template to support cohort concept prevalence across domains --- .../cohort_concept_prevalence_query.sql.j2 | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 
biasanalyzer/sql_templates/cohort_concept_prevalence_query.sql.j2 diff --git a/biasanalyzer/sql_templates/cohort_concept_prevalence_query.sql.j2 b/biasanalyzer/sql_templates/cohort_concept_prevalence_query.sql.j2 new file mode 100644 index 0000000..ff3be54 --- /dev/null +++ b/biasanalyzer/sql_templates/cohort_concept_prevalence_query.sql.j2 @@ -0,0 +1,61 @@ +WITH cohort_events AS ( + -- Compute the counts for each concept node + SELECT + e.{{ concept_id_column }} AS concept_id, + ct.subject_id + FROM + cohort ct + JOIN + {{ table_name }} e ON ct.subject_id = e.person_id + AND e.{{ start_date_column }} >= ct.cohort_start_date + AND (ct.cohort_end_date IS NULL OR e.{{ start_date_column }} <= ct.cohort_end_date) + WHERE ct.cohort_definition_id = {{ cid }} +), +aggregated_counts AS ( + -- Aggregate counts for parent nodes using the concept_ancestor table + SELECT + ca.ancestor_concept_id AS concept_id, + COUNT(DISTINCT ce.subject_id) AS count_in_cohort + FROM + cohort_events ce + JOIN + concept_ancestor ca ON ce.concept_id = ca.descendant_concept_id + JOIN + concept anc ON ca.ancestor_concept_id = anc.concept_id + WHERE + anc.vocabulary_id = '{{ vocab }}' + AND ca.min_levels_of_separation >= 0 + GROUP BY + ca.ancestor_concept_id +), +concept_hierarchy AS ( + -- Retrieve the direct parent-child hierarchy for all concepts involved + SELECT + ca.ancestor_concept_id, + ca.descendant_concept_id + FROM + concept_ancestor ca + WHERE + ca.min_levels_of_separation <= 1 + AND ca.descendant_concept_id IN (SELECT concept_id FROM aggregated_counts WHERE count_in_cohort > {{ filter_count }}) + AND ca.ancestor_concept_id IN (SELECT concept_id FROM aggregated_counts WHERE count_in_cohort > {{ filter_count }}) +) +-- Combine counts and hierarchy with concept details +SELECT DISTINCT + c.concept_name, + c.concept_code, + ac.count_in_cohort, + (ac.count_in_cohort * 1.0 / (SELECT COUNT(DISTINCT subject_id) FROM cohort WHERE cohort_definition_id = {{ cid }})) AS prevalence, + 
ch.ancestor_concept_id, + ch.descendant_concept_id +FROM + aggregated_counts ac +JOIN + concept_hierarchy ch ON ac.concept_id = ch.descendant_concept_id +JOIN + concept c ON ac.concept_id = c.concept_id +WHERE + ac.count_in_cohort > {{ filter_count }} + AND ({{ include_hierarchy }} = True OR ch.ancestor_concept_id = ch.descendant_concept_id) +ORDER BY + prevalence DESC; \ No newline at end of file From 991aa887647d3a3e6114dd0f161eccc823bb4860 Mon Sep 17 00:00:00 2001 From: hyi Date: Thu, 19 Jun 2025 17:33:41 -0400 Subject: [PATCH 06/10] corrected a comment in the cohorts totorial notebook --- notebooks/BiasAnalyzerCohortsTutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/BiasAnalyzerCohortsTutorial.ipynb b/notebooks/BiasAnalyzerCohortsTutorial.ipynb index d196ccf..4ac4bfc 100644 --- a/notebooks/BiasAnalyzerCohortsTutorial.ipynb +++ b/notebooks/BiasAnalyzerCohortsTutorial.ipynb @@ -341,7 +341,7 @@ } ], "source": [ - "# create a user study cohort with all COVID patients above the age of 65\n", + "# create a user study cohort with young female COVID patients\n", "study_cohort = bias.create_cohort('Young female COVID patients', \n", " 'A cohort of female COVID patients born between 2000 and 2020', \n", " '../tests/assets/cohort_creation/test_cohort_creation_condition_occurrence_config_study.yaml', \n", From 50e786f3c92e7e672ff42e6e5c5e483b61aef4e5 Mon Sep 17 00:00:00 2001 From: hyi Date: Thu, 19 Jun 2025 22:34:04 -0400 Subject: [PATCH 07/10] replaced developer version of cohort concept prevalence notebook with user-friendly tutorial notebook --- .../BiasAnalyzerCohortConceptTutorial.ipynb | 1052 ++++++++++++++ ...iasAnalyzerTestingCohortConceptStats.ipynb | 1205 ----------------- 2 files changed, 1052 insertions(+), 1205 deletions(-) create mode 100644 notebooks/BiasAnalyzerCohortConceptTutorial.ipynb delete mode 100644 notebooks/BiasAnalyzerTestingCohortConceptStats.ipynb diff --git 
a/notebooks/BiasAnalyzerCohortConceptTutorial.ipynb b/notebooks/BiasAnalyzerCohortConceptTutorial.ipynb new file mode 100644 index 0000000..a35de32 --- /dev/null +++ b/notebooks/BiasAnalyzerCohortConceptTutorial.ipynb @@ -0,0 +1,1052 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fdc0d263", + "metadata": {}, + "source": [ + "# Using BiasAnalyzer for Cohort Concept Prevalence Exploration\n", + "\n", + "This tutorial demonstrates how to use the `BiasAnalyzer` package to explore **concept prevalence** within a cohort - a key step in identifying potential biases during cohort selection. It complements the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb) by focusing specifically on analyzing which clincial concepts (e.g., diagnoses, procedures, medications) are most common in a selected cohort. In the OMOP (Observational Medical Outcomes Partnership) CDM (Common Data Model), a **concept** refers to a coded term from a standardized medical vocabulary, uniquely identified by a **concept ID**. 
All clinical events in OMOP, such as conditions, drug exposures, procedures, measurements, and events, are represented as concepts.\n", + "\n", + "---\n", + "\n", + "### Overview\n", + "\n", + "**Objective**: \n", + "Learn how to retrieve and analyze concept prevalence within a cohort using `BiasAnalyzer`.\n", + "\n", + "**Before You Begin**: \n", + "The `BiasAnalyzer` package is currently in active development and has not yet been officially released on PyPI.\n", + "You can install it in one of the two ways:\n", + "\n", + "- **Install from GitHub (recommended during development)**:\n", + "```bash\n", + "pip install git+https://github.com/vaclab/BiasAnalyzer.git\n", + "```\n", + "- **Install from PyPI (once the pacakge is officially released)**:\n", + "```bash\n", + "pip install biasanalyzer\n", + "```\n", + "\n", + "For full setup and usage instructions, refer to the [README](https://github.com/VACLab/BiasAnalyzer/blob/main/README.md).\n", + "\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "bb028875", + "metadata": {}, + "source": [ + "### Preparation for cohort concept prevalence exploration\n", + "**Preparation step 1**: Import the `BIAS` class from the `api` module of the `BiasAnalyzer` package, create an object `bias` of the `BIAS` class, specify OMOP CDM database configurations on the `bias` object, and set OMOP CDM database to enable connection to the database. Refer to the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb) for more details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6dc76f46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "configuration specified in ../config.yaml loaded successfully\n", + "Connected to the OMOP CDM database (read-only).\n", + "Cohort Definition table created.\n", + "Cohort table created.\n" + ] + } + ], + "source": [ + "from biasanalyzer.api import BIAS\n", + "\n", + "bias = BIAS()\n", + "\n", + "bias.set_config('../config.yaml')\n", + "\n", + "bias.set_root_omop()" + ] + }, + { + "cell_type": "markdown", + "id": "8731e481", + "metadata": {}, + "source": [ + "———————————————\n", + "\n", + "**Preparation step 2**: Create a cohort of young female COVID patients using the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function on the `bias` object for cohort concept prevalence exploration. You'll pass the name of the cohort as the first argument, the description of the cohort as the second argument, a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query as the third argument, and the cohort owner's name indicating who owns or creates this cohort as the fourth argument. After the cohort is created, you can call `get_stats()` and `get_distributions()` functions on the returned `cohort_data` object to explore cohort statistics and distributions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "51969248-f348-4f0d-914f-bb908183e3f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "template_path: /home/hongyi/BiasAnalyzer/biasanalyzer/sql_templates, cohort_creation: True\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b99aedde4936451e9c0b8e75f2bcc620", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Cohort creation: 0%| | 0/3 [00:00=1.1.1 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for duckdb<2.0.0,>=1.1.1 from https://files.pythonhosted.org/packages/50/52/6e6f5b5b07841cec334ca6b98f2e02b7bb54ab3b99c49aa3a161cc0b4b37/duckdb-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached duckdb-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (966 bytes)\n", - "Collecting duckdb-engine<0.14.0,>=0.13.2 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for duckdb-engine<0.14.0,>=0.13.2 from https://files.pythonhosted.org/packages/ef/5d/81a0d67483d0767e4fbf7444b079b3f21574a184b0888782ced1c2172777/duckdb_engine-0.13.6-py3-none-any.whl.metadata\n", - " Using cached duckdb_engine-0.13.6-py3-none-any.whl.metadata (8.0 kB)\n", - "Collecting ipytree<0.3.0,>=0.2.2 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ipytree<0.3.0,>=0.2.2 from https://files.pythonhosted.org/packages/e4/03/35cf1742598d784e96153175233318a2332f71863e55ad1007c9264c1a7a/ipytree-0.2.2-py2.py3-none-any.whl.metadata\n", - " Using cached ipytree-0.2.2-py2.py3-none-any.whl.metadata (849 bytes)\n", - "Collecting ipywidgets<9.0.0,>=8.1.5 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ipywidgets<9.0.0,>=8.1.5 from https://files.pythonhosted.org/packages/22/2d/9c0b76f2f9cc0ebede1b9371b6f317243028ed60b90705863d493bae622e/ipywidgets-8.1.5-py3-none-any.whl.metadata\n", - " Using 
cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)\n", - "Collecting jinja2==3.1.5 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for jinja2==3.1.5 from https://files.pythonhosted.org/packages/bd/0f/2ba5fbcd631e3e88689309dbe978c5769e883e4b84ebfe7da30b43275c5a/jinja2-3.1.5-py3-none-any.whl.metadata\n", - " Using cached jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)\n", - "Collecting numpy==1.24.4 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for numpy==1.24.4 from https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n", - "Collecting pandas==2.0.3 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pandas==2.0.3 from https://files.pythonhosted.org/packages/d0/28/88b81881c056376254618fad622a5e94b5126db8c61157ea1910cd1c040a/pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n", - "Collecting psycopg2<3.0.0,>=2.9.1 (from biasanalyzer==0.1.0)\n", - " Using cached psycopg2-2.9.10-cp311-cp311-linux_x86_64.whl\n", - "Collecting pydantic<3.0.0,>=2.9.2 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pydantic<3.0.0,>=2.9.2 from https://files.pythonhosted.org/packages/f4/3c/8cc1cc84deffa6e25d2d0c688ebb80635dfdbf1dbea3e30c541c8cf4d860/pydantic-2.10.6-py3-none-any.whl.metadata\n", - " Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)\n", - "Collecting pyyaml<7.0.0,>=6.0.2 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pyyaml<7.0.0,>=6.0.2 from 
https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)\n", - "Collecting scipy==1.10.1 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for scipy==1.10.1 from https://files.pythonhosted.org/packages/21/cd/fe2d4af234b80dc08c911ce63fdaee5badcdde3e9bcd9a68884580652ef0/scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n", - "Collecting sqlalchemy<3.0.0,>=2.0.35 (from biasanalyzer==0.1.0)\n", - " Obtaining dependency information for sqlalchemy<3.0.0,>=2.0.35 from https://files.pythonhosted.org/packages/ff/0a/46f3171f564a19a1daf6e7e0e6c8afc6ecd792f947c6de435519d4d16af3/sqlalchemy-2.0.39-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached sqlalchemy-2.0.39-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)\n", - "Collecting MarkupSafe>=2.0 (from jinja2==3.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for MarkupSafe>=2.0 from https://files.pythonhosted.org/packages/f1/a4/aefb044a2cd8d7334c8a47d3fb2c9f328ac48cb349468cc31c20b539305f/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)\n", - "Collecting python-dateutil>=2.8.2 (from pandas==2.0.3->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for python-dateutil>=2.8.2 from https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata\n", - " Using cached 
python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)\n", - "Collecting pytz>=2020.1 (from pandas==2.0.3->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz-2025.1-py2.py3-none-any.whl.metadata\n", - " Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)\n", - "Collecting tzdata>=2022.1 (from pandas==2.0.3->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for tzdata>=2022.1 from https://files.pythonhosted.org/packages/0f/dd/84f10e23edd882c6f968c21c2434fe67bd4a528967067515feca9e611e5e/tzdata-2025.1-py2.py3-none-any.whl.metadata\n", - " Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)\n", - "Collecting packaging>=21 (from duckdb-engine<0.14.0,>=0.13.2->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for packaging>=21 from https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl.metadata\n", - " Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)\n", - "Collecting comm>=0.1.3 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for comm>=0.1.3 from https://files.pythonhosted.org/packages/e6/75/49e5bfe642f71f272236b5b2d2691cf915a7283cc0ceda56357b61daa538/comm-0.2.2-py3-none-any.whl.metadata\n", - " Using cached comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)\n", - "Collecting ipython>=6.1.0 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ipython>=6.1.0 from https://files.pythonhosted.org/packages/20/3a/917cb9e72f4e1a4ea13c862533205ae1319bd664119189ee5cc9e4e95ebf/ipython-9.0.2-py3-none-any.whl.metadata\n", - " Using cached ipython-9.0.2-py3-none-any.whl.metadata (4.3 kB)\n", - "Collecting traitlets>=4.3.1 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining 
dependency information for traitlets>=4.3.1 from https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl.metadata\n", - " Using cached traitlets-5.14.3-py3-none-any.whl.metadata (10 kB)\n", - "Collecting widgetsnbextension~=4.0.12 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for widgetsnbextension~=4.0.12 from https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl.metadata\n", - " Using cached widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)\n", - "Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for jupyterlab-widgets~=3.0.12 from https://files.pythonhosted.org/packages/a9/93/858e87edc634d628e5d752ba944c2833133a28fa87bb093e6832ced36a3e/jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata\n", - " Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)\n", - "Collecting annotated-types>=0.6.0 (from pydantic<3.0.0,>=2.9.2->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for annotated-types>=0.6.0 from https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl.metadata\n", - " Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n", - "Collecting pydantic-core==2.27.2 (from pydantic<3.0.0,>=2.9.2->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pydantic-core==2.27.2 from https://files.pythonhosted.org/packages/a8/7c/b860618c25678bbd6d1d99dbdfdf0510ccb50790099b963ff78a124b754f/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", - "Collecting 
typing-extensions>=4.12.2 (from pydantic<3.0.0,>=2.9.2->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for typing-extensions>=4.12.2 from https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl.metadata\n", - " Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)\n", - "Collecting greenlet!=0.4.17 (from sqlalchemy<3.0.0,>=2.0.35->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for greenlet!=0.4.17 from https://files.pythonhosted.org/packages/f7/4b/1c9695aa24f808e156c8f4813f685d975ca73c000c2a5056c514c64980f6/greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata\n", - " Using cached greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)\n", - "Collecting decorator (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for decorator from https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl.metadata\n", - " Using cached decorator-5.2.1-py3-none-any.whl.metadata (3.9 kB)\n", - "Collecting ipython-pygments-lexers (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ipython-pygments-lexers from https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl.metadata\n", - " Using cached ipython_pygments_lexers-1.1.1-py3-none-any.whl.metadata (1.1 kB)\n", - "Collecting jedi>=0.16 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for jedi>=0.16 from https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl.metadata\n", - " Using cached 
jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)\n", - "Collecting matplotlib-inline (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for matplotlib-inline from https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl.metadata\n", - " Using cached matplotlib_inline-0.1.7-py3-none-any.whl.metadata (3.9 kB)\n", - "Collecting pexpect>4.3 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pexpect>4.3 from https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl.metadata\n", - " Using cached pexpect-4.9.0-py2.py3-none-any.whl.metadata (2.5 kB)\n", - "Collecting prompt_toolkit<3.1.0,>=3.0.41 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for prompt_toolkit<3.1.0,>=3.0.41 from https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl.metadata\n", - " Using cached prompt_toolkit-3.0.50-py3-none-any.whl.metadata (6.6 kB)\n", - "Collecting pygments>=2.4.0 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pygments>=2.4.0 from https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl.metadata\n", - " Using cached pygments-2.19.1-py3-none-any.whl.metadata (2.5 kB)\n", - "Collecting stack_data (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for stack_data from https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl.metadata\n", - " Using cached 
stack_data-0.6.3-py3-none-any.whl.metadata (18 kB)\n", - "Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas==2.0.3->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for six>=1.5 from https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl.metadata\n", - " Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)\n", - "Collecting parso<0.9.0,>=0.8.4 (from jedi>=0.16->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for parso<0.9.0,>=0.8.4 from https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl.metadata\n", - " Using cached parso-0.8.4-py2.py3-none-any.whl.metadata (7.7 kB)\n", - "Collecting ptyprocess>=0.5 (from pexpect>4.3->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for ptyprocess>=0.5 from https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl.metadata\n", - " Using cached ptyprocess-0.7.0-py2.py3-none-any.whl.metadata (1.3 kB)\n", - "Collecting wcwidth (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for wcwidth from https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl.metadata\n", - " Using cached wcwidth-0.2.13-py2.py3-none-any.whl.metadata (14 kB)\n", - "Collecting executing>=1.2.0 (from stack_data->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for executing>=1.2.0 from https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl.metadata\n", - " 
Using cached executing-2.2.0-py2.py3-none-any.whl.metadata (8.9 kB)\n", - "Collecting asttokens>=2.1.0 (from stack_data->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for asttokens>=2.1.0 from https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl.metadata\n", - " Using cached asttokens-3.0.0-py3-none-any.whl.metadata (4.7 kB)\n", - "Collecting pure-eval (from stack_data->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->biasanalyzer==0.1.0)\n", - " Obtaining dependency information for pure-eval from https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl.metadata\n", - " Using cached pure_eval-0.2.3-py3-none-any.whl.metadata (6.3 kB)\n", - "Using cached jinja2-3.1.5-py3-none-any.whl (134 kB)\n", - "Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n", - "Using cached pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)\n", - "Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.1 MB)\n", - "Using cached duckdb-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.2 MB)\n", - "Using cached duckdb_engine-0.13.6-py3-none-any.whl (48 kB)\n", - "Using cached ipytree-0.2.2-py2.py3-none-any.whl (1.3 MB)\n", - "Using cached ipywidgets-8.1.5-py3-none-any.whl (139 kB)\n", - "Using cached pydantic-2.10.6-py3-none-any.whl (431 kB)\n", - "Using cached pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n", - "Using cached PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (762 kB)\n", - "Using cached sqlalchemy-2.0.39-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)\n", - "Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)\n", - "Using cached 
comm-0.2.2-py3-none-any.whl (7.2 kB)\n", - "Using cached greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (602 kB)\n", - "Using cached ipython-9.0.2-py3-none-any.whl (600 kB)\n", - "Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)\n", - "Using cached MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23 kB)\n", - "Using cached packaging-24.2-py3-none-any.whl (65 kB)\n", - "Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)\n", - "Using cached pytz-2025.1-py2.py3-none-any.whl (507 kB)\n", - "Using cached traitlets-5.14.3-py3-none-any.whl (85 kB)\n", - "Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)\n", - "Using cached tzdata-2025.1-py2.py3-none-any.whl (346 kB)\n", - "Using cached widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)\n", - "Using cached jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)\n", - "Using cached pexpect-4.9.0-py2.py3-none-any.whl (63 kB)\n", - "Using cached prompt_toolkit-3.0.50-py3-none-any.whl (387 kB)\n", - "Using cached pygments-2.19.1-py3-none-any.whl (1.2 MB)\n", - "Using cached six-1.17.0-py2.py3-none-any.whl (11 kB)\n", - "Using cached decorator-5.2.1-py3-none-any.whl (9.2 kB)\n", - "Using cached ipython_pygments_lexers-1.1.1-py3-none-any.whl (8.1 kB)\n", - "Using cached matplotlib_inline-0.1.7-py3-none-any.whl (9.9 kB)\n", - "Using cached stack_data-0.6.3-py3-none-any.whl (24 kB)\n", - "Using cached asttokens-3.0.0-py3-none-any.whl (26 kB)\n", - "Using cached executing-2.2.0-py2.py3-none-any.whl (26 kB)\n", - "Using cached parso-0.8.4-py2.py3-none-any.whl (103 kB)\n", - "Using cached ptyprocess-0.7.0-py2.py3-none-any.whl (13 kB)\n", - "Using cached pure_eval-0.2.3-py3-none-any.whl (11 kB)\n", - "Using cached wcwidth-0.2.13-py2.py3-none-any.whl (34 kB)\n", - "Building wheels for collected packages: biasanalyzer\n", - " Building wheel for biasanalyzer (pyproject.toml) ... 
\u001B[?25ldone\n", - "\u001B[?25h Created wheel for biasanalyzer: filename=biasanalyzer-0.1.0-py3-none-any.whl size=25475 sha256=1982c82749337f81db1a730b8cc25c049d0c0788cd6b782f69ce8be1d92a397c\n", - " Stored in directory: /home/hyi/temp/pip-ephem-wheel-cache-f_9rcqkk/wheels/25/75/4e/079d96d69cc58148ce31d3d44f858e4db5f689604112dcb7c3\n", - "Successfully built biasanalyzer\n", - "Installing collected packages: wcwidth, pytz, pure-eval, ptyprocess, widgetsnbextension, tzdata, typing-extensions, traitlets, six, pyyaml, pygments, psycopg2, prompt_toolkit, pexpect, parso, packaging, numpy, MarkupSafe, jupyterlab-widgets, greenlet, executing, duckdb, decorator, asttokens, annotated-types, stack_data, sqlalchemy, scipy, python-dateutil, pydantic-core, matplotlib-inline, jinja2, jedi, ipython-pygments-lexers, comm, pydantic, pandas, ipython, duckdb-engine, ipywidgets, ipytree, biasanalyzer\n", - "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "ipympl 0.9.3 requires ipython<9, but you have ipython 9.0.2 which is incompatible.\u001B[0m\u001B[31m\n", - "\u001B[0mSuccessfully installed MarkupSafe-3.0.2 annotated-types-0.7.0 asttokens-3.0.0 biasanalyzer-0.1.0 comm-0.2.2 decorator-5.2.1 duckdb-1.2.1 duckdb-engine-0.13.6 executing-2.2.0 greenlet-3.1.1 ipython-9.0.2 ipython-pygments-lexers-1.1.1 ipytree-0.2.2 ipywidgets-8.1.5 jedi-0.19.2 jinja2-3.1.5 jupyterlab-widgets-3.0.13 matplotlib-inline-0.1.7 numpy-1.24.4 packaging-24.2 pandas-2.0.3 parso-0.8.4 pexpect-4.9.0 prompt_toolkit-3.0.50 psycopg2-2.9.10 ptyprocess-0.7.0 pure-eval-0.2.3 pydantic-2.10.6 pydantic-core-2.27.2 pygments-2.19.1 python-dateutil-2.9.0.post0 pytz-2025.1 pyyaml-6.0.2 scipy-1.10.1 six-1.17.0 sqlalchemy-2.0.39 stack_data-0.6.3 traitlets-5.14.3 typing-extensions-4.12.2 tzdata-2025.1 wcwidth-0.2.13 widgetsnbextension-4.0.13\n" - ] - } - ], - "source": [ - "# Have to specify TMPDIR and target in pip install command to work around the kernel crash issue due to \n", - "# the small ephemeral local storage quota allocated to /tmp which is used by default by pip install\n", - "!TMPDIR=/home/hyi/temp pip install git+https://github.com/vaclab/BiasAnalyzer.git --target /home/hyi/temp --upgrade" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9ce3b87c-0754-4eae-9f85-8210104e2b0b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# append the target folder where HealthDataBias module was installed to PYTHONPATH\n", - "import sys\n", - "sys.path.append('/home/hyi/temp')\n", - "import pandas as pd\n", - "pd.set_option('display.max_rows', None)\n", - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.width', 1000)\n", - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "729e8803-74f8-4180-aa8b-0e44567f8aeb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from 
biasanalyzer.api import BIAS" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "548223ed-8948-461e-b9d6-40a0ec7fc89f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "no configuration file specified. Call set_config(config_file_path) next to specify configurations\n" - ] - } - ], - "source": [ - "# create an object of BIAS class\n", - "bias = BIAS()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "7d440d9f-c7fa-4ef1-ad66-31274ebef4ea", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "configuration specified in /home/hyi/bias/config/config.yaml loaded successfully\n" - ] - } - ], - "source": [ - "bias.set_config('/home/hyi/bias/config/config.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "283156f8-63da-42a5-bbd7-ee2b7719652c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connected to the OMOP CDM database (read-only).\n", - "Cohort Definition table created.\n", - "Cohort table created.\n" - ] - } - ], - "source": [ - "# the configuration file includes root_omop_cdm_database configuration info with an example shown \n", - "# in https://github.com/hyi/HealthDataBias/blob/main/tests/assets/config/test_config.yaml\n", - "bias.set_root_omop()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "7192ab6d-0845-4bcd-acda-f00157d4215d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "either domain or vocabulary must be set to constrain the number of returned concepts\n", - "concepts for COVID-19 in Condition domain with SNOMED vocabulary:\n", - " concept_id concept_name valid_start_date valid_end_date domain_id vocabulary_id\n", - "0 703440 COVID-19 confirmed using clinical diagnostic c... 
2020-04-01 2099-12-31 Condition SNOMED\n", - "1 703441 COVID-19 confirmed by laboratory test 2020-04-01 2099-12-31 Condition SNOMED\n", - "2 703445 Low risk category for developing complication ... 2020-04-01 2099-12-31 Condition SNOMED\n", - "3 703446 Moderate risk category for developing complica... 2020-04-01 2099-12-31 Condition SNOMED\n", - "4 703447 High risk category for developing complication... 2020-04-01 2099-12-31 Condition SNOMED\n", - "5 37310269 COVID-19 2020-02-04 2020-10-28 Condition SNOMED\n", - "6 37311061 COVID-19 2020-01-31 2099-12-31 Condition SNOMED\n", - "concepts for COVID-19 in Condition domain:\n", - " concept_id concept_name valid_start_date valid_end_date domain_id vocabulary_id\n", - "0 702953 Emergency use of U07.1 | COVID-19 2020-04-01 2099-12-31 Condition ICD10CM\n", - "1 703440 COVID-19 confirmed using clinical diagnostic c... 2020-04-01 2099-12-31 Condition SNOMED\n", - "2 703441 COVID-19 confirmed by laboratory test 2020-04-01 2099-12-31 Condition SNOMED\n", - "3 703445 Low risk category for developing complication ... 2020-04-01 2099-12-31 Condition SNOMED\n", - "4 703446 Moderate risk category for developing complica... 2020-04-01 2099-12-31 Condition SNOMED\n", - "5 703447 High risk category for developing complication... 2020-04-01 2099-12-31 Condition SNOMED\n", - "6 756023 Acute bronchitis caused by COVID-19 2020-03-18 2021-01-29 Condition OMOP Extension\n", - "7 756031 Bronchitis caused by COVID-19 2020-03-18 2099-12-31 Condition OMOP Extension\n", - "8 756039 Respiratory infection caused by COVID-19 2020-03-18 2099-12-31 Condition OMOP Extension\n", - "9 756044 Acute respiratory distress syndrome (ARDS) cau... 2020-03-18 2021-01-29 Condition OMOP Extension\n", - "10 756061 Asymptomatic COVID-19 2020-03-18 2021-01-29 Condition OMOP Extension\n", - "11 756081 Infection of lower respiratory tract caused by... 
2020-03-18 2021-01-29 Condition OMOP Extension\n", - "12 37310269 COVID-19 2020-02-04 2020-10-28 Condition SNOMED\n", - "13 37311061 COVID-19 2020-01-31 2099-12-31 Condition SNOMED\n", - "concepts for COVID-19 in SNOMED vocabulary:\n", - " concept_id concept_name valid_start_date valid_end_date domain_id vocabulary_id\n", - "0 703420 COVID-19 presenting complaints simple referenc... 2020-04-01 2099-12-31 Metadata SNOMED\n", - "1 703421 COVID-19 health issues simple reference set 2020-04-01 2099-12-31 Metadata SNOMED\n", - "2 703422 COVID-19 procedures simple reference set 2020-04-01 2099-12-31 Metadata SNOMED\n", - "3 703423 COVID-19 record extraction simple reference set 2020-04-01 2099-12-31 Metadata SNOMED\n", - "4 703424 Provision of advice, assessment or treatment l... 2020-04-01 2099-12-31 Observation SNOMED\n", - "5 703429 COVID-19 excluded using clinical diagnostic cr... 2020-04-01 2099-12-31 Observation SNOMED\n", - "6 703430 COVID-19 excluded by laboratory test 2020-04-01 2099-12-31 Observation SNOMED\n", - "7 703431 COVID-19 excluded 2020-04-01 2020-10-28 Observation SNOMED\n", - "8 703440 COVID-19 confirmed using clinical diagnostic c... 2020-04-01 2099-12-31 Condition SNOMED\n", - "9 703441 COVID-19 confirmed by laboratory test 2020-04-01 2099-12-31 Condition SNOMED\n", - "10 703442 Assessment using COVID-19 severity scale 2020-04-01 2099-12-31 Procedure SNOMED\n", - "11 703443 COVID-19 severity scale 2020-04-01 2099-12-31 Measurement SNOMED\n", - "12 703444 COVID-19 severity score 2020-04-01 2099-12-31 Measurement SNOMED\n", - "13 703445 Low risk category for developing complication ... 2020-04-01 2099-12-31 Condition SNOMED\n", - "14 703446 Moderate risk category for developing complica... 2020-04-01 2099-12-31 Condition SNOMED\n", - "15 703447 High risk category for developing complication... 2020-04-01 2099-12-31 Condition SNOMED\n", - "16 3657496 Provision of advice, assessment or treatment d... 
2020-05-13 2099-12-31 Observation SNOMED\n", - "17 3657558 COVID-19 test result communication to general ... 2020-06-10 2099-12-31 Metadata SNOMED\n", - "18 3657559 COVID-19 test result communication to general ... 2020-06-10 2099-12-31 Metadata SNOMED\n", - "19 37310268 Suspected COVID-19 2020-02-04 2020-10-28 Observation SNOMED\n", - "20 37310269 COVID-19 2020-02-04 2020-10-28 Condition SNOMED\n", - "21 37311060 Suspected COVID-19 2020-01-31 2099-12-31 Observation SNOMED\n", - "22 37311061 COVID-19 2020-01-31 2099-12-31 Condition SNOMED\n" - ] - } - ], - "source": [ - "bias.get_concepts(\"COVID-19\")\n", - "concepts = bias.get_concepts(\"COVID-19\", \"Condition\", \"SNOMED\")\n", - "print(f'concepts for COVID-19 in Condition domain with SNOMED vocabulary:\\n {pd.DataFrame(concepts)}')\n", - "concepts = bias.get_concepts(\"COVID-19\", domain=\"Condition\")\n", - "print(f'concepts for COVID-19 in Condition domain:\\n {pd.DataFrame(concepts)}')\n", - "concepts = bias.get_concepts(\"COVID-19\", vocabulary=\"SNOMED\")\n", - "print(f'concepts for COVID-19 in SNOMED vocabulary:\\n {pd.DataFrame(concepts)}')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9a52ab5f-57a8-4942-8a03-ec86651e919e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "template_name: \"cohort_creation_condition_occurrence_query\"\n", - "\n", - "inclusion_criteria:\n", - " demographics: # Optional\n", - " gender: 'female' # accepted values: female or male, optional field\n", - " min_birth_year: 2000 # Born at the year of 2000 or after, optional field\n", - " temporal_events:\n", - " - operator: 'AND'\n", - " events:\n", - " - event_type: 'condition_occurrence'\n", - " event_concept_id: 37311061 # COVID condition\n" - ] - } - ], - "source": [ - "# create a cohort with all COVID-19 female patients under 24 years old\n", - "# cohort_query = ('SELECT c.person_id, c.condition_start_date as cohort_start_date, '\n", - 
"# 'c.condition_end_date as cohort_end_date '\n", - "# 'FROM condition_occurrence c JOIN '\n", - "# 'person p ON c.person_id = p.person_id '\n", - "# 'WHERE c.condition_concept_id = 37311061 '\n", - "# 'AND p.gender_concept_id = 8532 AND p.year_of_birth > 2000')\n", - "!cat /home/hyi/bias/config/covid_female_born_after_2000_cohort.yaml\n", - "\n", - "cohort_query = '/home/hyi/bias/config/covid_female_born_after_2000_cohort.yaml'\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "70745d05-1f45-4e7d-b3ad-b6e0e45334e1", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "template_path: /home/hyi/temp/biasanalyzer/sql_templates\n", - "configuration specified in /home/hyi/bias/config/covid_female_born_after_2000_cohort.yaml loaded successfully\n", - "Cohort definition inserted successfully.\n", - "Cohort Young female COVID-19 patients successfully created.\n", - "cohort created successfully\n", - "Young female COVID-19 patient cohort definition: {'id': 1, 'name': 'Young female COVID-19 patients', 'description': 'Female patients with COVID-19 condition under 24 years old', 'created_date': datetime.date(2025, 3, 13), 'creation_info': 'WITH ranked_events AS ( SELECT person_id, condition_concept_id, condition_start_date AS event_start_date, condition_end_date AS event_end_date, ROW_NUMBER() OVER ( PARTITION BY person_id, condition_concept_id ORDER BY condition_start_date ASC ) AS event_instance FROM condition_occurrence ), ranked_visits AS ( SELECT person_id, visit_concept_id, visit_start_date AS event_start_date, visit_end_date AS event_end_date, ROW_NUMBER() OVER ( PARTITION BY person_id, visit_concept_id ORDER BY visit_start_date ASC ) AS event_instance FROM visit_occurrence ), condition_qualifying_events AS ( (SELECT person_id, event_start_date, event_end_date FROM ranked_events WHERE condition_concept_id = 37311061) ), filtered_cohort AS ( SELECT c.person_id, MIN(c.event_start_date) AS 
cohort_start_date, MAX(c.event_end_date) AS cohort_end_date FROM condition_qualifying_events c JOIN person p ON c.person_id = p.person_id WHERE 1=1 AND p.gender_concept_id = 8532 AND p.year_of_birth >= 2000 GROUP BY c.person_id ) SELECT * FROM filtered_cohort f', 'created_by': 'system'}\n", - "The first five patients in the cohort: [{'subject_id': 53949, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 2, 28), 'cohort_end_date': datetime.date(2020, 3, 11)}, {'subject_id': 22344, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 11), 'cohort_end_date': datetime.date(2020, 4, 13)}, {'subject_id': 80198, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 5), 'cohort_end_date': datetime.date(2020, 4, 9)}, {'subject_id': 30052, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 3, 6), 'cohort_end_date': datetime.date(2020, 4, 8)}, {'subject_id': 94887, 'cohort_definition_id': 1, 'cohort_start_date': datetime.date(2020, 2, 29), 'cohort_end_date': datetime.date(2020, 3, 24)}]\n" - ] - } - ], - "source": [ - "cohort_data = bias.create_cohort('Young female COVID-19 patients', \n", - " 'Female patients with COVID-19 condition under 24 years old', \n", - " cohort_query, 'system')\n", - "md = cohort_data.metadata\n", - "print(f'Young female COVID-19 patient cohort definition: {md}')\n", - "print(f'The first five patients in the cohort: {cohort_data.data[:5]}')" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b1415805-b065-40b8-b2cd-6db6f5f9ca41", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "the cohort stats: [{'total_count': 10208, 'earliest_start_date': datetime.date(2020, 1, 18), 'latest_start_date': datetime.date(2020, 3, 30), 'earliest_end_date': datetime.date(2020, 2, 7), 'latest_end_date': datetime.date(2020, 5, 3), 'min_duration_days': 8, 'max_duration_days': 37, 'avg_duration_days': 24.25, 
'median_duration': 24, 'stddev_duration': 7.2}]\n", - "the cohort age stats: [{'total_count': 10208, 'min_age': 0, 'max_age': 20, 'avg_age': 10.94, 'median_age': 11, 'stddev_age': 5.92}]\n", - "the cohort gender stats: [{'gender': 'female', 'gender_count': 10208, 'probability': 1.0}]\n", - "the cohort race stats: [{'race': 'Asian', 'race_count': 723, 'probability': 0.07}, {'race': 'Other', 'race_count': 53, 'probability': 0.01}, {'race': 'Black or African American', 'race_count': 866, 'probability': 0.08}, {'race': 'White', 'race_count': 8566, 'probability': 0.84}]\n", - "the cohort ethnicity stats: [{'ethnicity': 'other', 'ethnicity_count': 10208, 'probability': 1.0}]\n" - ] - } - ], - "source": [ - "# get stats of the cohocohort\n", - "cohort_stats = cohort_data.get_stats()\n", - "print(f'the cohort stats: {cohort_stats}')\n", - "cohort_age_stats = cohort_data.get_stats(\"age\")\n", - "print(f'the cohort age stats: {cohort_age_stats}')\n", - "cohort_gender_stats = cohort_data.get_stats(\"gender\")\n", - "print(f'the cohort gender stats: {cohort_gender_stats}')\n", - "cohort_race_stats = cohort_data.get_stats(\"race\")\n", - "print(f'the cohort race stats: {cohort_race_stats}')\n", - "cohort_ethnicity_stats = cohort_data.get_stats(\"ethnicity\")\n", - "print(f'the cohort ethnicity stats: {cohort_ethnicity_stats}')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d54e39da-6f78-4dc1-91ae-a8c26852582a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "the cohort age discrete probability distribution: [{'age_bin': '0-10', 'bin_count': 4744, 'probability': 0.4647}, {'age_bin': '11-20', 'bin_count': 5464, 'probability': 0.5353}, {'age_bin': '21-30', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '31-40', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '41-50', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '51-60', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '61-70', 
'bin_count': 0, 'probability': 0.0}, {'age_bin': '71-80', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '81-90', 'bin_count': 0, 'probability': 0.0}, {'age_bin': '91+', 'bin_count': 0, 'probability': 0.0}]\n" - ] - } - ], - "source": [ - "# get discrete probability distribution of the age variable in the cohort\n", - "cohort_age_distr = cohort_data.get_distributions('age')\n", - "print(f'the cohort age discrete probability distribution: {cohort_age_distr}')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "c7ad0b7b-21dc-4572-af21-fe1580361999", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cohort concept hierarchy for condition_occurrence with root concept ids []:\n", - " concept_name concept_code count_in_cohort prevalence ancestor_concept_id descendant_concept_id\n", - "0 Disease due to Coronaviridae 27619001 10208 1.000000 4100065 4100065\n", - "1 COVID-19 840539006 10208 1.000000 37311061 37311061\n", - "2 Coronavirus infection 186747009 10208 1.000000 439676 439676\n", - "3 Viral disease 34014006 10208 1.000000 440029 440029\n", - "4 Clinical finding 404684003 10208 1.000000 441840 441840\n", - "5 Disorder due to infection 40733004 10208 1.000000 432250 432250\n", - "6 Disease 64572001 10208 1.000000 4274025 4274025\n", - "7 Finding by site 118234003 9153 0.896650 4042140 4042140\n", - "8 Clinical history and observation findings 250171008 9153 0.896650 4094294 4094294\n", - "9 General finding of observation of patient 118222006 9149 0.896258 4041283 4041283\n", - "10 General body state finding 82832008 9080 0.889498 4221108 4221108\n", - "11 Temperature-associated finding 301343009 8769 0.859032 4103474 4103474\n", - "12 Body temperature finding 105723007 8650 0.847375 4022230 4022230\n", - "13 Vital signs finding 118227000 8650 0.847375 4042138 4042138\n", - "14 Abnormal body temperature 123979008 8650 0.847375 4047791 4047791\n", - "15 Body temperature above 
reference range 50177009 8650 0.847375 4178904 4178904\n", - "16 Fever 386661006 8650 0.847375 437663 437663\n", - "17 Respiratory finding 106048009 7785 0.762637 4024567 4024567\n", - "18 Finding of body region 301857004 7200 0.705329 4199402 4199402\n", - "19 Neurological finding 102957003 6664 0.652821 4011630 4011630\n", - "20 Sensory nervous system finding 106147001 6662 0.652625 4024013 4024013\n", - "21 Finding of sensation by site 699697007 6659 0.652332 44783587 44783587\n", - "22 Respiratory function finding 365852007 6596 0.646160 4267789 4267789\n", - "23 Cough 49727002 6596 0.646160 254761 254761\n", - "24 Finding of head and neck region 118254002 6500 0.636755 255919 255919\n", - "25 Head finding 406122000 6460 0.632837 4247371 4247371\n", - "26 Digestive system finding 386617003 5963 0.584150 4302537 4302537\n", - "27 Mouth and/or pharynx finding 249376008 5629 0.551430 4091363 4091363\n", - "28 Finding of head region 298364001 5268 0.516066 4182161 4182161\n", - "29 Finding of mouth region 423066003 4898 0.479820 4307122 4307122\n", - "30 Oral cavity finding 116337000 4898 0.479820 4022570 4022570\n", - "31 Finding of sense of taste 76489005 4893 0.479330 4296465 4296465\n", - "32 Loss of taste 36955009 4893 0.479330 4289517 4289517\n", - "33 General well-being finding 365275006 3776 0.369906 4272867 4272867\n", - "34 Fatigue 84229001 3776 0.369906 4223659 4223659\n", - "35 Energy and stamina finding 359752005 3776 0.369906 4230389 4230389\n", - "36 General problem AND/OR complaint 105721009 3776 0.369906 4022830 4022830\n", - "37 Metabolic finding 106089007 3776 0.369906 432455 432455\n", - "38 Pain 22253000 3565 0.349236 4329041 4329041\n", - "39 Pain / sensation finding 276435006 3565 0.349236 4170962 4170962\n", - "40 Pain finding at anatomical site 279001004 3560 0.348746 4132926 4132926\n", - "41 Sputum finding 248595008 3233 0.316712 4089228 4089228\n", - "42 Ear, nose and throat finding 297268004 2132 0.208856 4178545 4178545\n", - "43 Upper 
respiratory tract finding 301186004 2047 0.200529 4103320 4103320\n", - "44 Finding reported by subject or history provider 418799008 2040 0.199843 4303401 4303401\n", - "45 Difficulty breathing 230145002 1990 0.194945 4041664 4041664\n", - "46 Ease of respiration - finding 366139009 1990 0.194945 4271505 4271505\n", - "47 Finding of respiration 301282008 1990 0.194945 4115386 4115386\n", - "48 General finding of soft tissue 248402002 1837 0.179957 4093991 4093991\n", - "49 Finding related to respiratory sounds 106051002 1624 0.159091 4021770 4021770\n", - "50 Wheezing 56018004 1624 0.159091 314754 314754\n", - "51 Dyspnea 267036007 1624 0.159091 312437 312437\n", - "52 Finding of sound of breathing 301285005 1624 0.159091 4115387 4115387\n", - "53 Musculoskeletal finding 106028002 1534 0.150274 135930 135930\n", - "54 Joint finding 118952005 1495 0.146454 77960 77960\n", - "55 Pharyngeal finding 116338005 1466 0.143613 4022571 4022571\n", - "56 Muscle finding 106030000 1445 0.141556 4024566 4024566\n", - "57 Finding of sensation of joint 298249004 1445 0.141556 4179167 4179167\n", - "58 Joint pain 57676002 1445 0.141556 77074 77074\n", - "59 Musculoskeletal pain 279069000 1445 0.141556 4150129 4150129\n", - "60 Finding of sensation of skeletal muscle 298287007 1445 0.141556 4184117 4184117\n", - "61 Muscle pain 68962001 1445 0.141556 442752 442752\n", - "62 Finding of neck region 298378000 1382 0.135384 4184252 4184252\n", - "63 Sore throat symptom 267102003 1376 0.134796 4147326 4147326\n", - "64 Finding of sensation of pharynx 300275004 1376 0.134796 4114487 4114487\n", - "65 Pain of digestive structure 301362007 1376 0.134796 4116809 4116809\n", - "66 Pain of head and neck region 301365009 1376 0.134796 4116810 4116810\n", - "67 Pain in throat 162397003 1376 0.134796 259153 259153\n", - "68 Pain of respiratory structure 301355003 1376 0.134796 4115406 4115406\n", - "69 Neck pain 81680005 1376 0.134796 24134 24134\n", - "70 Headache 25064002 1304 0.127743 378253 
378253\n", - "71 Disorder by body site 123946008 1195 0.117065 4047779 4047779\n", - "72 Disorder of body system 362965005 1173 0.114910 4180628 4180628\n", - "73 Chill 43724002 1079 0.105701 434490 434490\n", - "74 Shivering or rigors 248456009 1079 0.105701 4087630 4087630\n", - "75 Inflammatory disorder 128139000 987 0.096689 4027384 4027384\n", - "76 Inflammation of specific body structures or ti... 363170005 987 0.096689 4180169 4180169\n", - "77 Inflammation of specific body systems 363171009 987 0.096689 4178818 4178818\n", - "78 Disorder of respiratory system 50043002 891 0.087284 320136 320136\n", - "79 Inflammatory disorder of the respiratory tract 363180009 885 0.086697 4180170 4180170\n", - "80 Inflammatory disorder of the respiratory system 373405005 885 0.086697 4162282 4162282\n", - "81 Inflammation of specific body organs 363169009 853 0.083562 4181063 4181063\n", - "82 Gastrointestinal tract finding 386618008 768 0.075235 4304916 4304916\n", - "83 Functional finding of gastrointestinal tract 300358007 766 0.075039 4101343 4101343\n", - "84 Disorder of trunk 128121009 638 0.062500 4028071 4028071\n", - "85 Finding of trunk structure 302292003 638 0.062500 4117930 4117930\n", - "86 Disorder of thoracic segment of trunk 609622007 635 0.062206 43531056 43531056\n", - "87 Finding of upper trunk 609623002 635 0.062206 43531057 43531057\n", - "88 Disorder of thorax 118946009 635 0.062206 4043346 4043346\n", - "89 Finding of region of thorax 298705000 635 0.062206 4185503 4185503\n", - "90 Inflammatory disorder of lower respiratory tract 128997002 628 0.061520 4028876 4028876\n", - "91 Disorder of lower respiratory system 128272009 628 0.061520 4027553 4027553\n", - "92 Lower respiratory tract finding 301226008 628 0.061520 4115259 4115259\n", - "93 Metabolic disease 75934005 586 0.057406 436670 436670\n", - "94 Viscus structure finding 406123005 585 0.057308 4227253 4227253\n", - "95 Disorder of lung 19829001 582 0.057014 257907 257907\n", - "96 
Pneumonitis 205237003 582 0.057014 253506 253506\n", - "97 Pneumonia 233604007 582 0.057014 255848 255848\n", - "98 General clinical state finding 365860008 582 0.057014 432453 432453\n", - "99 Hypoxemia 389087006 582 0.057014 437390 437390\n", - "100 Lung finding 301230006 582 0.057014 4115260 4115260\n", - "101 Lung consolidation 95436008 582 0.057014 4318404 4318404\n", - "102 Disorder of blood gas 238157005 582 0.057014 4080012 4080012\n", - "103 Respiratory distress 271825005 582 0.057014 4158346 4158346\n", - "104 Distress 69328002 582 0.057014 4239819 4239819\n", - "105 Acute disease 2704003 555 0.054369 443883 443883\n", - "106 Finding of face 301310005 535 0.052410 4103352 4103352\n", - "107 Nasal airway finding 249342004 531 0.052018 4096565 4096565\n", - "108 Nose finding 118237005 531 0.052018 4042142 4042142\n", - "109 Nasal congestion 68235000 513 0.050255 4195085 4195085\n", - "110 Vomiting symptom 249497008 443 0.043397 4096715 4096715\n", - "111 Finding of vomiting 300359004 443 0.043397 4101344 4101344\n", - "112 Nausea 422587007 443 0.043397 31967 31967\n", - "113 Disorder of soft tissue 19660004 417 0.040850 376208 376208\n", - "114 Disorder of cardiovascular system 49601007 384 0.037618 134057 134057\n", - "115 Cardiovascular finding 106063007 384 0.037618 4023995 4023995\n", - "116 Soft tissue lesion 239953001 382 0.037422 4344497 4344497\n", - "117 Ear, nose and throat disorder 232208008 370 0.036246 4339468 4339468\n", - "118 Finding of defecation 300373008 350 0.034287 4113563 4113563\n", - "119 Altered bowel function 88111009 350 0.034287 4338120 4338120\n", - "120 Digestive symptom 308925008 350 0.034287 192731 192731\n", - "121 Finding of bowel action 366256008 350 0.034287 4182633 4182633\n", - "122 Diarrhea symptom 267060006 350 0.034287 4145808 4145808\n", - "123 Diarrhea 62315008 350 0.034287 196523 196523\n", - "124 Blood vessel finding 21829004 341 0.033405 4071689 4071689\n", - "125 Vascular disorder 27550009 341 0.033405 443784 
443784\n", - "126 Acute respiratory disease 111273006 332 0.032524 4006969 4006969\n", - "127 Acute disease of cardiovascular system 128487001 331 0.032426 4028367 4028367\n", - "128 Disorder of head 118934005 314 0.030760 4042836 4042836\n", - "129 Disorder of upper respiratory system 201060008 268 0.026254 254068 254068\n", - "130 Inflammatory disorder of head 363176004 268 0.026254 4181187 4181187\n", - "131 Inflammatory disorder of upper respiratory tract 129134004 268 0.026254 4043671 4043671\n", - "132 Respiratory tract infection 275498002 236 0.023119 4170143 4170143\n", - "133 Infection by site 301810000 236 0.023119 4200532 4200532\n", - "134 Upper respiratory infection 54150009 236 0.023119 4181583 4181583\n", - "135 Respiratory failure 409622000 205 0.020082 4256228 4256228\n", - "136 Acute respiratory failure 65710008 205 0.020082 319049 319049\n", - "137 Respiratory insufficiency 409623005 205 0.020082 318459 318459\n", - "138 Viral upper respiratory tract infection 281794004 203 0.019886 4085100 4085100\n", - "139 Viral respiratory infection 312133006 203 0.019886 4193169 4193169\n", - "140 Viral infection by site 312130009 203 0.019886 4207186 4207186\n", - "141 Thrombosis of blood vessel 439129009 169 0.016556 4208466 4208466\n", - "142 Thrombosis 439127006 169 0.016556 4231363 4231363\n", - "143 Deep venous thrombosis 128053003 169 0.016556 4133004 4133004\n", - "144 Venous finding 248727005 169 0.016556 4095634 4095634\n", - "145 Disorder of vein 90507008 169 0.016556 4234997 4234997\n", - "146 Venous thrombosis 111293003 169 0.016556 444247 444247\n", - "147 Acute deep venous thrombosis 132281000119108 169 0.016556 44782746 44782746\n", - "148 Trunk arterial embolus 312593004 162 0.015870 4194610 4194610\n", - "149 Disorder of blood vessels of thorax 373434004 162 0.015870 4190192 4190192\n", - "150 Arterial finding 248718009 162 0.015870 4095631 4095631\n", - "151 Pulmonary artery finding 251039005 162 0.015870 4108173 4108173\n", - "152 
Embolism 414086009 162 0.015870 4185607 4185607\n", - "153 Acute pulmonary embolism 706870000 162 0.015870 45768439 45768439\n", - "154 Disorder of artery 359557001 162 0.015870 321887 321887\n", - "155 Disorder of pulmonary circulation 39785005 162 0.015870 433208 433208\n", - "156 Arterial embolism 54687002 162 0.015870 312339 312339\n", - "157 Pulmonary embolism 59282003 162 0.015870 440417 440417\n", - "158 Disorder of immune function 414029004 158 0.015478 440371 440371\n", - "159 Disorder of nasal sinus 7393007 146 0.014303 256440 256440\n", - "160 Sinusitis 36971009 146 0.014303 4283893 4283893\n", - "161 Facial sinus finding 271745005 146 0.014303 4158326 4158326\n", - "162 Traumatic and/or non-traumatic injury of anato... 609411003 145 0.014205 43530877 43530877\n", - "163 Traumatic AND/OR non-traumatic injury 417163006 145 0.014205 432795 432795\n", - "164 Traumatic injury by site 609336008 141 0.013813 43530815 43530815\n", - "165 Traumatic injury 417746004 141 0.013813 440921 440921\n", - "166 Infective disorder of head 363166002 133 0.013029 4176944 4176944\n", - "167 Acute inflammatory disease 128482007 132 0.012931 4134294 4134294\n", - "168 Sepsis 91302008 131 0.012833 132797 132797\n", - "169 Sepsis caused by virus 770349000 131 0.012833 36674642 36674642\n", - "170 Organ dysfunction syndrome 238147009 131 0.012833 4080011 4080011\n", - "171 Viral sinusitis 444814009 126 0.012343 40481087 40481087\n", - "172 Disorder of digestive system 53619000 115 0.011266 4201745 4201745\n", - "173 Disorder of digestive tract 84410009 115 0.011266 4309188 4309188\n", - "174 Disorder of digestive organ 76712006 115 0.011266 4297887 4297887\n", - "175 Disorder of upper digestive tract 50410009 113 0.011070 4198525 4198525\n", - "176 Inflammatory disorder of digestive system 373407002 106 0.010384 4190185 4190185\n", - "177 Inflammatory disorder of digestive tract 128999004 106 0.010384 4043371 4043371\n", - "178 Disorder of ear 25906001 105 0.010286 378161 
378161\n", - "179 Middle ear finding 300162007 105 0.010286 4101079 4101079\n", - "180 Finding of limb structure 302293008 105 0.010286 138239 138239\n", - "181 Otitis media 65363002 105 0.010286 372328 372328\n", - "182 Disorder of auditory system 362966006 105 0.010286 4176644 4176644\n", - "183 Ear and auditory finding 118236001 105 0.010286 4042141 4042141\n", - "184 Disorder of extremity 128605003 105 0.010286 133468 133468\n", - "185 Ear finding 247234006 105 0.010286 4082416 4082416\n", - "186 Otitis 43275000 105 0.010286 4183452 4183452\n", - "187 Disorder of middle ear 68996008 105 0.010286 374364 374364\n", - "188 Infective pharyngitis 312422001 104 0.010188 4193318 4193318\n", - "189 Disorder of pharynx 75860007 104 0.010188 31057 31057\n", - "190 Infection of digestive system 312158001 104 0.010188 4193990 4193990\n", - "191 Infectious disease of digestive tract 128398001 104 0.010188 4134887 4134887\n", - "192 Pharyngitis 405737000 104 0.010188 4226263 4226263\n", - "193 Disorder of musculoskeletal system 928000 102 0.009992 4244662 4244662\n", - "194 Disorder of skeletal system 88230002 101 0.009894 4339410 4339410\n", - "195 Injury of musculoskeletal system 105606008 99 0.009698 4022201 4022201\n", - "196 Acute respiratory infections 195647007 83 0.008131 4112341 4112341\n", - "197 Acute upper respiratory infection 54398005 83 0.008131 257011 257011\n", - "198 Acute infectious disease 63171007 83 0.008131 4271450 4271450\n", - "199 Skin AND/OR mucosa finding 415531008 82 0.008033 4212577 4212577\n", - "200 Finding related to pregnancy 118185001 78 0.007641 444094 444094\n", - "201 Pregnancy, childbirth and puerperium finding 248982007 78 0.007641 4088927 4088927\n", - "202 Acute viral pharyngitis 195662009 77 0.007543 4112343 4112343\n", - "203 Acute pharyngitis 363746003 77 0.007543 25297 25297\n", - "204 Acute viral disease 409631000 77 0.007543 4252853 4252853\n", - "205 Viral infection of the digestive tract 312131008 77 0.007543 4193875 
4193875\n", - "206 Acute digestive system disorder 127321000 77 0.007543 4132552 4132552\n", - "207 Viral pharyngitis 1532007 77 0.007543 4035987 4035987\n", - "208 Normal pregnancy 72892002 76 0.007445 4217975 4217975\n", - "209 Pregnant 77386006 76 0.007445 4299535 4299535\n", - "210 Disorder of joint region 785875003 73 0.007151 37206233 37206233\n", - "211 Mucosal finding 128145008 72 0.007053 4028076 4028076\n", - "212 Traumatic injury due to event 419945001 69 0.006759 439215 439215\n", - "213 Injury by mechanism 282745002 64 0.006270 4154161 4154161\n", - "214 Bleeding 131148009 62 0.006074 437312 437312\n", - "215 Hemoptysis 66857006 62 0.006074 261687 261687\n", - "216 Arthropathy 399269003 58 0.005682 73553 73553\n", - "217 Disorder of lower extremity 118937003 57 0.005584 193460 193460\n", - "218 Finding of lower limb 116312005 57 0.005584 4022922 4022922\n", - "219 Injury of lower extremity 127279002 55 0.005388 4130852 4130852\n", - "220 Traumatic arthropathy 58188004 54 0.005290 74124 74124\n", - "221 Disorder of free lower limb 700012005 53 0.005192 44782620 44782620\n", - "222 Eye / vision finding 118235002 53 0.005192 4038502 4038502\n", - "223 Ocular surface finding 246869006 53 0.005192 4087936 4087936\n", - "224 Conjunctival finding 246875002 53 0.005192 4080857 4080857\n", - "225 Anterior segment finding 418727003 53 0.005192 4303380 4303380\n", - "226 Passive conjunctival congestion 246677007 53 0.005192 4080695 4080695\n", - "227 Orbit finding 246912006 53 0.005192 4087949 4087949\n", - "228 Globe finding 246915008 53 0.005192 4080992 4080992\n", - "229 Bone finding 118953000 52 0.005094 4042505 4042505\n", - "230 Fracture of bone 125605004 52 0.005094 75053 75053\n", - "231 Soft tissue injury 282026002 52 0.005094 4083964 4083964\n", - "232 Bone injury 284003005 52 0.005094 4154739 4154739\n", - "233 Disorder of bone 76069003 52 0.005094 75909 75909\n", - "234 Disorder of connective tissue 105969002 51 0.004996 253549 253549\n", - "235 
Injury of free lower limb 700010002 51 0.004996 44784105 44784105\n", - "236 Musculoskeletal and connective tissue disorder 312225001 50 0.004898 4208786 4208786\n", - "237 Tracheobronchial disorder 233776003 49 0.004800 252662 252662\n", - "238 Bronchitis 32398004 49 0.004800 256451 256451\n", - "239 Acute bronchitis 10509002 49 0.004800 260139 260139\n", - "240 Lesion of joint 298149009 49 0.004800 4179141 4179141\n", - "241 Bronchial finding 301229001 49 0.004800 4116777 4116777\n", - "242 Disorder of bronchus 41427001 49 0.004800 260131 260131\n", - "243 Finding of upper limb 116307009 48 0.004702 4020346 4020346\n", - "244 Disorder of upper extremity 118947000 48 0.004702 4042503 4042503\n", - "245 Disorder of ligament 60492000 48 0.004702 442628 442628\n", - "246 Ligament finding 250132005 48 0.004702 4094284 4094284\n", - "247 Injury of connective tissue 385424001 47 0.004604 4300157 4300157\n", - "248 Joint injury 125610000 47 0.004604 4054054 4054054\n", - "249 Injury of upper extremity 127278005 47 0.004604 4130851 4130851\n", - "250 Ligament injury 263126002 47 0.004604 4136694 4136694\n", - "251 Cardiovascular measurement - finding 366157005 45 0.004408 4277352 4277352\n", - "252 Hypertensive disorder 38341003 45 0.004408 316866 316866\n", - "253 Sprain of ligament 398878007 45 0.004408 4160875 4160875\n", - "254 Sprain of joint 105611005 45 0.004408 4023316 4023316\n", - "255 Finding of ankle or foot 419518009 44 0.004310 4305027 4305027\n", - "256 Essential hypertension 59621000 42 0.004114 320128 320128\n", - "257 Bacterial infectious disease 87628006 42 0.004114 432545 432545\n", - "258 Finding of ankle region 116315007 41 0.004016 4023577 4023577\n", - "259 Injury of ankle 125603006 41 0.004016 77162 77162\n", - "260 Disorder of ankle 128138008 41 0.004016 78831 78831\n", - "261 Bacterial infection by site 301811001 34 0.003331 4200533 4200533\n", - "262 Disorder of the central nervous system 23853001 34 0.003331 376106 376106\n", - "263 Bacterial 
respiratory infection 312117008 34 0.003331 4207184 4207184\n", - "264 Bacterial upper respiratory infection 312118003 34 0.003331 4207185 4207185\n", - "265 Disorder of nervous system 118940003 34 0.003331 376337 376337\n", - "266 Central nervous system finding 246556002 34 0.003331 4086181 4086181\n", - "267 Finding of brain 299718000 33 0.003233 4101796 4101796\n", - "268 Disorder of brain 81308009 33 0.003233 372887 372887\n", - "269 Hypersensitivity condition 473010000 32 0.003135 43021226 43021226\n", - "270 Disorder of ankle joint 428776005 31 0.003037 443583 443583\n", - "271 Disorder of joint of ankle and/or foot 442246002 31 0.003037 40482662 40482662\n", - "272 Traumatic arthropathy of the ankle and/or foot 201938008 31 0.003037 75620 75620\n", - "273 Sprain of ankle and/or foot 209529003 31 0.003037 4016673 4016673\n", - "274 Traumatic arthropathy of lower extremity 373575008 31 0.003037 4189458 4189458\n", - "275 Sprain of ligament of lower limb 281599007 31 0.003037 4105866 4105866\n", - "276 Traumatic arthropathy-ankle 201954006 31 0.003037 4114605 4114605\n", - "277 Lesion of ligaments of the ankle region 240019006 31 0.003037 4344271 4344271\n", - "278 Ankle joint finding 299413005 31 0.003037 443357 443357\n", - "279 Sprain of ankle 44465007 31 0.003037 81151 81151\n", - "280 Fracture of upper limb 23406007 30 0.002939 4050747 4050747\n", - "281 Finding of bone of upper limb 298756009 30 0.002939 4186164 4186164\n", - "282 Bacterial infection of the digestive tract 312129004 27 0.002645 4193874 4193874\n", - "283 Allergic condition 473011001 27 0.002645 43021227 43021227\n", - "284 Streptococcal infectious disease 85769006 27 0.002645 437779 437779\n", - "285 Disease due to Gram-positive bacteria 371582002 27 0.002645 4161193 4161193\n", - "286 Chronic disease 27624003 27 0.002645 443783 443783\n", - "287 Disease due to Gram-positive coccus 408637006 27 0.002645 4248801 4248801\n", - "288 Streptococcal sore throat 43878008 27 0.002645 28060 
28060\n", - "289 Head and neck injury 282749008 25 0.002449 4154162 4154162\n", - "290 Disorder of face 118930001 24 0.002351 4042835 4042835\n", - "291 Traumatic arthropathy-wrist 201946009 23 0.002253 4116594 4116594\n", - "292 Traumatic arthropathy of upper extremity 373574007 23 0.002253 4162433 4162433\n", - "293 Disorder of wrist 128130001 23 0.002253 4028074 4028074\n", - "294 Finding of wrist region 116310002 23 0.002253 4020347 4020347\n", - "295 Allergic disorder 781474001 19 0.001861 36683564 36683564\n", - "296 Disorder of soft tissue of head 280131007 19 0.001861 4090614 4090614\n", - "297 Allergic rhinitis 61582004 19 0.001861 257007 257007\n", - "298 IgE-mediated allergic disorder 422076005 19 0.001861 4223759 4223759\n", - "299 Disorder of mucous membrane 95351003 19 0.001861 4318379 4318379\n", - "300 Inflammatory disease of mucous membrane 95361005 19 0.001861 432661 432661\n", - "301 Disorder of nose and nasopharynx 232339008 19 0.001861 4049222 4049222\n", - "302 Nasal mucosa finding 249353005 19 0.001861 442983 442983\n", - "303 Immune hypersensitivity disorder by mechanism 427439005 19 0.001861 4141833 4141833\n", - "304 Atopic IgE-mediated allergic disorder 421871004 19 0.001861 4223595 4223595\n", - "305 Rhinitis 70076002 19 0.001861 4320791 4320791\n", - "306 Injury of head 82271004 19 0.001861 375415 375415\n", - "307 Disorder of the nose 89488007 19 0.001861 4229909 4229909\n", - "308 Evaluation finding 441742003 18 0.001763 40480457 40480457\n", - "309 Chronic disease of respiratory system 17097001 16 0.001567 4063381 4063381\n", - "310 Wound finding 225552003 16 0.001567 4021667 4021667\n", - "311 Laceration - injury 312608009 16 0.001567 443419 443419\n", - "312 Wound 416462003 16 0.001567 4168335 4168335\n", - "313 Perennial allergic rhinitis 446096008 16 0.001567 40486433 40486433\n", - "314 Open wound 125643001 16 0.001567 444187 444187\n", - "315 Disorder of soft tissue of limb 280134004 16 0.001567 4090615 4090615\n", - "316 
Chronic pain 82423001 16 0.001567 436096 436096\n", - "317 Disorder of soft tissue of upper limb 280135003 15 0.001469 4090616 4090616\n", - "318 Neurological lesion 299735001 15 0.001469 4103662 4103662\n", - "319 Concussion injury of body structure 708540005 14 0.001371 45769811 45769811\n", - "320 Sprain of wrist and/or hand 209436000 14 0.001371 4018956 4018956\n", - "321 Complication 116223007 14 0.001371 433128 433128\n", - "322 Sprain of upper extremity 123536004 14 0.001371 4048512 4048512\n", - "323 Injury of wrist 125598003 14 0.001371 444129 444129\n", - "324 Traumatic AND/OR non-traumatic brain injury 127294003 14 0.001371 4133611 4133611\n", - "325 Traumatic brain injury 127295002 14 0.001371 4132546 4132546\n", - "326 Intracranial injury 127296001 14 0.001371 437409 437409\n", - "327 Injury of central nervous system 128126004 14 0.001371 4134439 4134439\n", - "328 Finding of wrist joint 298940007 14 0.001371 4181251 4181251\n", - "329 Lesion of brain 301766008 14 0.001371 4200516 4200516\n", - "330 Fracture of lower limb 46866001 14 0.001371 4187096 4187096\n", - "331 Sprain of wrist 70704007 14 0.001371 78272 78272\n", - "332 Disorder of wrist joint 428107009 14 0.001371 4323193 4323193\n", - "333 Concussion injury of brain 110030002 14 0.001371 4001336 4001336\n", - "334 Injury of nervous system 128239009 14 0.001371 4134134 4134134\n", - "335 Abnormal blood cell count 762656009 13 0.001274 42538830 42538830\n", - "336 RBC count low 165423001 13 0.001274 4013842 4013842\n", - "337 RBC count abnormal 165427000 13 0.001274 4013518 4013518\n", - "338 Hematopoietic system finding 106200001 13 0.001274 4021915 4021915\n", - "339 Measurement finding 118245000 13 0.001274 4041436 4041436\n", - "340 Chronic inflammatory disorder 128294001 13 0.001274 444208 444208\n", - "341 Chronic sinusitis 40055000 13 0.001274 257012 257012\n", - "342 Disorder of cellular component of blood 414022008 13 0.001274 443723 443723\n", - "343 Anemia 271737000 13 0.001274 
439777 439777\n", - "344 Hemoglobin level outside reference range 441793007 13 0.001274 40480513 40480513\n", - "345 Measurement finding outside reference range 442096005 13 0.001274 40481841 40481841\n", - "346 Measurement finding below reference range 442686002 13 0.001274 40484533 40484533\n", - "347 Hemoglobin low 165397008 13 0.001274 4013074 4013074\n", - "348 Protein level - finding 365799007 13 0.001274 4276572 4276572\n", - "349 Finding of substance level 785671009 13 0.001274 37203927 37203927\n", - "350 Cytopenia 50820005 13 0.001274 4179922 4179922\n", - "351 Developmental disorder 5294002 13 0.001274 435244 435244\n", - "352 Erythropenia 62574001 13 0.001274 4267432 4267432\n", - "353 Clavicle injury 282760004 12 0.001176 4151199 4151199\n", - "354 Finding of clavicle structure 298761006 12 0.001176 4185643 4185643\n", - "355 Lesion of clavicle 298766001 12 0.001176 4186167 4186167\n", - "356 Fracture of clavicle 58150001 12 0.001176 4237458 4237458\n", - "357 Fracture of shoulder 16250001000004107 12 0.001176 46270317 46270317\n", - "358 Traumatic brain injury with no loss of conscio... 
127302008 12 0.001176 4133715 4133715\n", - "359 Concussion with no loss of consciousness 62106007 12 0.001176 378001 378001\n", - "360 Open wound of limb 105616000 11 0.001078 4023317 4023317\n", - "361 Chronic nervous system disorder 128283000 11 0.001078 4134145 4134145\n", - "362 Skin finding 106076001 10 0.000980 141960 141960\n", - "363 Vascular headache 128187005 10 0.000980 4134454 4134454\n", - "364 Disorder of integument 128598002 10 0.000980 4028387 4028387\n", - "365 Fracture of ankle 16114001 10 0.000980 4059173 4059173\n", - "366 Pain of cardiovascular structure 301358001 10 0.000980 4115408 4115408\n", - "367 Chronic headache disorder 431237007 10 0.000980 374639 374639\n", - "368 Chronic brain syndrome 78689005 10 0.000980 4301371 4301371\n", - "369 Disorder of skin 95320005 10 0.000980 4317258 4317258\n", - "370 Chronic intractable migraine without aura 124171000119105 10 0.000980 43530652 43530652\n", - "371 Headache disorder 230461009 10 0.000980 375527 375527\n", - "372 Disorder characterized by pain 373673007 10 0.000980 4160062 4160062\n", - "373 Migraine 37796009 10 0.000980 318736 318736\n", - "374 Integumentary system finding 106077005 10 0.000980 444112 444112\n", - "375 Injury of forearm 125597008 10 0.000980 134222 134222\n", - "376 Disorder of forearm 128132009 10 0.000980 136779 136779\n", - "377 Chronic disease of cardiovascular system 128292002 10 0.000980 4028244 4028244\n", - "378 Refractory migraine without aura 423279000 10 0.000980 443616 443616\n", - "379 Refractory migraine 423894005 10 0.000980 443615 443615\n", - "380 Transformed migraine 427419006 10 0.000980 4141827 4141827\n", - "381 Migraine without aura 56097005 10 0.000980 378735 378735\n", - "382 Disorder of skin and/or subcutaneous tissue 80659006 10 0.000980 200174 200174\n", - "383 Impacted molars 196416002 9 0.000882 4055754 4055754\n", - "384 Fracture at wrist and/or hand level 208388003 9 0.000882 4015350 4015350\n", - "385 Impacted tooth 235104008 9 0.000882 
4123726 4123726\n", - "386 Disorder of tooth development 371136004 9 0.000882 4159157 4159157\n", - "387 Disorder of jaw 37156001 9 0.000882 435569 435569\n", - "388 Disorder of teeth AND/OR supporting structures 105995000 9 0.000882 201603 201603\n", - "389 Dislocation of joint 108367008 9 0.000882 74726 74726\n", - "390 Disease of mouth 118938008 9 0.000882 4042502 4042502\n", - "391 Traumatic dislocation of joint of wrist 125618007 9 0.000882 4054058 4054058\n", - "392 Seizure disorder 128613002 9 0.000882 4029498 4029498\n", - "393 Traumatic dislocation of joint 129156001 9 0.000882 4043679 4043679\n", - "394 Subluxation of joint of upper limb 263047001 9 0.000882 4135090 4135090\n", - "395 Fracture dislocation of joint 263063009 9 0.000882 4134184 4134184\n", - "396 Fracture subluxation of joint 263094009 9 0.000882 4136573 4136573\n", - "397 Anomaly of tooth position 81256000 9 0.000882 433243 433243\n", - "398 Seizure 91175000 9 0.000882 377091 377091\n", - "399 Asthma 195967001 9 0.000882 317009 317009\n", - "400 Childhood asthma 233678006 9 0.000882 4051466 4051466\n", - "401 Tooth disorder 234947003 9 0.000882 4122115 4122115\n", - "402 Seizure related finding 313287004 9 0.000882 4196708 4196708\n", - "403 Subluxation of wrist 833334002 9 0.000882 3654437 3654437\n", - "404 Dislocation of wrist 833335001 9 0.000882 3654438 3654438\n", - "405 Dislocation of joint of upper limb 263017003 9 0.000882 75047 75047\n", - "406 Subluxation of joint 263031003 9 0.000882 4134174 4134174\n", - "407 Fracture dislocation of joint of upper limb 263073006 9 0.000882 4135097 4135097\n", - "408 Fracture subluxation of wrist 263102004 9 0.000882 4134304 4134304\n", - "409 Tooth finding 278544002 9 0.000882 4132462 4132462\n", - "410 Fracture subluxation of joint of upper limb 281519006 9 0.000882 4085546 4085546\n", - "411 Fracture of forearm 65966004 9 0.000882 4278672 4278672\n", - "412 Acute allergic reaction 241929008 8 0.000784 4084167 4084167\n", - "413 Open wound of 
lower limb 26947005 8 0.000784 4097962 4097962\n", - "414 Laceration of lower limb 283357002 8 0.000784 4152960 4152960\n", - "415 Chest injury 262525000 8 0.000784 4094683 4094683\n", - "416 Adverse reaction 281647001 8 0.000784 4105886 4105886\n", - "417 Allergic reaction 419076005 8 0.000784 40589905 40589905\n", - "418 Injury of trunk 48125009 8 0.000784 194526 194526\n", - "419 Hypersensitivity reaction 421961002 8 0.000784 4223616 4223616\n", - "420 Fracture of rib 33737001 7 0.000686 4142905 4142905\n", - "421 Acute sinusitis 15805002 7 0.000686 260123 260123\n", - "422 Injury of ribs 282770002 7 0.000686 4151202 4151202\n", - "423 Bacterial sinusitis 703470001 7 0.000686 45766333 45766333\n", - "424 Disorder of body wall 399986003 7 0.000686 4266188 4266188\n", - "425 Fracture of bones of trunk 65354004 7 0.000686 4279139 4279139\n", - "426 Injury of chest wall 65978000 7 0.000686 75128 75128\n", - "427 Acute bacterial sinusitis 75498004 7 0.000686 4294548 4294548\n", - "428 Complication of pregnancy, childbirth and/or t... 198609003 6 0.000588 435875 435875\n", - "429 Perennial allergic rhinitis with seasonal vari... 232353008 6 0.000588 4048171 4048171\n", - "430 Disorder of neck 118939000 6 0.000588 4042837 4042837\n", - "431 Cardiac finding 301095005 6 0.000588 4103183 4103183\n", - "432 Mediastinal finding 301296002 6 0.000588 4115390 4115390\n", - "433 Whiplash injury to neck 39848009 6 0.000588 4218389 4218389\n", - "434 Disorder of mediastinum 49483002 6 0.000588 440142 440142\n", - "435 Heart disease 56265001 6 0.000588 321588 321588\n", - "436 Injury of neck 90460009 6 0.000588 24818 24818\n", - "437 Lesion of neck 298397000 6 0.000588 4185207 4185207\n", - "438 Inflammatory dermatosis 703938007 5 0.000490 45766714 45766714\n", - "439 Lesion of skin and/or skin-associated mucous m... 714974000 5 0.000490 37018424 37018424\n", - "440 Disease of circulatory system complicating pre... 
724497009 5 0.000490 37110290 37110290\n", - "441 Open wound of face 210339009 5 0.000490 4049957 4049957\n", - "442 Atopic dermatitis 24079001 5 0.000490 133834 133834\n", - "443 Disorder of hemostatic system 362970003 5 0.000490 4179872 4179872\n", - "444 Facial laceration 370247008 5 0.000490 4156265 4156265\n", - "445 Open wound of head AND/OR neck 397180001 5 0.000490 4246695 4246695\n", - "446 Disorder of cardiac function 105981003 5 0.000490 4024552 4024552\n", - "447 Genetic finding 106221001 5 0.000490 4025367 4025367\n", - "448 Atopy 115665000 5 0.000490 4019380 4019380\n", - "449 Injury of integument 125592002 5 0.000490 4053826 4053826\n", - "450 Open wound of thigh 125659001 5 0.000490 4053602 4053602\n", - "451 Burn 125666000 5 0.000490 442013 442013\n", - "452 Injury of face 125593007 5 0.000490 444191 444191\n", - "453 Burn of skin 284196006 5 0.000490 4108467 4108467\n", - "454 Laceration of thigh 283385000 5 0.000490 4152936 4152936\n", - "455 Finding of thigh 419003001 5 0.000490 4169466 4169466\n", - "456 Genetic predisposition 47708004 5 0.000490 4166231 4166231\n", - "457 Injury of thigh 7523003 5 0.000490 442564 442564\n", - "458 Heart failure 84114007 5 0.000490 316139 316139\n", - "459 Laceration of head 428088000 5 0.000490 4179823 4179823\n", - "460 Hypersensitivity disposition 609433001 5 0.000490 43530897 43530897\n", - "461 Lesion of face 767811005 5 0.000490 35624868 35624868\n", - "462 Cutaneous hypersensitivity 21626009 5 0.000490 4070025 4070025\n", - "463 Acquired coagulation disorder 234466008 5 0.000490 4120613 4120613\n", - "464 Inflammation of skin and/or subcutaneous tissue 363168001 5 0.000490 4181062 4181062\n", - "465 Open wound of head 38354005 5 0.000490 4243161 4243161\n", - "466 Disorder of thigh 128135006 5 0.000490 444211 444211\n", - "467 Skin or mucosa lesion 247440002 5 0.000490 4083787 4083787\n", - "468 Laceration of head and neck 283358007 5 0.000490 4155030 4155030\n", - "469 Propensity to adverse reaction 
420134006 5 0.000490 4172024 4172024\n", - "470 Eczema 43116000 5 0.000490 133835 133835\n", - "471 Blood coagulation disorder 64779008 5 0.000490 432585 432585\n", - "472 Skin lesion 95324001 5 0.000490 4316083 4316083\n", - "473 Finding of abdomen 609624008 4 0.000392 43531058 43531058\n", - "474 Disorder of pelvic girdle 700011003 4 0.000392 44784106 44784106\n", - "475 Disorder of glucose regulation 237597000 4 0.000392 4130161 4130161\n", - "476 Disorder of abdomen 118948005 4 0.000392 444089 444089\n", - "477 Disorder of hip 118935006 4 0.000392 4042501 4042501\n", - "478 Disorder of knee 128136007 4 0.000392 4134443 4134443\n", - "479 Closed fracture of lower limb 52603002 4 0.000392 4199590 4199590\n", - "480 Finding of abdominopelvic segment of trunk 822987005 4 0.000392 37311678 37311678\n", - "481 Disorder of abdominopelvic segment of trunk 822988000 4 0.000392 37311677 37311677\n", - "482 Impaired glucose tolerance 9414007 4 0.000392 4311629 4311629\n", - "483 Arthropathy of knee joint 428724006 4 0.000392 4324765 4324765\n", - "484 Injury of pelvic girdle 700009007 4 0.000392 44782619 44782619\n", - "485 Disorder of pregnancy 173300003 4 0.000392 439658 439658\n", - "486 Miscarriage 17369002 4 0.000392 4067106 4067106\n", - "487 Miscarriage in first trimester 19169002 4 0.000392 4078393 4078393\n", - "488 Disorder of carbohydrate metabolism 20957000 4 0.000392 437515 437515\n", - "489 Closed fracture of hip 359817006 4 0.000392 4230399 4230399\n", - "490 Disorder of endocrine system 362969004 4 0.000392 31821 31821\n", - "491 Pregnancy with abortive outcome 363681007 4 0.000392 40539858 40539858\n", - "492 Finding of hip region 116313000 4 0.000392 444220 444220\n", - "493 Finding of knee region 116314006 4 0.000392 4022923 4022923\n", - "494 Injury of hip region 125600009 4 0.000392 193666 193666\n", - "495 Abdominal organ finding 249561001 4 0.000392 4096864 4096864\n", - "496 Knee joint finding 299321000 4 0.000392 4100932 4100932\n", - "497 
Epidermal burn of skin 403190006 4 0.000392 4296204 4296204\n", - "498 Closed fracture 423125000 4 0.000392 4307254 4307254\n", - "499 Fracture of bone of hip region 700097003 4 0.000392 45763653 45763653\n", - "500 Child attention deficit disorder 192127007 3 0.000294 440086 440086\n", - "501 Disorders of attention and motor control 229712006 3 0.000294 4047120 4047120\n", - "502 Finding of foot region 116316008 3 0.000294 4022924 4022924\n", - "503 Disorder of foot 118932009 3 0.000294 444090 444090\n", - "504 Injury of foot 125604000 3 0.000294 444130 444130\n", - "505 Finding of functional performance and activity 248536006 3 0.000294 4089214 4089214\n", - "506 Attention deficit hyperactivity disorder 406506008 3 0.000294 438409 438409\n", - "507 Chronic disease of immune function 413834006 3 0.000294 4188970 4188970\n", - "508 Recurrent disease 58184002 3 0.000294 440059 440059\n", - "509 Mental disorder 74732009 3 0.000294 432586 432586\n", - "510 Seasonal allergic rhinitis 367498001 3 0.000294 4280726 4280726\n", - "511 Hypertension AND/OR vomiting complicating preg... 
106005003 3 0.000294 4024560 4024560\n", - "512 Functional finding 118228005 3 0.000294 4041284 4041284\n", - "513 Open wound of foot 125663008 3 0.000294 4054067 4054067\n", - "514 Developmental mental disorder 129104009 3 0.000294 4043545 4043545\n", - "515 Developmental disorder of motor function 268674003 3 0.000294 4148091 4148091\n", - "516 Laceration of upper limb 283366003 3 0.000294 4152932 4152932\n", - "517 Laceration of foot 284551006 3 0.000294 4109685 4109685\n", - "518 Pre-eclampsia 398254007 3 0.000294 439393 439393\n", - "519 Pregnancy-induced hypertension 48194001 3 0.000294 4167493 4167493\n", - "520 Neurodevelopmental disorder 700364009 3 0.000294 45771096 45771096\n", - "521 Open wound of upper limb 81405006 3 0.000294 4216185 4216185\n", - "522 Disorder of pelvic region of trunk 609619005 2 0.000196 43531053 43531053\n", - "523 Disorder of pelvis 609620004 2 0.000196 43531054 43531054\n", - "524 Finding of pelvic region of trunk 609625009 2 0.000196 43531059 43531059\n", - "525 Rupture of intestine 235799001 2 0.000196 4340361 4340361\n", - "526 Cystitis 38822007 2 0.000196 195588 195588\n", - "527 Urinary system finding 106098005 2 0.000196 4024000 4024000\n", - "528 Lower urinary tract finding 106100005 2 0.000196 4021780 4021780\n", - "529 Finding of hand region 116311003 2 0.000196 77358 77358\n", - "530 Disorder of gastrointestinal tract 119292006 2 0.000196 4000610 4000610\n", - "531 Disorder of large intestine 119523007 2 0.000196 4002905 4002905\n", - "532 Injury of hand 125599006 2 0.000196 80004 80004\n", - "533 Finding of urinary tract proper 249273002 2 0.000196 4091213 4091213\n", - "534 Bladder finding 249585009 2 0.000196 4092881 4092881\n", - "535 Tendon finding 250133000 2 0.000196 4095203 4095203\n", - "536 Laceration of hand 284549007 2 0.000196 4113008 4113008\n", - "537 Disorder of urinary tract 41368006 2 0.000196 197331 197331\n", - "538 Inflammation of large intestine 302168000 2 0.000196 4201402 4201402\n", - "539 
Disorder of bladder 42643001 2 0.000196 201337 201337\n", - "540 Disorder of tendon 68172002 2 0.000196 442264 442264\n", - "541 Appendicitis 74400008 2 0.000196 440448 440448\n", - "542 Epilepsy 84757009 2 0.000196 380378 380378\n", - "543 Disorder of intestine 85919009 2 0.000196 201618 201618\n", - "544 Drug-related disorder 87858002 2 0.000196 444363 444363\n", - "545 Injury of ligament of knee 438479005 2 0.000196 4231941 4231941\n", - "546 Complication occurring during pregnancy 609496007 2 0.000196 43530950 43530950\n", - "547 Finding of pelvis 609626005 2 0.000196 43531060 43531060\n", - "548 Pelvic organ finding 700006000 2 0.000196 44784102 44784102\n", - "549 Disorder of appendix 18526009 2 0.000196 433524 433524\n", - "550 Eclampsia in pregnancy 198992004 2 0.000196 137613 137613\n", - "551 Inflammatory disorder of genitourinary system 373406006 2 0.000196 4159963 4159963\n", - "552 Urogenital finding 118238000 2 0.000196 4041285 4041285\n", - "553 Finding of large intestine 118436003 2 0.000196 4038678 4038678\n", - "554 Disorder of hand 118933004 2 0.000196 77635 77635\n", - "555 Injury of knee 125601008 2 0.000196 444132 444132\n", - "556 Open wound of hand 125652005 2 0.000196 4129405 4129405\n", - "557 Disorder of the urinary system 128606002 2 0.000196 75865 75865\n", - "558 Eclampsia 15938005 2 0.000196 443700 443700\n", - "559 Bowel finding 249562008 2 0.000196 4091532 4091532\n", - "560 Finding of appendix 300307005 2 0.000196 4113552 4113552\n", - "561 Disorder of the genitourinary system 42030000 2 0.000196 4171379 4171379\n", - "562 Rupture of appendix 47693006 2 0.000196 4166224 4166224\n", - "563 Drug overdose 55680006 2 0.000196 4208104 4208104\n", - "564 Disorder of the lower urinary tract 7793005 2 0.000196 4301471 4301471\n", - "565 Disorder of lower gastrointestinal tract 79787007 2 0.000196 4197094 4197094\n", - "566 Central nervous system complication 87536007 2 0.000196 373087 373087\n", - "567 Edema 267038008 1 0.000098 433595 
433595\n", - "568 Injury of cruciate ligament of knee 444158007 1 0.000098 40485073 40485073\n", - "569 Injury of anterior cruciate ligament 444470001 1 0.000098 40479768 40479768\n", - "570 Disorder of vertebral column 699699005 1 0.000098 44782549 44782549\n", - "571 Traumatic and/or non-traumatic injury of back 712893003 1 0.000098 37016775 37016775\n", - "572 Injury of intrathoracic organ 733217006 1 0.000098 37116489 37116489\n", - "573 Interstitial lung disease 233703007 1 0.000098 4119786 4119786\n", - "574 Injury of tendon of the rotator cuff of shoulder 307731004 1 0.000098 4146173 4146173\n", - "575 Rupture of patellar tendon 30832001 1 0.000098 4149245 4149245\n", - "576 Disorder of back 33308003 1 0.000098 140190 140190\n", - "577 Blighted ovum 35999006 1 0.000098 4262136 4262136\n", - "578 Paralysis due to lesion of spinal cord 372310001 1 0.000098 4157607 4157607\n", - "579 Disorder of shoulder 118944007 1 0.000098 77630 77630\n", - "580 Vertebral column finding 119414006 1 0.000098 4002898 4002898\n", - "581 Disorder characterized by edema 118654009 1 0.000098 4040388 4040388\n", - "582 Open wound of forearm 125649002 1 0.000098 4053599 4053599\n", - "583 Traumatic brain injury with loss of consciousness 127298000 1 0.000098 4132082 4132082\n", - "584 Structural disorder of heart 128599005 1 0.000098 4027255 4027255\n", - "585 Ligament rupture 263134008 1 0.000098 4138286 4138286\n", - "586 Internal injury of chest 27817002 1 0.000098 74786 74786\n", - "587 Finding of structures of conception 289262005 1 0.000098 4128846 4128846\n", - "588 Finding of vertebra 298385001 1 0.000098 4185206 4185206\n", - "589 Finding of spinal cord 299733008 1 0.000098 4103661 4103661\n", - "590 Edema of trunk 301867009 1 0.000098 4199409 4199409\n", - "591 Abnormal products of conception 39804004 1 0.000098 436477 436477\n", - "592 Partial thickness burn 403191005 1 0.000098 4296205 4296205\n", - "593 Disorder of spinal region 410730009 1 0.000098 4260918 4260918\n", - 
"594 Traumatic or non-traumatic rupture of tendon 415746003 1 0.000098 4215217 4215217\n", - "595 Fracture of vertebral column 50448004 1 0.000098 4174520 4174520\n", - "596 Cartilage disorder 50927007 1 0.000098 4178431 4178431\n", - "597 Rupture of quadriceps tendon 6849006 1 0.000098 195632 195632\n", - "598 Disorder of tendon of shoulder region 76318008 1 0.000098 79116 79116\n", - "599 Injury of medial collateral ligament of knee 444448004 1 0.000098 40479422 40479422\n", - "600 Injury of collateral ligament of knee 444159004 1 0.000098 40485074 40485074\n", - "601 Chronic paralysis due to lesion of spinal cord 698754002 1 0.000098 44782520 44782520\n", - "602 Injury of rotator cuff 718539004 1 0.000098 36713625 36713625\n", - "603 Traumatic injury of vertebral region of back 737566006 1 0.000098 42537893 42537893\n", - "604 Fracture of vertebral column with spinal cord ... 1734006 1 0.000098 4066995 4066995\n", - "605 Pulmonary edema 19242006 1 0.000098 4078925 4078925\n", - "606 Tear of meniscus of knee 239720000 1 0.000098 4035415 4035415\n", - "607 Soft tissue lesion of knee region 239999004 1 0.000098 4344027 4344027\n", - "608 Connective tissue disorder by body site 363044007 1 0.000098 4180645 4180645\n", - "609 Injury of internal organ 105612003 1 0.000098 193631 193631\n", - "610 Finding of shoulder region 116308004 1 0.000098 4022449 4022449\n", - "611 Cartilage finding 118954006 1 0.000098 4043349 4043349\n", - "612 Intracranial injury with loss of consciousness 127297005 1 0.000098 437385 437385\n", - "613 Disorder of product of conception 128604004 1 0.000098 4029496 4029496\n", - "614 Complete miscarriage 156073000 1 0.000098 40318618 40318618\n", - "615 Spinal injury 262521009 1 0.000098 4095850 4095850\n", - "616 Rupture of ligament of knee joint 263139003 1 0.000098 4134312 4134312\n", - "617 Disorder of soft tissue of lower limb 280136002 1 0.000098 4093228 4093228\n", - "618 Rupture of tendon of lower limb 281549008 1 0.000098 4084434 
4084434\n", - "619 Cardiovascular injury 282728007 1 0.000098 4152156 4152156\n", - "620 Laceration of forearm 283371005 1 0.000098 4155034 4155034\n", - "621 Paralytic syndrome 29426003 1 0.000098 374377 374377\n", - "622 Finding of spinal region 298379008 1 0.000098 4182165 4182165\n", - "623 Disorder of rotator cuff 414033006 1 0.000098 4212887 4212887\n", - "624 Finding of back 414252009 1 0.000098 4213101 4213101\n", - "625 Spinal cord disease 48522003 1 0.000098 135526 135526\n", - "626 Acute respiratory distress syndrome 67782005 1 0.000098 4195694 4195694\n", - "627 Injury of heart 86175003 1 0.000098 4311280 4311280\n", - "628 Spinal cord injury 90584004 1 0.000098 4235863 4235863\n", - "629 Concussion with loss of consciousness 62564004 1 0.000098 375671 375671\n", - "the time taken to get cohort concept stats for condition_occurrence is 1.1196579933166504s\n" - ] - } - ], - "source": [ - "# get cohort concept prevalance\n", - "t1 = time.time()\n", - "cohort_concepts = cohort_data.get_concept_stats()\n", - "print(pd.DataFrame(cohort_concepts[\"condition_occurrence\"]))\n", - "print(f'the time taken to get cohort concept stats for condition_occurrence is {time.time() - t1}s')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "128c7b02-faef-4a35-97a6-c92baa5a37dd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connection to BiasDatabase closed.\n", - "Connection to the OMOP CDM database closed.\n" - ] - } - ], - "source": [ - "bias.cleanup()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e2bf375-b4fb-4c50-aab9-fff4c1a02a95", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": 
"python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 9947ddcb67ff56b7f1d7c152c73c2e7191d6b8a5 Mon Sep 17 00:00:00 2001 From: hyi Date: Fri, 20 Jun 2025 15:46:22 -0400 Subject: [PATCH 08/10] converted all jupyter notebooks into tutorials --- biasanalyzer/api.py | 20 +- biasanalyzer/database.py | 1 + .../BiasAnalyzerAsyncCohortsTutorial.ipynb | 6 +- .../BiasAnalyzerCohortConceptTutorial.ipynb | 10 +- notebooks/BiasAnalyzerCohortsTutorial.ipynb | 4 +- .../BiasAnalyzerConceptBrowsingTutorial.ipynb | 519 ++++++++++++++ .../BiasAnalyzerTestingConceptBrowsing.ipynb | 660 ------------------ 7 files changed, 539 insertions(+), 681 deletions(-) create mode 100644 notebooks/BiasAnalyzerConceptBrowsingTutorial.ipynb delete mode 100644 notebooks/BiasAnalyzerTestingConceptBrowsing.ipynb diff --git a/biasanalyzer/api.py b/biasanalyzer/api.py index 50fca55..d09163f 100644 --- a/biasanalyzer/api.py +++ b/biasanalyzer/api.py @@ -72,7 +72,6 @@ def _set_cohort_action(self): return self.cohort_action def get_domains_and_vocabularies(self): - print(f'self.omop_cdm_db: {self.omop_cdm_db}') if self.omop_cdm_db is None: notify_users('A valid OMOP CDM must be set before getting domains. ' 'Call set_root_omop first to set a valid root OMOP CDM') @@ -96,19 +95,18 @@ def get_concept_hierarchy(self, concept_id): return None return self.omop_cdm_db.get_concept_hierarchy(concept_id) - def display_concept_tree(self, concept_tree: dict, level: int = 0, show_in_text_format=True, tree_type=None): + def display_concept_tree(self, concept_tree: dict, level: int = 0, show_in_text_format=True): """ Recursively prints the concept hierarchy tree in an indented format for display. 
""" details = concept_tree.get("details", {}) - if tree_type is None or tree_type not in ['parents', 'children']: - if 'parents' in concept_tree: - tree_type = 'parents' - elif 'children' in concept_tree: - tree_type = 'children' - else: - notify_users('The input concept tree must contain parents or children key as the type of the tree.') - return '' + if 'parents' in concept_tree: + tree_type = 'parents' + elif 'children' in concept_tree: + tree_type = 'children' + else: + notify_users('The input concept tree must contain parents or children key as the type of the tree.') + return '' if show_in_text_format: if details: @@ -119,7 +117,7 @@ def display_concept_tree(self, concept_tree: dict, level: int = 0, show_in_text_ for child in concept_tree.get(tree_type, []): if child: - self.display_concept_tree(child, level + 1, tree_type=tree_type, show_in_text_format=True) + self.display_concept_tree(child, level + 1, show_in_text_format=True) # return empty string to print None being printed at the end of printout return "" else: diff --git a/biasanalyzer/database.py b/biasanalyzer/database.py index 0c9d21f..03efa6e 100644 --- a/biasanalyzer/database.py +++ b/biasanalyzer/database.py @@ -504,6 +504,7 @@ def get_concept_hierarchy(self, concept_id: int): ancestor_id, {"details": concept_details[ancestor_id], "parents": []}) desc_entry_rev["parents"].append(ancestor_entry_rev) progress.update(1) + progress.close() # Return the parent hierarchy and children hierarchy of the specified concept return reverse_hierarchy[concept_id], hierarchy[concept_id] diff --git a/notebooks/BiasAnalyzerAsyncCohortsTutorial.ipynb b/notebooks/BiasAnalyzerAsyncCohortsTutorial.ipynb index 7e25e9b..9ee1bca 100644 --- a/notebooks/BiasAnalyzerAsyncCohortsTutorial.ipynb +++ b/notebooks/BiasAnalyzerAsyncCohortsTutorial.ipynb @@ -98,7 +98,7 @@ "---\n", "\n", "### Asynchronous cohort creation\n", - "**Baseline cohort creation**: To create a baseline cohort of young female patients asynchronously, use the 
`run_in_background()` function on the `bias` object to run `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function in a background thread. You'll pass the target function as the first argument, the cohort creation target function input arguments as the next four arguments, a `BackgroundResult` object via the `result_holder` optional parameter to store the created baseline cohort result, and a `delay` value (e.g., 120 seconds) to simulate asynchronous execution of long-running process for testing purposes. The created baseline cohort will be identical to the one created in the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb), except that the cohort creation now runs asychronously in a background thread." + "**Baseline cohort creation**: To create a baseline cohort of young female patients asynchronously, use the `run_in_background()` function to run `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` method in a background thread. You'll pass the target function as the first argument, the cohort creation target function input arguments as the next four arguments, a `BackgroundResult` object via the `result_holder` optional parameter to store the created baseline cohort result, and a `delay` value (e.g., 120 seconds) to simulate asynchronous execution of long-running process for testing purposes. The created baseline cohort will be identical to the one created in the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb), except that the cohort creation now runs asychronously in a background thread." 
] }, { @@ -147,7 +147,7 @@ "# Create baseline cohort result holder\n", "baseline_result = BackgroundResult()\n", "\n", - "# Start background task to run create_cohort() function for a baseline cohort in a background thread\n", + "# Start background task to run create_cohort() method for a baseline cohort in a background thread\n", "baseline_thread = run_in_background(\n", " bias.create_cohort,\n", " \"Young female patients\",\n", @@ -168,7 +168,7 @@ "source": [ "———————————————\n", "\n", - "**Study cohort creation**: To create a study cohort of young female COVID patients asynchronously, use the `run_in_background()` function on the `bias` object to run `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function in a background thread. You'll pass the target function as the first argument, the cohort creation target function input arguments as the next four arguments, a `BackgroundResult` object via the `result_holder` optional parameter to store the created baseline cohort result, and a `delay` value (e.g., 120 seconds) to simulate asynchronous execution of long-running process for testing purposes. The created study cohort will be identical to the one created in the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb), except that the cohort creation now runs asychronously in a background thread." + "**Study cohort creation**: To create a study cohort of young female COVID patients asynchronously, use the `run_in_background()` function to run `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function in a background thread. You'll pass the target function as the first argument, the cohort creation target function input arguments as the next four arguments, a `BackgroundResult` object via the `result_holder` optional parameter to store the created baseline cohort result, and a `delay` value (e.g., 120 seconds) to simulate asynchronous execution of long-running process for testing purposes. 
The created study cohort will be identical to the one created in the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb), except that the cohort creation now runs asychronously in a background thread." ] }, { diff --git a/notebooks/BiasAnalyzerCohortConceptTutorial.ipynb b/notebooks/BiasAnalyzerCohortConceptTutorial.ipynb index a35de32..8968c21 100644 --- a/notebooks/BiasAnalyzerCohortConceptTutorial.ipynb +++ b/notebooks/BiasAnalyzerCohortConceptTutorial.ipynb @@ -77,7 +77,7 @@ "source": [ "———————————————\n", "\n", - "**Preparation step 2**: Create a cohort of young female COVID patients using the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function on the `bias` object for cohort concept prevalence exploration. You'll pass the name of the cohort as the first argument, the description of the cohort as the second argument, a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query as the third argument, and the cohort owner's name indicating who owns or creates this cohort as the fourth argument. After the cohort is created, you can call `get_stats()` and `get_distributions()` functions on the returned `cohort_data` object to explore cohort statistics and distributions." + "**Preparation step 2**: Create a cohort of young female COVID patients using the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` method on the `bias` object for cohort concept prevalence exploration. You'll pass the name of the cohort as the first argument, the description of the cohort as the second argument, a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query as the third argument, and the cohort owner's name indicating who owns or creates this cohort as the fourth argument. 
After the cohort is created, you can call `get_stats()` and `get_distributions()` methods on the returned `cohort_data` object to explore cohort statistics and distributions." ] }, { @@ -157,15 +157,15 @@ "---\n", "\n", "### Exploring cohort concept prevalence\n", - "You can retrieve concept prevalence statistics for a cohort using the `get_concept_stats(concept_type='condition_occurrence', filter_count=0, vocab=None, include_hierarchy=False)` function on the `cohort_data` object. Each input argument to this function has a default value, so you can call the function without specifying all parameters. \n", + "You can retrieve concept prevalence statistics for a cohort using the `get_concept_stats(concept_type='condition_occurrence', filter_count=0, vocab=None, include_hierarchy=False)` method on the `cohort_data` object. Each input argument to this method has a default value, so you can call the method without specifying all parameters.\n", "- The `concept_type` input argument specifies the OMOP domain to analyze. It must be one of the OMOP domain names: `condition_occurrence`, `drug_exposure`, `procedure_occurrence`, `visit_occurrence`, `measurement`, or `observation`.\n", "- The `vocab` input argument specifies the OMOP vocabulary ID to filter concepts by. If set to `None`, a default vocabulary is used based on the domain: `RxNorm` for `drug_exposure`, `LOINC` for `measurement`, and `SNOMED` for all other domains.\n", "- The `filter_count` input argument filters out concepts with fewer than this number of patients in the cohort. Set it to `0` to include all without filtering.\n", "- The `include_hierarchy` input argument specifies whether to include concept hierarchical relationship. 
If set to `True`, ancestor concepts using the OMOP concept hierarchy are included when calculating prevalence.\n", - "This function helps identify the most prevalent clinical concepts in your cohort, which can reveal patterns or potential sources of selection bias in the cohort data.\n", + "This method helps identify the most prevalent clinical concepts in your cohort, which can reveal patterns or potential sources of selection bias in the cohort data.\n", "\n", "**Cohort condition occurrence concept prevalence**: \n", - "The code block below demonstrates how to use the default parameters of the `get_concept_stats()` function to retrieve concept prevalence for the `condition occurrence` domain. By default, it uses the `SNOMED` vocabulary, excludes hierarchical relationships, and applies no filtering. The function returns a dictionary where the **key** is the `concept_type` (e.g., `condition_occurrence`) and the **value** is a list of concept dictionaries. Each concept dictionary in the list contains `concept_name`, `concept_code`, `count_in_cohort`, `prevalence`, `ancestor_concept_id`, and `descendant_concept_id`. These values allow you to explore which clinical concepts are most prevalent in your cohort and suppoert deeper investigations into potential sources of selection bias.\n", + "The code block below demonstrates how to use the default parameters of the `get_concept_stats()` method to retrieve concept prevalence for the `condition occurrence` domain. By default, it uses the `SNOMED` vocabulary, excludes hierarchical relationships, and applies no filtering. The method returns a dictionary where the **key** is the `concept_type` (e.g., `condition_occurrence`) and the **value** is a list of concept dictionaries. Each concept dictionary in the list contains `concept_name`, `concept_code`, `count_in_cohort`, `prevalence`, `ancestor_concept_id`, and `descendant_concept_id`. 
These values allow you to explore which clinical concepts are most prevalent in your cohort and suppoert deeper investigations into potential sources of selection bias.\n", "\n", "**Note** that this prevalence computation may take some time, especially for large cohorts. A progress bar will appear to indicate the progress of the prevalence calculation." ] @@ -854,7 +854,7 @@ "———————————————\n", "\n", "**Cohort drug exposure concept prevalence**: \n", - "The code block below demonstrates how to use `get_concept_stats(concept_type='drug_exposure', filter_count=500, include_hierarchy=True)` function to retrieve concept prevalence for the `drug_exposure` domain. By default, this uses the `RxNorm` vocabulary. Concepts with fewer than 500 patients are excluded, and hierarchical relationships are included in the results. The function returns a dictionary where the **key** is the `concept_type` (in this case, `drug_exposure`) and the **value** is a list of concept dictionaries. Each concept dictionary in the list contains the following fields: `concept_name`, `concept_code`, `count_in_cohort`, `prevalence`, `ancestor_concept_id`, and `descendant_concept_id`. These values allow you to explore which clinical concepts are most prevalent in your cohort and suppoert deeper investigations into potential sources of selection bias.\n", + "The code block below demonstrates how to use `get_concept_stats(concept_type='drug_exposure', filter_count=500, include_hierarchy=True)` method to retrieve concept prevalence for the `drug_exposure` domain. By default, this uses the `RxNorm` vocabulary. Concepts with fewer than 500 patients are excluded, and hierarchical relationships are included in the results. The method returns a dictionary where the **key** is the `concept_type` (in this case, `drug_exposure`) and the **value** is a list of concept dictionaries. 
Each concept dictionary in the list contains the following fields: `concept_name`, `concept_code`, `count_in_cohort`, `prevalence`, `ancestor_concept_id`, and `descendant_concept_id`. These values allow you to explore which clinical concepts are most prevalent in your cohort and suppoert deeper investigations into potential sources of selection bias.\n", "\n", "**Note**: Prevalence computation may take some time, especially for large cohorts or when hierarchical relationships are included. A progress bar will appear to indicate the progress of the computation. \n", "\n", diff --git a/notebooks/BiasAnalyzerCohortsTutorial.ipynb b/notebooks/BiasAnalyzerCohortsTutorial.ipynb index 4ac4bfc..028745e 100644 --- a/notebooks/BiasAnalyzerCohortsTutorial.ipynb +++ b/notebooks/BiasAnalyzerCohortsTutorial.ipynb @@ -146,7 +146,7 @@ "---\n", "\n", "### Baseline cohort creation and exploration\n", - "**Baseline cohort creation**: To create a baseline cohort of young female patients, use the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function on the `bias` object. You'll pass the name of the cohort as the first argument, the description of the cohort as the second argument, a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query as the third argument, and the cohort owner's name indicating who owns or creates this cohort as the fourth argument. The function will show a progress bar to indicate cohort creation progress over three stages." + "**Baseline cohort creation**: To create a baseline cohort of young female patients, use the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` method on the `bias` object. 
You'll pass the name of the cohort as the first argument, the description of the cohort as the second argument, a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query as the third argument, and the cohort owner's name indicating who owns or creates this cohort as the fourth argument. The method will show a progress bar to indicate cohort creation progress over three stages." ] }, { @@ -304,7 +304,7 @@ "---\n", "\n", "### Study cohort creation and exploration\n", - "**Study cohort creation**: To create a study cohort of young female COVID patients, use the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` function on the `bias` object. You'll pass the name of the cohort as the first argument, the description of the cohort as the second argument, a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query as the third argument, and the cohort owner's name indicating who owns or creates this cohort as the fourth argument. The function will show a progress bar to indicate cohort creation progress over three stages.\n" + "**Study cohort creation**: To create a study cohort of young female COVID patients, use the `create_cohort(cohort_name, cohort_description, query_or_yaml_file, created_by)` method on the `bias` object. You'll pass the name of the cohort as the first argument, the description of the cohort as the second argument, a yaml file that specifies cohort inclusion and exclusion criteria or a cohort selection SQL query as the third argument, and the cohort owner's name indicating who owns or creates this cohort as the fourth argument. 
The method will show a progress bar to indicate cohort creation progress over three stages.\n" ] }, { diff --git a/notebooks/BiasAnalyzerConceptBrowsingTutorial.ipynb b/notebooks/BiasAnalyzerConceptBrowsingTutorial.ipynb new file mode 100644 index 0000000..753555c --- /dev/null +++ b/notebooks/BiasAnalyzerConceptBrowsingTutorial.ipynb @@ -0,0 +1,519 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "d719bac9-2b20-4792-8a1f-3272f3e42a8b", + "metadata": {}, + "source": [ + "# Using BiasAnalyzer for Cohort Concept Prevalence Exploration\n", + "\n", + "This tutorial demonstrates how to use the `BiasAnalyzer` package to browse and explore OMOP concepts. In the OMOP (Observational Medical Outcomes Partnership) CDM (Common Data Model), a **concept** refers to a coded term from a standardized medical vocabulary, uniquely identified by a **concept ID**. All clinical events in OMOP, such as conditions, drug exposures, procedures, measurements, and events, are represented as concepts.\n", + "\n", + "---\n", + "\n", + "### Overview\n", + "\n", + "**Objective**: \n", + "Learn how to browse and explore OMOP concepts using `BiasAnalyzer`.\n", + "\n", + "**Before You Begin**: \n", + "The `BiasAnalyzer` package is currently in active development and has not yet been officially released on PyPI.\n", + "You can install it in one of the two ways:\n", + "\n", + "- **Install from GitHub (recommended during development)**:\n", + "```bash\n", + "pip install git+https://github.com/vaclab/BiasAnalyzer.git\n", + "```\n", + "- **Install from PyPI (once the pacakge is officially released)**:\n", + "```bash\n", + "pip install biasanalyzer\n", + "```\n", + "\n", + "For full setup and usage instructions, refer to the [README](https://github.com/VACLab/BiasAnalyzer/blob/main/README.md).\n", + "\n", + "---\n", + "\n", + "\n", + "### Preparation for OMOP concept exploration\n", + "Import the `BIAS` class from the `api` module of the `BiasAnalyzer` package, create an 
object `bias` of the `BIAS` class, specify OMOP CDM database configurations on the `bias` object, and set OMOP CDM database to enable connection to the database. Refer to the [Cohort Exploration Tutorial](./BiasAnalyzerCohortsTutorial.ipynb) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6dc76f46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "configuration specified in ../config.yaml loaded successfully\n", + "Connected to the OMOP CDM database (read-only).\n", + "Cohort Definition table created.\n", + "Cohort table created.\n" + ] + } + ], + "source": [ + "from biasanalyzer.api import BIAS\n", + "\n", + "bias = BIAS()\n", + "\n", + "bias.set_config('../config.yaml')\n", + "\n", + "bias.set_root_omop()" + ] + }, + { + "cell_type": "markdown", + "id": "8731e481", + "metadata": {}, + "source": [ + "**Now that you have connected to your OMOP CDM database, you are ready to browse and explore OMOP concepts.** \n", + "\n", + "---\n", + "\n", + "### Explore OMOP domains and vocabularies\n", + "Since each OMOP concept is linked to a domain and vocabulary, it is helpful to first understand which domains and vocabularies are available before exploring concepts. You can retrieve available OMOP domains and their associated vocabularies using the `get_domains_and_vocabularies()` method on the `bias` object. This function returns a list of dictionaries, where each dictionary contains a `domain_id` and a `vocabulary_id`. The list is sorted alphabetically by `domain_id` and then by `vocabulary_id`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "51969248-f348-4f0d-914f-bb908183e3f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " domain_id vocabulary_id\n", + "0 Condition HCPCS\n", + "1 Condition ICD10\n", + "2 Condition ICD10CM\n", + "3 Condition ICD9CM\n", + "4 Condition ICDO3\n", + "5 Condition OMOP Extension\n", + "6 Condition SNOMED\n", + "7 Condition/Device ICD10CM\n", + "8 Condition/Meas ICD10CM\n", + "9 Condition Status Condition Status\n", + "10 Cost Cost\n", + "11 Currency Currency\n", + "12 Device HCPCS\n", + "13 Device ICD10PCS\n", + "14 Device NDC\n", + "15 Device SNOMED\n", + "16 Device SPL\n", + "17 Drug ATC\n", + "18 Drug HCPCS\n", + "19 Drug ICD10PCS\n", + "20 Drug NDC\n", + "21 Drug RxNorm\n", + "22 Drug RxNorm Extension\n", + "23 Drug SNOMED\n", + "24 Drug SPL\n", + "25 Episode Episode\n", + "26 Ethnicity Ethnicity\n", + "27 Gender Gender\n", + "28 Gender SNOMED\n", + "29 Geography OSM\n", + "30 Geography US Census\n", + "31 Measurement HCPCS\n", + "32 Measurement ICD10\n", + "33 Measurement ICD10CM\n", + "34 Measurement ICD9CM\n", + "35 Measurement LOINC\n", + "36 Measurement OMOP Extension\n", + "37 Measurement SNOMED\n", + "38 Meas Value LOINC\n", + "39 Meas Value SNOMED\n", + "40 Meas Value Operator SNOMED\n", + "41 Metadata CDM\n", + "42 Metadata Concept Class\n", + "43 Metadata Domain\n", + "44 Metadata Metadata\n", + "45 Metadata None\n", + "46 Metadata Relationship\n", + "47 Metadata SNOMED\n", + "48 Metadata Vocabulary\n", + "49 Observation HCPCS\n", + "50 Observation ICD10\n", + "51 Observation ICD10CM\n", + "52 Observation ICD9CM\n", + "53 Observation ICDO3\n", + "54 Observation LOINC\n", + "55 Observation NUCC\n", + "56 Observation OMOP Extension\n", + "57 Observation SNOMED\n", + "58 Observation SPL\n", + "59 Observation UB04 Pri Typ of Adm\n", + "60 Observation UB04 Typ bill\n", + "61 Payer PHDSC\n", + "62 Plan Plan\n", + "63 Plan Stop Reason Plan Stop 
Reason\n", + "64 Procedure HCPCS\n", + "65 Procedure ICD10\n", + "66 Procedure ICD10CM\n", + "67 Procedure ICD10PCS\n", + "68 Procedure ICD9CM\n", + "69 Procedure ICD9Proc\n", + "70 Procedure SNOMED\n", + "71 Provider ABMS\n", + "72 Provider Medicare Specialty\n", + "73 Provider NUCC\n", + "74 Provider SNOMED\n", + "75 Race Race\n", + "76 Race SNOMED\n", + "77 Relationship SNOMED\n", + "78 Revenue Code Korean Revenue Code\n", + "79 Revenue Code Revenue Code\n", + "80 Route SNOMED\n", + "81 Spec Anatomic Site SNOMED\n", + "82 Spec Disease Status SNOMED\n", + "83 Specimen SNOMED\n", + "84 Sponsor Sponsor\n", + "85 Type Concept Condition Type\n", + "86 Type Concept Cost Type\n", + "87 Type Concept Death Type\n", + "88 Type Concept Device Type\n", + "89 Type Concept Drug Type\n", + "90 Type Concept Meas Type\n", + "91 Type Concept Note Type\n", + "92 Type Concept Observation Type\n", + "93 Type Concept Obs Period Type\n", + "94 Type Concept Procedure Type\n", + "95 Type Concept SNOMED\n", + "96 Type Concept Type Concept\n", + "97 Type Concept Visit Type\n", + "98 Unit SNOMED\n", + "99 Unit UCUM\n", + "100 Visit CMS Place of Service\n", + "101 Visit Medicare Specialty\n", + "102 Visit NUCC\n", + "103 Visit SNOMED\n", + "104 Visit UB04 Point of Origin\n", + "105 Visit UB04 Pt dis status\n", + "106 Visit UB04 Typ bill\n", + "107 Visit Visit\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "pd.set_option('display.max_rows', None)\n", + "\n", + "domains_and_vocabs = bias.get_domains_and_vocabularies()\n", + "print(pd.DataFrame(domains_and_vocabs))" + ] + }, + { + "cell_type": "markdown", + "id": "22edda35", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Exploring OMOP concepts\n", + "\n", + "You can explore OMOP concepts using the `get_concepts(search_term, domain=None, vocabulary=None)` method on the `bias` object. To narrow down your search, you should provide a search term along with a domain, a vocabulary, or both. 
Since the OMOP vocabulary contains a vast number of concepts, filtering by domain and/or vocabulary helps constrain the search space and keeps the number of results manageable. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b1415805-b065-40b8-b2cd-6db6f5f9ca41", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " concept_id concept_name \\\n", + "0 703440 COVID-19 confirmed using clinical diagnostic c... \n", + "1 703441 COVID-19 confirmed by laboratory test \n", + "2 703445 Low risk category for developing complication ... \n", + "3 703446 Moderate risk category for developing complica... \n", + "4 703447 High risk category for developing complication... \n", + "5 37310269 COVID-19 \n", + "6 37311061 COVID-19 \n", + "\n", + " valid_start_date valid_end_date domain_id vocabulary_id \n", + "0 2020-04-01 2099-12-31 Condition SNOMED \n", + "1 2020-04-01 2099-12-31 Condition SNOMED \n", + "2 2020-04-01 2099-12-31 Condition SNOMED \n", + "3 2020-04-01 2099-12-31 Condition SNOMED \n", + "4 2020-04-01 2099-12-31 Condition SNOMED \n", + "5 2020-02-04 2020-10-28 Condition SNOMED \n", + "6 2020-01-31 2099-12-31 Condition SNOMED \n" + ] + } + ], + "source": [ + "concepts = bias.get_concepts(\"COVID-19\", \"Condition\", \"SNOMED\")\n", + "print(pd.DataFrame(concepts))" + ] + }, + { + "cell_type": "markdown", + "id": "10305fac-8ae3-49ca-8542-47d0a0636f97", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Exploring concept hierarchy\n", + "\n", + "**Retrieve concept hierarchy**: You can retrieve the concept hierarchy for a specific OMOP concept using the `get_concept_hierarchy(concept_id)` method on the `bias` object. The method returns two dictionaries: the **ancestor hierarchy** representing the concept's lineage upward, and the **descendant hierarchy** representing the concept's children and their branches. 
Each dictionary has a nested structure with two main keys: \n", + "- `details`: a dictionary containing metadata about the current concept node, including `concept_id`, `concept_name`, `vocabulary_id`, and `concept_code`\n", + "- `parents` (for the ancestor hierarchy) or `children` (for the descendant hierarchy): a list of parent or child concept nodes, respectively\n", + "\n", + "A progress bar is displayed during execution to indicate the progress of computing the concept's hierarchical relationships." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d539b8df-2bf4-42ec-abc5-36fa0238cea1", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2452988eafb64ccd8caf3eec8004c453", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Concept Hierarchy: 0%| | 0/3 [00:00=1.1.1 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for duckdb<2.0.0,>=1.1.1 from https://files.pythonhosted.org/packages/bf/56/f627b6fcd4aa34015a15449d852ccb78d7cc6eda654aa20c1d378e99fa76/duckdb-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached duckdb-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)\n", - "Collecting duckdb-engine<0.14.0,>=0.13.2 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for duckdb-engine<0.14.0,>=0.13.2 from https://files.pythonhosted.org/packages/5f/81/571c0373978d4e987ec2437bfb16adce6cf3b4a05761a76f1c06e859b668/duckdb_engine-0.13.5-py3-none-any.whl.metadata\n", - " Using cached duckdb_engine-0.13.5-py3-none-any.whl.metadata (8.0 kB)\n", - "Collecting ipytree<0.3.0,>=0.2.2 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for ipytree<0.3.0,>=0.2.2 from https://files.pythonhosted.org/packages/e4/03/35cf1742598d784e96153175233318a2332f71863e55ad1007c9264c1a7a/ipytree-0.2.2-py2.py3-none-any.whl.metadata\n", - " Using cached 
ipytree-0.2.2-py2.py3-none-any.whl.metadata (849 bytes)\n", - "Collecting ipywidgets<9.0.0,>=8.1.5 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for ipywidgets<9.0.0,>=8.1.5 from https://files.pythonhosted.org/packages/22/2d/9c0b76f2f9cc0ebede1b9371b6f317243028ed60b90705863d493bae622e/ipywidgets-8.1.5-py3-none-any.whl.metadata\n", - " Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)\n", - "Collecting numpy==1.24.4 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for numpy==1.24.4 from https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n", - "Collecting pandas==2.0.3 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for pandas==2.0.3 from https://files.pythonhosted.org/packages/d0/28/88b81881c056376254618fad622a5e94b5126db8c61157ea1910cd1c040a/pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n", - "Collecting psycopg2<3.0.0,>=2.9.1 (from BiasAnalyzer==0.1.0)\n", - " Using cached psycopg2-2.9.10-cp311-cp311-linux_x86_64.whl\n", - "Collecting pydantic<3.0.0,>=2.9.2 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for pydantic<3.0.0,>=2.9.2 from https://files.pythonhosted.org/packages/df/e4/ba44652d562cbf0bf320e0f3810206149c8a4e99cdbf66da82e97ab53a15/pydantic-2.9.2-py3-none-any.whl.metadata\n", - " Using cached pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)\n", - "Collecting pyyaml<7.0.0,>=6.0.2 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for pyyaml<7.0.0,>=6.0.2 from 
https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)\n", - "Collecting scipy==1.10.1 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for scipy==1.10.1 from https://files.pythonhosted.org/packages/21/cd/fe2d4af234b80dc08c911ce63fdaee5badcdde3e9bcd9a68884580652ef0/scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)\n", - "Collecting sqlalchemy<3.0.0,>=2.0.35 (from BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for sqlalchemy<3.0.0,>=2.0.35 from https://files.pythonhosted.org/packages/b4/5f/95e0ed74093ac3c0db6acfa944d4d8ac6284ef5e1136b878a327ea1f975a/SQLAlchemy-2.0.36-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached SQLAlchemy-2.0.36-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)\n", - "Collecting python-dateutil>=2.8.2 (from pandas==2.0.3->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for python-dateutil>=2.8.2 from https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata\n", - " Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)\n", - "Collecting pytz>=2020.1 (from pandas==2.0.3->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl.metadata\n", - " Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)\n", - "Collecting tzdata>=2022.1 (from 
pandas==2.0.3->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for tzdata>=2022.1 from https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl.metadata\n", - " Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)\n", - "Collecting packaging>=21 (from duckdb-engine<0.14.0,>=0.13.2->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for packaging>=21 from https://files.pythonhosted.org/packages/08/aa/cc0199a5f0ad350994d660967a8efb233fe0416e4639146c089643407ce6/packaging-24.1-py3-none-any.whl.metadata\n", - " Using cached packaging-24.1-py3-none-any.whl.metadata (3.2 kB)\n", - "Collecting comm>=0.1.3 (from ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for comm>=0.1.3 from https://files.pythonhosted.org/packages/e6/75/49e5bfe642f71f272236b5b2d2691cf915a7283cc0ceda56357b61daa538/comm-0.2.2-py3-none-any.whl.metadata\n", - " Using cached comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)\n", - "Collecting ipython>=6.1.0 (from ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for ipython>=6.1.0 from https://files.pythonhosted.org/packages/c5/a5/c15ed187f1b3fac445bb42a2dedd8dec1eee1718b35129242049a13a962f/ipython-8.29.0-py3-none-any.whl.metadata\n", - " Using cached ipython-8.29.0-py3-none-any.whl.metadata (5.0 kB)\n", - "Collecting traitlets>=4.3.1 (from ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for traitlets>=4.3.1 from https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl.metadata\n", - " Using cached traitlets-5.14.3-py3-none-any.whl.metadata (10 kB)\n", - "Collecting widgetsnbextension~=4.0.12 (from ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for widgetsnbextension~=4.0.12 from 
https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl.metadata\n", - " Using cached widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)\n", - "Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for jupyterlab-widgets~=3.0.12 from https://files.pythonhosted.org/packages/a9/93/858e87edc634d628e5d752ba944c2833133a28fa87bb093e6832ced36a3e/jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata\n", - " Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)\n", - "Collecting annotated-types>=0.6.0 (from pydantic<3.0.0,>=2.9.2->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for annotated-types>=0.6.0 from https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl.metadata\n", - " Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n", - "Collecting pydantic-core==2.23.4 (from pydantic<3.0.0,>=2.9.2->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for pydantic-core==2.23.4 from https://files.pythonhosted.org/packages/44/31/a3899b5ce02c4316865e390107f145089876dff7e1dfc770a231d836aed8/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n", - " Using cached pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", - "Collecting typing-extensions>=4.6.1 (from pydantic<3.0.0,>=2.9.2->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for typing-extensions>=4.6.1 from https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl.metadata\n", - " Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)\n", - "Collecting greenlet!=0.4.17 (from 
sqlalchemy<3.0.0,>=2.0.35->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for greenlet!=0.4.17 from https://files.pythonhosted.org/packages/f7/4b/1c9695aa24f808e156c8f4813f685d975ca73c000c2a5056c514c64980f6/greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata\n", - " Using cached greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)\n", - "Collecting decorator (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for decorator from https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl.metadata\n", - " Using cached decorator-5.1.1-py3-none-any.whl.metadata (4.0 kB)\n", - "Collecting jedi>=0.16 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for jedi>=0.16 from https://files.pythonhosted.org/packages/20/9f/bc63f0f0737ad7a60800bfd472a4836661adae21f9c2535f3957b1e54ceb/jedi-0.19.1-py2.py3-none-any.whl.metadata\n", - " Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)\n", - "Collecting matplotlib-inline (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for matplotlib-inline from https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl.metadata\n", - " Using cached matplotlib_inline-0.1.7-py3-none-any.whl.metadata (3.9 kB)\n", - "Collecting prompt-toolkit<3.1.0,>=3.0.41 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for prompt-toolkit<3.1.0,>=3.0.41 from https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl.metadata\n", - " Using cached 
prompt_toolkit-3.0.48-py3-none-any.whl.metadata (6.4 kB)\n", - "Collecting pygments>=2.4.0 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for pygments>=2.4.0 from https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl.metadata\n", - " Using cached pygments-2.18.0-py3-none-any.whl.metadata (2.5 kB)\n", - "Collecting stack-data (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for stack-data from https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl.metadata\n", - " Using cached stack_data-0.6.3-py3-none-any.whl.metadata (18 kB)\n", - "Collecting pexpect>4.3 (from ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for pexpect>4.3 from https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl.metadata\n", - " Using cached pexpect-4.9.0-py2.py3-none-any.whl.metadata (2.5 kB)\n", - "Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas==2.0.3->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for six>=1.5 from https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl.metadata\n", - " Using cached six-1.16.0-py2.py3-none-any.whl.metadata (1.8 kB)\n", - "Collecting parso<0.9.0,>=0.8.3 (from jedi>=0.16->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for parso<0.9.0,>=0.8.3 from https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl.metadata\n", - " Using cached parso-0.8.4-py2.py3-none-any.whl.metadata (7.7 kB)\n", 
- "Collecting ptyprocess>=0.5 (from pexpect>4.3->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for ptyprocess>=0.5 from https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl.metadata\n", - " Using cached ptyprocess-0.7.0-py2.py3-none-any.whl.metadata (1.3 kB)\n", - "Collecting wcwidth (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for wcwidth from https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl.metadata\n", - " Using cached wcwidth-0.2.13-py2.py3-none-any.whl.metadata (14 kB)\n", - "Collecting executing>=1.2.0 (from stack-data->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for executing>=1.2.0 from https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl.metadata\n", - " Using cached executing-2.1.0-py2.py3-none-any.whl.metadata (8.9 kB)\n", - "Collecting asttokens>=2.1.0 (from stack-data->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for asttokens>=2.1.0 from https://files.pythonhosted.org/packages/45/86/4736ac618d82a20d87d2f92ae19441ebc7ac9e7a581d7e58bbe79233b24a/asttokens-2.4.1-py2.py3-none-any.whl.metadata\n", - " Using cached asttokens-2.4.1-py2.py3-none-any.whl.metadata (5.2 kB)\n", - "Collecting pure-eval (from stack-data->ipython>=6.1.0->ipywidgets<9.0.0,>=8.1.5->BiasAnalyzer==0.1.0)\n", - " Obtaining dependency information for pure-eval from https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl.metadata\n", - " Using cached 
pure_eval-0.2.3-py3-none-any.whl.metadata (6.3 kB)\n", - "Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n", - "Using cached pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)\n", - "Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.1 MB)\n", - "Using cached duckdb-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.1 MB)\n", - "Using cached duckdb_engine-0.13.5-py3-none-any.whl (48 kB)\n", - "Using cached ipytree-0.2.2-py2.py3-none-any.whl (1.3 MB)\n", - "Using cached ipywidgets-8.1.5-py3-none-any.whl (139 kB)\n", - "Using cached pydantic-2.9.2-py3-none-any.whl (434 kB)\n", - "Using cached pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)\n", - "Using cached PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (762 kB)\n", - "Using cached SQLAlchemy-2.0.36-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)\n", - "Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)\n", - "Using cached comm-0.2.2-py3-none-any.whl (7.2 kB)\n", - "Using cached greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (602 kB)\n", - "Using cached ipython-8.29.0-py3-none-any.whl (819 kB)\n", - "Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)\n", - "Using cached packaging-24.1-py3-none-any.whl (53 kB)\n", - "Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)\n", - "Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)\n", - "Using cached traitlets-5.14.3-py3-none-any.whl (85 kB)\n", - "Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)\n", - "Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)\n", - "Using cached widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)\n", - "Using cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)\n", - "Using cached pexpect-4.9.0-py2.py3-none-any.whl (63 kB)\n", - "Using 
cached prompt_toolkit-3.0.48-py3-none-any.whl (386 kB)\n", - "Using cached pygments-2.18.0-py3-none-any.whl (1.2 MB)\n", - "Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)\n", - "Using cached decorator-5.1.1-py3-none-any.whl (9.1 kB)\n", - "Using cached matplotlib_inline-0.1.7-py3-none-any.whl (9.9 kB)\n", - "Using cached stack_data-0.6.3-py3-none-any.whl (24 kB)\n", - "Using cached asttokens-2.4.1-py2.py3-none-any.whl (27 kB)\n", - "Using cached executing-2.1.0-py2.py3-none-any.whl (25 kB)\n", - "Using cached parso-0.8.4-py2.py3-none-any.whl (103 kB)\n", - "Using cached ptyprocess-0.7.0-py2.py3-none-any.whl (13 kB)\n", - "Using cached pure_eval-0.2.3-py3-none-any.whl (11 kB)\n", - "Using cached wcwidth-0.2.13-py2.py3-none-any.whl (34 kB)\n", - "Building wheels for collected packages: BiasAnalyzer\n", - " Building wheel for BiasAnalyzer (pyproject.toml) ... \u001B[?25ldone\n", - "\u001B[?25h Created wheel for BiasAnalyzer: filename=biasanalyzer-0.1.0-py3-none-any.whl size=12482 sha256=254ea1fa17b7c1706a4d4e4ed711dd7128601c09a1c3c36c9ec903ed842441af\n", - " Stored in directory: /home/hyi/temp/pip-ephem-wheel-cache-wgmpfyq9/wheels/25/75/4e/079d96d69cc58148ce31d3d44f858e4db5f689604112dcb7c3\n", - "Successfully built BiasAnalyzer\n", - "Installing collected packages: wcwidth, pytz, pure-eval, ptyprocess, widgetsnbextension, tzdata, typing-extensions, traitlets, six, pyyaml, pygments, psycopg2, prompt-toolkit, pexpect, parso, packaging, numpy, jupyterlab-widgets, greenlet, executing, duckdb, decorator, annotated-types, sqlalchemy, scipy, python-dateutil, pydantic-core, matplotlib-inline, jedi, comm, asttokens, stack-data, pydantic, pandas, duckdb-engine, ipython, ipywidgets, ipytree, BiasAnalyzer\n", - "Successfully installed BiasAnalyzer-0.1.0 annotated-types-0.7.0 asttokens-2.4.1 comm-0.2.2 decorator-5.1.1 duckdb-1.1.3 duckdb-engine-0.13.5 executing-2.1.0 greenlet-3.1.1 ipython-8.29.0 ipytree-0.2.2 ipywidgets-8.1.5 jedi-0.19.1 jupyterlab-widgets-3.0.13 
matplotlib-inline-0.1.7 numpy-1.24.4 packaging-24.1 pandas-2.0.3 parso-0.8.4 pexpect-4.9.0 prompt-toolkit-3.0.48 psycopg2-2.9.10 ptyprocess-0.7.0 pure-eval-0.2.3 pydantic-2.9.2 pydantic-core-2.23.4 pygments-2.18.0 python-dateutil-2.9.0.post0 pytz-2024.2 pyyaml-6.0.2 scipy-1.10.1 six-1.16.0 sqlalchemy-2.0.36 stack-data-0.6.3 traitlets-5.14.3 typing-extensions-4.12.2 tzdata-2024.2 wcwidth-0.2.13 widgetsnbextension-4.0.13\n" - ] - } - ], - "source": [ - "# Have to specify TMPDIR and target in pip install command to work around the kernel crash issue due to \n", - "# the small ephemeral local storage quota allocated to /tmp which is used by default by pip install\n", - "!TMPDIR=/home/hyi/temp pip install git+https://github.com/vaclab/BiasAnalyzer.git --target /home/hyi/temp --upgrade" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9ce3b87c-0754-4eae-9f85-8210104e2b0b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# append the target folder where HealthDataBias module was installed to PYTHONPATH\n", - "import sys\n", - "sys.path.append('/home/hyi/temp')\n", - "import pandas as pd\n", - "pd.set_option('display.max_rows', None)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "729e8803-74f8-4180-aa8b-0e44567f8aeb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from biasanalyzer.api import BIAS" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "548223ed-8948-461e-b9d6-40a0ec7fc89f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "no configuration file specified. 
Call set_config(config_file_path) next to specify configurations\n", - "Cohort Definition table created.\n", - "Cohort table created.\n" - ] - } - ], - "source": [ - "# create an object of BIAS class\n", - "bias = BIAS()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "7d440d9f-c7fa-4ef1-ad66-31274ebef4ea", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "configuration specified in /home/hyi/bias/config/config.yaml loaded successfully\n" - ] - } - ], - "source": [ - "bias.set_config('/home/hyi/bias/config/config.yaml')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "283156f8-63da-42a5-bbd7-ee2b7719652c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connected to the OMOP CDM database (read-only).\n" - ] - } - ], - "source": [ - "# the configuration file includes root_omop_cdm_database configuration info with an example shown \n", - "# in https://github.com/hyi/HealthDataBias/blob/main/tests/assets/config/test_config.yaml\n", - "bias.set_root_omop()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "9a52ab5f-57a8-4942-8a03-ec86651e919e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " domain_id vocabulary_id\n", - "0 Condition HCPCS\n", - "1 Condition ICD10\n", - "2 Condition ICD10CM\n", - "3 Condition ICD9CM\n", - "4 Condition ICDO3\n", - "5 Condition OMOP Extension\n", - "6 Condition SNOMED\n", - "7 Condition/Device ICD10CM\n", - "8 Condition/Meas ICD10CM\n", - "9 Condition Status Condition Status\n", - "10 Cost Cost\n", - "11 Currency Currency\n", - "12 Device HCPCS\n", - "13 Device ICD10PCS\n", - "14 Device NDC\n", - "15 Device SNOMED\n", - "16 Device SPL\n", - "17 Drug ATC\n", - "18 Drug HCPCS\n", - "19 Drug ICD10PCS\n", - "20 Drug NDC\n", - "21 Drug RxNorm\n", - "22 Drug RxNorm Extension\n", - "23 
Drug SNOMED\n", - "24 Drug SPL\n", - "25 Episode Episode\n", - "26 Ethnicity Ethnicity\n", - "27 Gender Gender\n", - "28 Gender SNOMED\n", - "29 Geography OSM\n", - "30 Geography US Census\n", - "31 Measurement HCPCS\n", - "32 Measurement ICD10\n", - "33 Measurement ICD10CM\n", - "34 Measurement ICD9CM\n", - "35 Measurement LOINC\n", - "36 Measurement OMOP Extension\n", - "37 Measurement SNOMED\n", - "38 Meas Value LOINC\n", - "39 Meas Value SNOMED\n", - "40 Meas Value Operator SNOMED\n", - "41 Metadata CDM\n", - "42 Metadata Concept Class\n", - "43 Metadata Domain\n", - "44 Metadata Metadata\n", - "45 Metadata None\n", - "46 Metadata Relationship\n", - "47 Metadata SNOMED\n", - "48 Metadata Vocabulary\n", - "49 Observation HCPCS\n", - "50 Observation ICD10\n", - "51 Observation ICD10CM\n", - "52 Observation ICD9CM\n", - "53 Observation ICDO3\n", - "54 Observation LOINC\n", - "55 Observation NUCC\n", - "56 Observation OMOP Extension\n", - "57 Observation SNOMED\n", - "58 Observation SPL\n", - "59 Observation UB04 Pri Typ of Adm\n", - "60 Observation UB04 Typ bill\n", - "61 Payer PHDSC\n", - "62 Plan Plan\n", - "63 Plan Stop Reason Plan Stop Reason\n", - "64 Procedure HCPCS\n", - "65 Procedure ICD10\n", - "66 Procedure ICD10CM\n", - "67 Procedure ICD10PCS\n", - "68 Procedure ICD9CM\n", - "69 Procedure ICD9Proc\n", - "70 Procedure SNOMED\n", - "71 Provider ABMS\n", - "72 Provider Medicare Specialty\n", - "73 Provider NUCC\n", - "74 Provider SNOMED\n", - "75 Race Race\n", - "76 Race SNOMED\n", - "77 Relationship SNOMED\n", - "78 Revenue Code Korean Revenue Code\n", - "79 Revenue Code Revenue Code\n", - "80 Route SNOMED\n", - "81 Spec Anatomic Site SNOMED\n", - "82 Spec Disease Status SNOMED\n", - "83 Specimen SNOMED\n", - "84 Sponsor Sponsor\n", - "85 Type Concept Condition Type\n", - "86 Type Concept Cost Type\n", - "87 Type Concept Death Type\n", - "88 Type Concept Device Type\n", - "89 Type Concept Drug Type\n", - "90 Type Concept Meas Type\n", - "91 Type Concept 
Note Type\n", - "92 Type Concept Observation Type\n", - "93 Type Concept Obs Period Type\n", - "94 Type Concept Procedure Type\n", - "95 Type Concept SNOMED\n", - "96 Type Concept Type Concept\n", - "97 Type Concept Visit Type\n", - "98 Unit SNOMED\n", - "99 Unit UCUM\n", - "100 Visit CMS Place of Service\n", - "101 Visit Medicare Specialty\n", - "102 Visit NUCC\n", - "103 Visit SNOMED\n", - "104 Visit UB04 Point of Origin\n", - "105 Visit UB04 Pt dis status\n", - "106 Visit UB04 Typ bill\n", - "107 Visit Visit\n" - ] - } - ], - "source": [ - "domains = bias.get_domains_and_vocabularies()\n", - "print(pd.DataFrame(domains))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "b1415805-b065-40b8-b2cd-6db6f5f9ca41", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " concept_id concept_name \\\n", - "0 703440 COVID-19 confirmed using clinical diagnostic c... \n", - "1 37311061 COVID-19 \n", - "2 37310269 COVID-19 \n", - "3 703447 High risk category for developing complication... \n", - "4 703446 Moderate risk category for developing complica... \n", - "5 703445 Low risk category for developing complication ... 
\n", - "6 703441 COVID-19 confirmed by laboratory test \n", - "\n", - " valid_start_date valid_end_date \n", - "0 2020-04-01 2099-12-31 \n", - "1 2020-01-31 2099-12-31 \n", - "2 2020-02-04 2020-10-28 \n", - "3 2020-04-01 2099-12-31 \n", - "4 2020-04-01 2099-12-31 \n", - "5 2020-04-01 2099-12-31 \n", - "6 2020-04-01 2099-12-31 \n" - ] - } - ], - "source": [ - "concepts = bias.get_concepts(\"COVID-19\", \"Condition\", \"SNOMED\")\n", - "print(pd.DataFrame(concepts))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "d54e39da-6f78-4dc1-91ae-a8c26852582a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# get parent and children concept hierarchical tree for COVID-19\n", - "parent_concept_tree, children_concept_tree = bias.get_concept_hierarchy(37311061)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "00f036eb", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "parent concept hierarchy for COVID-19 in text format:\n", - "🔼 COVID-19 (ID: 37311061, Code: 840539006)\n", - " 🔼 Clinical finding (ID: 441840, Code: 404684003)\n", - " 🔼 Viral disease (ID: 440029, Code: 34014006)\n", - " 🔼 Disease (ID: 4274025, Code: 64572001)\n", - " 🔼 Coronavirus infection (ID: 439676, Code: 186747009)\n", - " 🔼 Disease due to Coronaviridae (ID: 4100065, Code: 27619001)\n", - " 🔼 Disorder due to infection (ID: 432250, Code: 40733004)\n", - "\n", - "children concept hierarchy for COVID-19 in text format:\n", - "🔽 COVID-19 (ID: 37311061, Code: 840539006)\n", - " 🔽 Lymphocytopenia due to Severe acute respiratory syndrome coronavirus 2 (ID: 3661631, Code: 866151004)\n", - " 🔽 Otitis media due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 37310254, Code: 1240521000000100)\n", - " 🔽 Respiratory infection caused by COVID-19 (ID: 756039, Code: OMOP4873907)\n", - " 🔽 Acute bronchitis caused by SARS-CoV-2 (ID: 3661405, Code: 138389411000119105)\n", - 
" 🔽 Pneumonia caused by SARS-CoV-2 (ID: 3661408, Code: 882784691000119100)\n", - " 🔽 Lower respiratory infection caused by SARS-CoV-2 (ID: 3663281, Code: 880529761000119102)\n", - " 🔽 Pneumonia caused by SARS-CoV-2 (ID: 3661408, Code: 882784691000119100)\n", - " 🔽 Acute bronchitis caused by SARS-CoV-2 (ID: 3661405, Code: 138389411000119105)\n", - " 🔽 Bronchitis caused by COVID-19 (ID: 756031, Code: OMOP4873909)\n", - " 🔽 Acute bronchitis caused by SARS-CoV-2 (ID: 3661405, Code: 138389411000119105)\n", - " 🔽 Bronchitis caused by COVID-19 (ID: 756031, Code: OMOP4873909)\n", - " 🔽 Acute bronchitis caused by SARS-CoV-2 (ID: 3661405, Code: 138389411000119105)\n", - " 🔽 Infection of upper respiratory tract caused by Severe acute respiratory syndrome coronavirus 2 (ID: 37310286, Code: 1240541000000107)\n", - " 🔽 Encephalopathy due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 37310284, Code: 1240561000000108)\n", - " 🔽 Cardiomyopathy due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 3656667, Code: 119731000146105)\n", - " 🔽 Acute bronchitis caused by SARS-CoV-2 (ID: 3661405, Code: 138389411000119105)\n", - " 🔽 Acute kidney injury due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 3661748, Code: 870589006)\n", - " 🔽 Thrombocytopenia due to Severe acute respiratory syndrome coronavirus 2 (ID: 3661632, Code: 866152006)\n", - " 🔽 Conjunctivitis due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 3656668, Code: 119741000146102)\n", - " 🔽 Pneumonia caused by SARS-CoV-2 (ID: 3661408, Code: 882784691000119100)\n", - " 🔽 Lower respiratory infection caused by SARS-CoV-2 (ID: 3663281, Code: 880529761000119102)\n", - " 🔽 Pneumonia caused by SARS-CoV-2 (ID: 3661408, Code: 882784691000119100)\n", - " 🔽 Acute bronchitis caused by SARS-CoV-2 (ID: 3661405, Code: 138389411000119105)\n", - " 🔽 Bronchitis caused by COVID-19 (ID: 756031, Code: OMOP4873909)\n", - " 🔽 Acute bronchitis 
caused by SARS-CoV-2 (ID: 3661405, Code: 138389411000119105)\n", - " 🔽 Gastroenteritis caused by SARS-CoV-2 (severe acute respiratory syndrome coronavirus 2) (ID: 37310283, Code: 1240571000000101)\n", - " 🔽 Fever caused by Severe acute respiratory syndrome coronavirus 2 (ID: 3661885, Code: 119751000146104)\n", - " 🔽 Acute respiratory distress syndrome due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 3661406, Code: 674814021000119106)\n", - " 🔽 Myocarditis due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 37310287, Code: 1240531000000103)\n", - " 🔽 Rhabdomyolysis due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 3655977, Code: 870591003)\n", - " 🔽 Bronchitis caused by COVID-19 (ID: 756031, Code: OMOP4873909)\n", - " 🔽 Acute bronchitis caused by SARS-CoV-2 (ID: 3661405, Code: 138389411000119105)\n", - " 🔽 Asymptomatic SARS-CoV-2 (ID: 3662381, Code: 189486241000119100)\n", - " 🔽 Infection of upper respiratory tract caused by Severe acute respiratory syndrome coronavirus 2 (ID: 37310286, Code: 1240541000000107)\n", - " 🔽 Sepsis due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 3655975, Code: 870588003)\n", - " 🔽 Dyspnea caused by Severe acute respiratory syndrome coronavirus 2 (ID: 3656669, Code: 119981000146107)\n", - " 🔽 Acute hypoxemic respiratory failure due to disease caused by Severe acute respiratory syndrome coronavirus 2 (ID: 3655976, Code: 870590002)\n", - "\n" - ] - } - ], - "source": [ - "print('parent concept hierarchy for COVID-19 in text format:')\n", - "print(bias.display_concept_tree(parent_concept_tree))\n", - "print('children concept hierarchy for COVID-19 in text format:')\n", - "print(bias.display_concept_tree(children_concept_tree))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e3f5ace2-6cc4-4940-a067-e1a3fc14e1ce", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "parent concept hierarchy for COVID-19 in widget tree format:\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4969f2d9c4f6438ba5557588332be0aa", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Label(value='Concept Hierarchy'), Tree(nodes=(Node(name='🔼 COVID-19 (ID: 37311061, Code: 840539…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "children concept hierarchy for COVID-19 in widget tree format:\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c476950289304222b749351a48121387", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Label(value='Concept Hierarchy'), Tree(nodes=(Node(name='🔽 COVID-19 (ID: 37311061, Code: 840539…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "print(f'parent concept hierarchy for COVID-19 in widget tree format:')\n", - "bias.display_concept_tree(parent_concept_tree, show_in_text_format=False)\n", - "print(f'children concept hierarchy for COVID-19 in widget tree format:')\n", - "bias.display_concept_tree(children_concept_tree, show_in_text_format=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8be5061b-cfdf-4dc0-9ef8-f18277ab9fbe", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Connection to BiasDatabase closed.\n", - "Connection to the OMOP CDM database closed.\n" - ] - } - ], - "source": [ - "bias.cleanup()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7ad0b7b-21dc-4572-af21-fe1580361999", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d03cf95-3c68-4eee-be41-5482dea68b84", - "metadata": { - "tags": [] - }, - "outputs": [], - 
"source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "128c7b02-faef-4a35-97a6-c92baa5a37dd", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From f87c9142a513d55db689ebe3e578c8e25f8c8113 Mon Sep 17 00:00:00 2001 From: hyi Date: Fri, 20 Jun 2025 16:56:04 -0400 Subject: [PATCH 09/10] linked tutorials to readme --- README.md | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a2d5103..f354010 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ containing a list of the concept's children or parents in the hierarchy. - Call `bias.display_concept_tree(parent_concept_tree)` and `bias.display_concept_tree(children_concept_tree)` to display the concept hierarchical tree in an indented text format. If ipytree widget is installed and supported in a Jupyter notebook environment, you can set `show_in_text_format` input parameter to `False` -(e.g., call `bias.display_concept_tree(parent_concept_tree, show_in_text_format=False)`)to leverage the tree widget for displaying +(e.g., call `bias.display_concept_tree(parent_concept_tree, show_in_text_format=False)`) to leverage the tree widget for displaying the hierarchy in a tree that can be expanded and collapsed on demand interactively. In addition to exploring the concepts using BiasAnalyzer APIs, the main functionalities of the BiasAnalyzer is @@ -88,10 +88,13 @@ The following code snippets show some examples. 
 ```
 Note that currently the `get_stats()` method only returns statistics of age, gender, race, and ethnicity of a cohort
 and `get_distributions()` method only returns distribution of age and gender in a cohort.
-- You can also get patient counts and prevalence with each diagnostic condition concept code in a cohort by accessing
+- You can also explore concept prevalence within a cohort - a key step in identifying potential biases during
+cohort selection. A concept refers to a coded term from a standardized medical vocabulary, uniquely identified by a
+concept ID. All clinical events in OMOP, such as conditions, drug exposures, procedures, measurements, and events, are
+represented as concepts. You can get patient counts and prevalence associated with each concept by accessing
 the method `get_concept_stats()` with a code snippet example shown below.
 ```angular2html
-    cohort_concepts = baseline_cohort_data.get_concept_stats()
+    cohort_concepts = baseline_cohort_data.get_concept_stats(concept_type='condition_occurrence')
     print(pd.DataFrame(cohort_concepts["condition_occurrence"]))
 ```
 - There is also an API method that enables users to compare distributions of two cohorts by calling `bias.compare_cohorts(cohort1_id, cohort2_id)`
@@ -99,4 +102,21 @@ where cohort1_id and cohort2_id are integers and can be obtained from metadata o
 only hellinger distances between distributions of two cohorts are computed.
 
 - After all analysis is done, please make sure to close database connections and do necessary cleanups by calling
-the API method `bias.cleanup()`.
\ No newline at end of file
+the API method `bias.cleanup()`.
+
+--
+
+## 📘 Tutorial Notebooks
+
+To help users get started with the `BiasAnalyzer` python package, four Jupyter notebooks are
+provided in the [`notebooks/`](https://github.com/VACLab/BiasAnalyzer/tree/main/notebooks)
+directory. These tutorials walk users through key features and workflows with illustrative examples.
+ +| Tutorial | Description | +|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [BiasAnalyzerCohortsTutorial.ipynb](https://github.com/VACLab/BiasAnalyzer/blob/main/notebooks/BiasAnalyzerCohortsTutorial.ipynb) | Demonstrates how to create baseline and study cohorts, retrieve cohort statistics, and compare cohort distributions. | +| [BiasAnalyzerAsyncCohortsTutorial.ipynb](https://github.com/VACLab/BiasAnalyzer/blob/main/notebooks/BiasAnalyzerAsyncCohortsTutorial.ipynb) | As a companion to the Cohort tutorial above, demonstrates how to create and analyze cohorts asynchronously for improved performance and responsiveness when working with large datasets or complex cohort definitions. | +| [BiasAnalyzerCohortConceptTutorial.ipynb](https://github.com/VACLab/BiasAnalyzer/blob/main/notebooks/BiasAnalyzerCohortConceptTutorial.ipynb) | Demonstrates how to explore clinical concept prevalence within a cohort, helping users analyze clinical concept prevalence and identify potential cohort selection biases. | +| [BiasAnalyzerConceptBrowsingTutorial.ipynb](https://github.com/VACLab/BiasAnalyzer/blob/main/notebooks/BiasAnalyzerConceptBrowsingTutorial.ipynb) | Guides users through browsing OMOP concepts, domains, and vocabularies, including how to retrieve and visualize concept hierarchies. | + +These tutorials are designed to be run in a Jupyter environment with access to an OMOP-compatible postgreSQL or DuckDB database. 
From f425071eef1641538213d30a86b40aa5c97e3538 Mon Sep 17 00:00:00 2001 From: hyi Date: Fri, 20 Jun 2025 16:57:33 -0400 Subject: [PATCH 10/10] minor updates to readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f354010..fb444c1 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ only hellinger distances between distributions of two cohorts are computed. - After all analysis is done, please make sure to close database connections and do necessary cleanups by calling the API method `bias.cleanup()`. --- +--- ## 📘 Tutorial Notebooks @@ -119,4 +119,4 @@ directory. These tutorials walk users through key features and workflows with il | [BiasAnalyzerCohortConceptTutorial.ipynb](https://github.com/VACLab/BiasAnalyzer/blob/main/notebooks/BiasAnalyzerCohortConceptTutorial.ipynb) | Demonstrates how to explore clinical concept prevalence within a cohort, helping users analyze clinical concept prevalence and identify potential cohort selection biases. | | [BiasAnalyzerConceptBrowsingTutorial.ipynb](https://github.com/VACLab/BiasAnalyzer/blob/main/notebooks/BiasAnalyzerConceptBrowsingTutorial.ipynb) | Guides users through browsing OMOP concepts, domains, and vocabularies, including how to retrieve and visualize concept hierarchies. | -These tutorials are designed to be run in a Jupyter environment with access to an OMOP-compatible postgreSQL or DuckDB database. +These tutorials are designed to run in a Jupyter environment with access to an OMOP-compatible postgreSQL or DuckDB database.