diff --git a/0.preprocessing_data/notebooks/0.patient_specific_preprocessing.ipynb b/0.preprocessing_data/notebooks/0.patient_specific_preprocessing.ipynb index 8b608a1..a7d6f3e 100644 --- a/0.preprocessing_data/notebooks/0.patient_specific_preprocessing.ipynb +++ b/0.preprocessing_data/notebooks/0.patient_specific_preprocessing.ipynb @@ -105,7 +105,7 @@ ], "metadata": { "kernelspec": { - "display_name": "gff_preprocessing_env", + "display_name": ".venv (3.11.11)", "language": "python", "name": "python3" }, @@ -119,7 +119,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/0.preprocessing_data/notebooks/1.make_zstack_and_copy_over.ipynb b/0.preprocessing_data/notebooks/1.make_zstack_and_copy_over.ipynb new file mode 100644 index 0000000..7b87aae --- /dev/null +++ b/0.preprocessing_data/notebooks/1.make_zstack_and_copy_over.ipynb @@ -0,0 +1,11553 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dd91c916", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a03e0a4d", + "metadata": {}, + "outputs": [], + "source": [ + "import argparse\n", + "import os\n", + "import pathlib\n", + "import re\n", + "import sys\n", + "\n", + "import numpy as np\n", + "import tifffile\n", + "import tqdm\n", + "from notebook_init_utils import avoid_path_crash_bandicoot, init_notebook\n", + "from preprocessing_funcs import (\n", + " check_well_dir_name_format,\n", + " get_to_the_unested_dir,\n", + " get_well_fov_dirs,\n", + " read_2D_image_for_zstacking,\n", + ")\n", + "\n", + "root_dir, in_notebook = init_notebook()\n", + "\n", + "if in_notebook:\n", + " import tqdm.notebook as tqdm\n", + "else:\n", + " import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d8aa9728", + "metadata": {}, + "outputs": [], + "source": [ + "argparse = argparse.ArgumentParser(\n", + " description=\"Copy files from one directory to another\"\n", + ")\n", + "argparse.add_argument(\"--HPC\", action=\"store_true\", help=\"Type of compute to run on\")\n", + "# Parse arguments\n", + "args = argparse.parse_args(args=sys.argv[1:] if \"ipykernel\" not in sys.argv[0] else [])\n", + "HPC = args.HPC" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "72dfffaf", + "metadata": {}, + "outputs": [], + "source": [ + "# check if bandicoot is set\n", + "bandicoot_path = pathlib.Path(os.path.expanduser(\"~/mnt/bandicoot\")).resolve()\n", + "if not HPC and bandicoot_path.exists():\n", + " bandicoot = True\n", + "else:\n", + " bandicoot = False\n", + "\n", + "bandicoot = False" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "186468e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw image dir: /home/lippincm/Desktop/20TB_A/NF1_Patient_organoids\n" + ] + } + ], + "source": [ + "if HPC:\n", + " raw_image_dir = pathlib.Path(\"/pl/active/koala/GFF_Data/GFF-Raw/\").resolve(\n", + " strict=True\n", + " )\n", + " output_base_dir = root_dir\n", + "elif bandicoot:\n", + " # comment out depending on whose computer you are on\n", + " # mike's computer\n", + " bandicoot_path = pathlib.Path(\n", + " os.path.expanduser(\"~/mnt/bandicoot/NF1_organoid_data\")\n", + " ).resolve(strict=True)\n", + " raw_image_dir = pathlib.Path(f\"{bandicoot_path}/Raw_patient_files\").resolve(\n", + " strict=True\n", + " )\n", + " output_base_dir = bandicoot_path\n", + "else:\n", + " # comment out depending on whose computer you are on\n", + " # mike's computer\n", + " raw_image_dir = pathlib.Path(\n", + " os.path.expanduser(\"~/Desktop/20TB_A/NF1_Patient_organoids\")\n", + " ).resolve(strict=True)\n", + " # Jenna's computer\n", + " # raw_image_dir_local = pathlib.Path(\"/media/18tbdrive/GFF_organoid_data/\")\n", + " output_base_dir = root_dir\n", + "print(f\"Raw image dir: {raw_image_dir}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f36481e2", + "metadata": {}, + "source": [ + "## Define paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b44eb61", + "metadata": {}, + "outputs": [], + "source": [ + "# Define parent and destination directories in a single dictionary\n", + "\"\"\"\n", + "This dictionary maps patient IDs to their corresponding parent directories,\n", + "destination directories, and metadata for processing.\n", + "Nested dictionary name: patientID_tumorID\n", + "Keys:\n", + "- parent: Path to the parent directory containing raw images.\n", + "- destination: Path to the output directory for processed images.\n", + "- times_nested: Number of nested timepoint directories.\n", + "- well_position: Index of the well position in the filename when split by underscores.\n", + "- channel_position: Index of the channel position in the filename when split by underscores.\n", + "\"\"\"\n", + "\n", + "dir_mapping = {\n", + " \"NF0014_T1\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0014-Thawed 3 (Raw image files)-Combined/NF0014-Thawed 3 (Raw image files)-Combined copy\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0014_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"NF0014_T2\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0014-T2 Cell Painting/NF0014-T2 Combined/\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0014_T2/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"NF0016_T1\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0016 Cell Painting-Pilot Drug Screening-selected/NF0016-Cell Painting Images/NF0016-images copy\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0016_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"NF0017\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0017-T3-P7 (AGP, Mito Parameter optimization)/Acquisition 03-07-2025\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0017/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"NF0018_T6\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0018 (T6) Cell Painting-Pilot Drug Screeining/NF0018-Cell Painting Images/NF0018-All Acquisitions\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0018_T6/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"NF0021_T1\": {\n", + " \"parent\": pathlib.Path(f\"{raw_image_dir}/NF0021-T1/NF0021-T1 Combined\").resolve(\n", + " strict=True\n", + " ),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0021_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"NF0030_T1\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0030 Cell Painting/NF0030 Cell Painting/NF0030-Cell Painting Images/Combined\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0030_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"NF0031_T1_part_I\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0031-T1 Combined 1_2/NF0031-T1 Combined 1:2\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0031_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 0,\n", + " \"well_position\": -1,\n", + " \"channel_position\": -3,\n", + " },\n", + " \"NF0031_T1_part_II\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0031-T1 Combined 2_2/NF0031-T1 Combined 2:2\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0031_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 0,\n", + " \"well_position\": -1,\n", + " \"channel_position\": -3,\n", + " },\n", + " \"NF0035_T1_part_I\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0035-T1-Part-1/NF0035-T1-Combined Part-1\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0035_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"NF0035_T1_part_II\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0035-T1-Part-2/NF0035-T1-Combined Part-2\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0035_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"NF0037-T1-Z-1\": {\n", + " \"parent\": pathlib.Path(f\"{raw_image_dir}/NF0037-T1-Z-1/NF0037-T1-Z-1\").resolve(\n", + " strict=True\n", + " ),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0037_T1-Z-1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 0,\n", + " \"well_position\": -1,\n", + " \"channel_position\": -3,\n", + " },\n", + " \"NF0037-T1-Z-0.5\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0037-T1-Z-0.5/NF0037-T1-ZTEST2-0.5\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0037_T1-Z-0.5/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 0,\n", + " \"well_position\": -1,\n", + " \"channel_position\": -3,\n", + " },\n", + " \"NF0037-T1-Z-0.2\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0037-T1-Z-0.2/NF0037-T1-Z-0.2/NF0037-T1-ZTEST2-0.2\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0037_T1-Z-0.2/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 0,\n", + " \"well_position\": -1,\n", + " \"channel_position\": -3,\n", + " },\n", + " \"NF0037-T1-Z-0.1\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/NF0037-T1-Z-0.1/NF0037-T1-Z-0.1/NF0037-T1-ZTEST2\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0037_T1-Z-0.1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 0,\n", + " \"well_position\": -1,\n", + " \"channel_position\": -3,\n", + " },\n", + " \"NF0040_T1\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/SARC0376 (NF0040) Cell Painting/SARC0376 (NF0040) Cell Painting/SARC0376 (NF0040)-Cell Painting Images/Combined\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/NF0040_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"SACRO219_T1\": {\n", + " \"parent\": pathlib.Path(\n", + " f\"{raw_image_dir}/SARC0219-T2 Cell Painting-selected/SARC0219-T2 Combined Cell Painting images/SARC0219-T2 Combined/\"\n", + " ).resolve(strict=True),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/SARCO219_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + " \"SARCO361_T1\": {\n", + " \"parent\": pathlib.Path(f\"{raw_image_dir}/SARC0361/SARC0361 Combined/\").resolve(\n", + " strict=True\n", + " ),\n", + " \"destination\": pathlib.Path(\n", + " f\"{output_base_dir}/data/SARCO361_T1/zstack_images\"\n", + " ).resolve(),\n", + " \"times_nested\": 2,\n", + " \"well_position\": 0,\n", + " \"channel_position\": 1,\n", + " },\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "4fbd735d", + "metadata": {}, + "source": [ + "## Copy and zstack images" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b9e8e22c", + "metadata": {}, + "outputs": [], + "source": [ + "# image channel names and extensions\n", + "image_extensions = {\".tif\", \".tiff\"}\n", + "channel_names = [\"405\", \"488\", \"555\", \"640\", \"TRANS\"]\n", + "# make a dictionary that contains a list for each channel name, storing both filepath and filename\n", + "channel_images = {\n", + " channel_name: {\"filename\": [], \"filepath\": []} for channel_name in channel_names\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "71d893d6", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "09b775bd5e68443cb3763697218cf591", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing patients: 0%| | 0/8 [00:00 bool:\n", - " \"\"\"Check that all nested folders within a well directory have the same names as the well directory itself.\n", - "\n", - " Args:\n", - " well_dir (pathlib.Path): Path to a single well directory.\n", - "\n", - " Returns:\n", - " bool: True if all nested folders inside this well directory have the same name as the well directory, False otherwise.\n", - " \"\"\"\n", - " # Get the name of the well directory (this will be the expected folder name)\n", - " well_name = well_dir.name\n", - "\n", - " # Get the immediate subdirectories in the well directory (e.g., Field_1, Field_2)\n", - " sub_dirs = [d for d in well_dir.iterdir() if d.is_dir()]\n", - "\n", - " if not sub_dirs:\n", - " return False # No nested folders found, treat as inconsistent\n", - "\n", - " # Check if each subdirectory contains a nested folder with the same name as the well directory\n", - " for sub in sub_dirs:\n", - " nested_folders = [d.name for d in sub.iterdir() if d.is_dir()]\n", - " if well_name not in nested_folders:\n", - " return False # Inconsistent folder structure found\n", - "\n", - " return True # All subdirectories have a nested folder with the same name as the well directory\n", - "\n", - "\n", - "def is_image_folder_empty(nested_dir: pathlib.Path) -> bool:\n", - " \"\"\"Check if a nested directory contains any images.\n", - "\n", - " Args:\n", - " nested_dir (pathlib.Path): Path to a directory nested within the well directory\n", - "\n", - " Returns:\n", - " bool: Boolean indicating whether the nested directory contains any images\n", - " \"\"\"\n", - " return not any(\n", - " image.suffix.lower() in image_extensions for image in nested_dir.rglob(\"*\")\n", - " )\n", - "\n", - "\n", - "def has_equal_images_per_channel(\n", - " nested_dir: pathlib.Path, channel_names: list[str]\n", - ") -> bool:\n", - " \"\"\"Check if all specified channels have the same number of images by looking for the channel name in the filenames.\n", - "\n", - " Args:\n", - " nested_dir (pathlib.Path): Path to a directory nested within the well directory.\n", - " channel_names (list[str]): List of strings of the channel names found in the nested directory.\n", - "\n", - " Returns:\n", - " bool: Boolean indicating whether all specified channels have the same number of images.\n", - " \"\"\"\n", - " # Initialize counts for each channel\n", - " channel_counts = {channel: 0 for channel in channel_names}\n", - "\n", - " # Count images for each channel based on the channel name in the filename\n", - " for image in nested_dir.rglob(\"*\"): # Search for all files recursively\n", - " if image.suffix.lower() in image_extensions: # Ensure it's an image file\n", - " for channel in channel_names:\n", - " if (\n", - " channel in image.name\n", - " ): # If the channel name is found in the image filename\n", - " channel_counts[channel] += 1\n", - "\n", - " # Get the unique set of image counts (if all counts are equal, there should be only one unique value)\n", - " image_counts = set(channel_counts.values())\n", - "\n", - " # If all counts are equal and non-zero, return True; otherwise, return False\n", - " return len(image_counts) == 1 and 0 not in image_counts" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run this cell through the script" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Function to process a single nested directory\n", - "\n", - "\n", - "def process_nested_dir(nested_dir, dest_well_dir, channel_names, image_extensions):\n", - " if not nested_dir.is_dir():\n", - " return f\"Skipping {nested_dir}: Not a directory\"\n", - "\n", - " if is_image_folder_empty(nested_dir):\n", - " return f\"Skipping {nested_dir}: No images found\"\n", - "\n", - " if not has_equal_images_per_channel(nested_dir, channel_names):\n", - " return f\"Skipping {nested_dir}: Unequal images per channel\"\n", - "\n", - " # Copy images to destination, skipping files with 'Tile' in their name\n", - " for image in nested_dir.rglob(\"*\"):\n", - " if image.suffix.lower() in image_extensions and \"Tile\" not in image.name:\n", - " shutil.copy2(image, dest_well_dir)\n", - "\n", - " return f\"Processed {nested_dir}\"\n", - "\n", - "\n", - "# Function to process a single well directory\n", - "def process_well_dir(well_dir, dest_dir, channel_names, image_extensions):\n", - " if not has_consistent_naming(well_dir):\n", - " return f\"Skipping {well_dir.stem}: Inconsistent nested folder names within well\"\n", - "\n", - " dest_well_dir = dest_dir / well_dir.name\n", - " dest_well_dir.mkdir(parents=True, exist_ok=True)\n", - "\n", - " nested_dirs = list(well_dir.iterdir())\n", - " for nested_dir in nested_dirs:\n", - " process_nested_dir(\n", - " nested_dir,\n", - " dest_well_dir,\n", - " channel_names,\n", - " image_extensions,\n", - " )\n", - "\n", - "\n", - "# Set channel names\n", - "channel_names = {\"405\", \"488\", \"555\", \"640\", \"TRANS\", \"Merge\"}\n", - "\n", - "# Loop through each key in the mapping to copy data from the parent to the destination\n", - "for key, paths in dir_mapping.items():\n", - " parent_dir = paths[\"parent\"]\n", - " dest_dir = paths[\"destination\"]\n", - "\n", - " print(f\"Processing {key}: {parent_dir} -> {dest_dir}\")\n", - "\n", - " # Ensure the destination directory exists\n", - " dest_dir.mkdir(parents=True, exist_ok=True)\n", - "\n", - " # Get all well-level directories\n", - " well_dirs = [d for d in parent_dir.iterdir() if d.is_dir()]\n", - "\n", - " if not well_dirs:\n", - " print(f\"Skipping {key}: No well directories found\")\n", - " continue\n", - " # Process well directories in parallel\n", - " with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count() - 2) as executor:\n", - " futures = [\n", - " executor.submit(\n", - " process_well_dir, well_dir, dest_dir, channel_names, image_extensions\n", - " )\n", - " for well_dir in well_dirs\n", - " ]\n", - " for future in tqdm.tqdm(\n", - " as_completed(futures),\n", - " desc=f\"Processing {key}\",\n", - " leave=False,\n", - " total=len(well_dirs),\n", - " ):\n", - " pass\n", - "\n", - " print(f\"Completed processing {key}: {parent_dir} -> {dest_dir}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## NF0016 specific preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "parent_dir_NF0016 = pathlib.Path(\n", - " f\"{output_base_dir}/data/NF0016_T1/raw_images\"\n", - ").resolve(strict=True)\n", - "# get all dirs in the parent dir\n", - "parent_dir_NF0016 = list(parent_dir_NF0016.glob(\"*/\"))\n", - "parent_dir_NF0016 = [x for x in parent_dir_NF0016 if x.is_dir()]\n", - "# get all child files in the parent dir\n", - "file_dir_NF0016 = []\n", - "for parent_dir in parent_dir_NF0016:\n", - " file_dir_NF0016.extend(list(parent_dir.glob(\"*\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# rename the files in the parent dir\n", - "for file in file_dir_NF0016:\n", - " new_file_dir = pathlib.Path(\n", - " f\"{file.parent}/{str(file.stem).replace(' (60X)', '')}.{file.suffix}\"\n", - " )\n", - " file.rename(new_file_dir)\n", - "\n", - "# rename the parent dir\n", - "for parent_dir in parent_dir_NF0016:\n", - " new_parent_dir = pathlib.Path(\n", - " f\"{parent_dir.parent}/{str(parent_dir.stem).replace(' (60X)', '')}\"\n", - " )\n", - " # rename the parent dir\n", - " os.rename(parent_dir, new_parent_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## NF0018 specific preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "parent_dir_NF0018 = pathlib.Path(\n", - " f\"{output_base_dir}/data/NF0018_T6/raw_images\"\n", - ").resolve(strict=True)\n", - "# get all dirs in the parent dir\n", - "parent_dir_NF0018 = list(parent_dir_NF0018.glob(\"*/\"))\n", - "parent_dir_NF0018 = [x for x in parent_dir_NF0018 if x.is_dir()]\n", - "# get all child files in the parent dir\n", - "file_dir_NF0018 = []\n", - "for parent_dir in parent_dir_NF0018:\n", - " file_dir_NF0018.extend(list(parent_dir.glob(\"*\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# rename the files in the parent dir\n", - "for file in file_dir_NF0018:\n", - " new_file_dir = pathlib.Path(\n", - " f\"{file.parent}/{str(file.stem).replace(' (60X)', '')}{file.suffix}\"\n", - " )\n", - " file.rename(new_file_dir)\n", - "\n", - "# rename the parent dir\n", - "for parent_dir in parent_dir_NF0018:\n", - " new_parent_dir = pathlib.Path(\n", - " f\"{parent_dir.parent}/{str(parent_dir.stem).replace(' (60X)', '')}\"\n", - " )\n", - " # rename the parent dir\n", - " os.rename(parent_dir, new_parent_dir)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "gff_preprocessing_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/0.preprocessing_data/notebooks/2.perform_file_corruption_checks.ipynb b/0.preprocessing_data/notebooks/2.perform_file_corruption_checks.ipynb new file mode 100644 index 0000000..5653340 --- /dev/null +++ b/0.preprocessing_data/notebooks/2.perform_file_corruption_checks.ipynb @@ -0,0 +1,560 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check the files for corrupted files, or files that were not copied over correctly." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pathlib\n", + "import pprint\n", + "import re\n", + "import sys\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import tifffile as tiff\n", + "from notebook_init_utils import avoid_path_crash_bandicoot, init_notebook\n", + "\n", + "root_dir, in_notebook = init_notebook()\n", + "\n", + "if in_notebook:\n", + " import tqdm.notebook as tqdm\n", + "else:\n", + " import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def max_z_projection(patient: str, well_fov: str) -> None:\n", + " \"\"\"\n", + " Create a maximum intensity projection of the z-stack images for a given patient and well_fov.\n", + " Args:\n", + " patient (str): The patient identifier.\n", + " well_fov (str): The well and field of view identifier, formatted as \"well_fov\".\n", + " Returns:\n", + " None: The function saves the maximum intensity projection images to the specified output directory.\n", + " Raises:\n", + " FileNotFoundError: If the specified raw images directory does not exist.\n", + " ValueError: If no images are found for the specified channels.\n", + " \"\"\"\n", + " # Image extensions that we are looking to copy\n", + " image_extensions = {\".tif\", \".tiff\"}\n", + " channel_names = [\"405\", \"488\", \"555\", \"640\", \"TRANS\"]\n", + " # make a dictionary that contains a list for each channel name, storing both filepath and filename\n", + " channel_images = {\n", + " channel_name: {\"filename\": [], \"filepath\": []} for channel_name in channel_names\n", + " }\n", + " raw_images_path = pathlib.Path(f\"{root_dir}/data/{patient}/raw_images\").resolve(\n", + " strict=True\n", + " )\n", + " zstack_output_path = pathlib.Path(\n", + " f\"{root_dir}/data/{patient}/zstack_images\"\n", + " ).resolve(strict=True)\n", + " well_fov_dir = raw_images_path / well_fov\n", + " channel_images = {\n", + " channel_name: {\"filename\": [], \"filepath\": []} for channel_name in channel_names\n", + " }\n", + " for filename in well_fov_dir.glob(\"*\"):\n", + " if filename.suffix in image_extensions:\n", + " for channel_name in channel_names:\n", + " if channel_name in filename.name:\n", + " channel_images[channel_name][\"filepath\"].append(filename)\n", + "\n", + " for channel_name in tqdm.tqdm(\n", + " channel_names, desc=f\"Processing channels in {well_fov_dir.name}\", leave=False\n", + " ):\n", + " channel_images[channel_name][\"filepath\"] = sorted(\n", + " channel_images[channel_name][\"filepath\"]\n", + " )\n", + " if not channel_images[channel_name][\"filepath\"]:\n", + " print(\n", + " f\"No files found for channel {channel_name} in {well_dir}. Skipping...\"\n", + " )\n", + " continue\n", + "\n", + " images_to_stack = np.array(\n", + " [\n", + " tiff.imread(filepath)\n", + " for filepath in channel_images[channel_name][\"filepath\"]\n", + " ]\n", + " )\n", + " filepath = channel_images[channel_name][\"filepath\"][0]\n", + " well = str(filepath.parent).split(\"/\")[-1]\n", + " output_path = zstack_output_path / f\"{well}\" / f\"{well}_{channel_name}.tif\"\n", + " output_path.parent.mkdir(exist_ok=True, parents=True)\n", + " print(\n", + " f\"Saving max projection for {channel_name} channel in {well_fov} to {output_path}\"\n", + " )\n", + " tiff.imwrite(output_path, images_to_stack)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set input and output directories" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# check if bandicoot is set\n", + "bandicoot_path = pathlib.Path(os.path.expanduser(\"~/mnt/bandicoot\")).resolve()\n", + "\n", + "bandicoot = True" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "if bandicoot:\n", + " # comment out depending on whose computer you are on\n", + " # mike's computer\n", + " bandicoot_path = pathlib.Path(\n", + " os.path.expanduser(\"~/mnt/bandicoot/NF1_organoid_data\")\n", + " ).resolve(strict=True)\n", + " output_base_dir = bandicoot_path\n", + "else:\n", + " # comment out depending on whose computer you are on\n", + " # mike's computer\n", + " raw_image_dir = pathlib.Path(\n", + " os.path.expanduser(\"~/Desktop/20TB_A/NF1_Patient_organoids\")\n", + " ).resolve(strict=True)\n", + " # Jenna's computer\n", + " # raw_image_dir_local = pathlib.Path(\"/media/18tbdrive/GFF_organoid_data/\")\n", + " output_base_dir = raw_image_dir" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "patient_id_file_path = pathlib.Path(f\"{root_dir}/data/patient_IDs.txt\").resolve(\n", + " strict=True\n", + ")\n", + "list_of_patients = pd.read_csv(patient_id_file_path, header=None)[0].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'NF0014_T1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/zstack_images')},\n", + " 'NF0016_T1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0016_T1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0016_T1/zstack_images')},\n", + " 'NF0018_T6': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0018_T6/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0018_T6/zstack_images')},\n", + " 'NF0021_T1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0021_T1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0021_T1/zstack_images')},\n", + " 'NF0030_T1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0030_T1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0030_T1/zstack_images')},\n", + " 'NF0031_T1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0031_T1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0031_T1/zstack_images')},\n", + " 'NF0035_T1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0035_T1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0035_T1/zstack_images')},\n", + " 'NF0037_T1-Z-0.1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-0.1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-0.1/zstack_images')},\n", + " 'NF0037_T1-Z-0.2': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-0.2/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-0.2/zstack_images')},\n", + " 'NF0037_T1-Z-0.5': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-0.5/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-0.5/zstack_images')},\n", + " 'NF0037_T1-Z-1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0037_T1-Z-1/zstack_images')},\n", + " 'NF0040_T1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0040_T1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/NF0040_T1/zstack_images')},\n", + " 'SARCO219_T2': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/SARCO219_T2/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/SARCO219_T2/zstack_images')},\n", + " 'SARCO361_T1': {'raw_images': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/SARCO361_T1/raw_images'),\n", + " 'zstack_output': PosixPath('/home/lippincm/mnt/bandicoot/NF1_organoid_data/data/SARCO361_T1/zstack_images')}}\n" + ] + } + ], + "source": [ + "patient_input_dict = {}\n", + "for patient in list_of_patients:\n", + " patient_input_dict[patient] = {\n", + " \"raw_images\": pathlib.Path(\n", + " f\"{output_base_dir}/data/{patient}/raw_images\"\n", + " ).resolve(),\n", + " \"zstack_output\": pathlib.Path(\n", + " f\"{output_base_dir}/data/{patient}/zstack_images\"\n", + " ).resolve(),\n", + " }\n", + "pprint.pprint(patient_input_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create list of the well-site folders\n", + "Create a list of the well-site folders in the stack directory.\n", + "Then loop through each well-site folder and create a list of the channel images.\n", + "Then find (if any) corrupted files in the channel images.\n", + "This is done by checking if the size of the channel images for a given well-fov is the same as the size of the channel images for the other well-fovs.\n", + "If the size is different, then the file is corrupted." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "03df7ee9d6e148c68f98d81eef652692", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing patients: 0%| | 0/14 [00:00 None:\n", - " \"\"\"\n", - " Create a maximum intensity projection of the z-stack images for a given patient and well_fov.\n", - " Args:\n", - " patient (str): The patient identifier.\n", - " well_fov (str): The well and field of view identifier, formatted as \"well_fov\".\n", - " Returns:\n", - " None: The function saves the maximum intensity projection images to the specified output directory.\n", - " Raises:\n", - " FileNotFoundError: If the specified raw images directory does not exist.\n", - " ValueError: If no images are found for the specified channels.\n", - " \"\"\"\n", - " # Image extensions that we are looking to copy\n", - " image_extensions = {\".tif\", \".tiff\"}\n", - " channel_names = [\"405\", \"488\", \"555\", \"640\", \"TRANS\"]\n", - " # make a dictionary that contains a list for each channel name, storing both filepath and filename\n", - " channel_images = {\n", - " channel_name: {\"filename\": [], \"filepath\": []} for channel_name in channel_names\n", - " }\n", - " raw_images_path = pathlib.Path(f\"{root_dir}/data/{patient}/raw_images\").resolve(\n", - " strict=True\n", - " )\n", - " zstack_output_path = pathlib.Path(\n", - " f\"{root_dir}/data/{patient}/zstack_images\"\n", - " ).resolve(strict=True)\n", - " well_fov_dir = raw_images_path / well_fov\n", - " channel_images = {\n", - " channel_name: {\"filename\": [], \"filepath\": []} for channel_name in channel_names\n", - " }\n", - " for filename in well_fov_dir.glob(\"*\"):\n", - " if filename.suffix in image_extensions:\n", - " for channel_name in channel_names:\n", - " if channel_name in filename.name:\n", - " channel_images[channel_name][\"filepath\"].append(filename)\n", - "\n", - " for channel_name in tqdm.tqdm(\n", - " channel_names, desc=f\"Processing channels in {well_fov_dir.name}\", leave=False\n", - " ):\n", - " channel_images[channel_name][\"filepath\"] = sorted(\n", - " channel_images[channel_name][\"filepath\"]\n", - " )\n", - " if not channel_images[channel_name][\"filepath\"]:\n", - " print(\n", - " f\"No files found for channel {channel_name} in {well_dir}. Skipping...\"\n", - " )\n", - " continue\n", - "\n", - " images_to_stack = np.array(\n", - " [\n", - " tiff.imread(filepath)\n", - " for filepath in channel_images[channel_name][\"filepath\"]\n", - " ]\n", - " )\n", - " filepath = channel_images[channel_name][\"filepath\"][0]\n", - " well = str(filepath.parent).split(\"/\")[-1]\n", - " output_path = zstack_output_path / f\"{well}\" / f\"{well}_{channel_name}.tif\"\n", - " output_path.parent.mkdir(exist_ok=True, parents=True)\n", - " print(\n", - " f\"Saving max projection for {channel_name} channel in {well_fov} to {output_path}\"\n", - " )\n", - " tiff.imwrite(output_path, images_to_stack)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set input and output directories" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bandicoot_path = pathlib.Path(os.path.expanduser(\"~/mnt/bandicoot\")).resolve()\n", - "raw_image_dir, output_base_dir = avoid_path_crash_bandicoot(bandicoot_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# patient_ids\n", - "# patient_id_file_path = pathlib.Path(f\"{raw_image_dir}/data/patient_IDs.txt\").resolve(\n", - "# strict=True\n", - "# )\n", - "# list_of_patients = pd.read_csv(patient_id_file_path, header=None)[0].tolist()\n", - "\n", - "list_of_patients = [\"NF0035_T1\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'NF0035_T1': {'raw_images': PosixPath('~/mnt/bandicoot/NF1_organoid_data/NF1_organoid_data/data/NF0035_T1/raw_images'),\n", - " 'zstack_output': PosixPath('~/mnt/bandicoot/NF1_organoid_data/data/NF0035_T1/zstack_images')}}\n" - ] - } - ], - "source": [ - "patient_input_dict = {}\n", - "for patient in list_of_patients:\n", - " patient_input_dict[patient] = {\n", - " \"raw_images\": pathlib.Path(\n", - " f\"{raw_image_dir}/data/{patient}/raw_images\"\n", - " ).resolve(),\n", - " \"zstack_output\": pathlib.Path(\n", - " f\"{output_base_dir}/data/{patient}/zstack_images\"\n", - " ).resolve(),\n", - " }\n", - "pprint.pprint(patient_input_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create list of the well-site folders\n", - "Create a list of the well-site folders in the stack directory.\n", - "Then loop through each well-site folder and create a list of the channel images.\n", - "Then find (if any) corrupted files in the channel images.\n", - "This is done by checking if the size of the channel images for a given well-fov is the same as the size of the channel images for the other well-fovs.\n", - "If the size is different, then the file is corrupted." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "67805e53d95d414098daf747a168ab2a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Processing patients: 0%| | 0/1 [00:00=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "gff_3d_utils" +version = "0.0.0" +description = "Utility package for GFF 3D organoid profiling pipeline" +requires-python = ">=3.11" +authors = [ { name = "Your Name", email = "you@example.com" } ] +license = { text = "MIT" } + +dependencies = [ + "jupyter", + "tifffile", + "jupyterlab", + "pandas", + "scipy", + "numpy", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "black", + "ruff", + "pre-commit", + "ipykernel", + +] + + + +[tool.setuptools.packages.find] +where = ["."] +include = ["errors*"] + +[dependency-groups] +dev = [ + "ipykernel>=7.1.0", + "uv>=0.9.7", +] diff --git a/0.preprocessing_data/scripts/1.make_zstack_and_copy_over.py b/0.preprocessing_data/scripts/1.make_zstack_and_copy_over.py new file mode 100644 index 0000000..c5b138a --- /dev/null +++ b/0.preprocessing_data/scripts/1.make_zstack_and_copy_over.py @@ -0,0 +1,406 @@ +#!/usr/bin/env python +# coding: utf-8 + +# ## Imports + +# In[1]: + + +import argparse +import os +import pathlib +import sys + +import numpy as np +import tifffile +import tqdm +from notebook_init_utils import avoid_path_crash_bandicoot, init_notebook +from preprocessing_funcs import ( + check_well_dir_name_format, + get_to_the_unested_dir, + get_well_fov_dirs, + read_2D_image_for_zstacking, +) + +root_dir, in_notebook = init_notebook() + +if in_notebook: + import tqdm.notebook as tqdm +else: + import tqdm + + +# In[2]: + + +argparse = argparse.ArgumentParser( + description="Copy files from one directory to another" +) +argparse.add_argument("--HPC", action="store_true", help="Type of compute to run on") +# Parse arguments +args = argparse.parse_args(args=sys.argv[1:] if "ipykernel" not in sys.argv[0] else []) +HPC = args.HPC + + +# In[3]: + + +# check if bandicoot is set +bandicoot_path = pathlib.Path(os.path.expanduser("~/mnt/bandicoot")).resolve() +if not HPC and bandicoot_path.exists(): + bandicoot = True +else: + bandicoot = False + +bandicoot = False + + +# In[4]: + + +if HPC: + raw_image_dir = pathlib.Path("/pl/active/koala/GFF_Data/GFF-Raw/").resolve( + strict=True + ) + output_base_dir = root_dir +elif bandicoot: + # comment out depending on whose computer you are on + # mike's computer + bandicoot_path = pathlib.Path( + os.path.expanduser("~/mnt/bandicoot/NF1_organoid_data") + ).resolve(strict=True) + raw_image_dir = pathlib.Path(f"{bandicoot_path}/Raw_patient_files").resolve( + strict=True + ) + output_base_dir = bandicoot_path +else: + # comment out depending on whose computer you are on + # mike's computer + raw_image_dir = pathlib.Path( + os.path.expanduser("~/Desktop/20TB_A/NF1_Patient_organoids") + ).resolve(strict=True) + # Jenna's computer + # raw_image_dir_local = pathlib.Path("/media/18tbdrive/GFF_organoid_data/") + output_base_dir = root_dir +print(f"Raw image dir: {raw_image_dir}") + + +# ## Define paths + +# In[ ]: + + +# Define parent and destination directories in a single dictionary +""" +This dictionary maps patient IDs to their corresponding parent directories, +destination directories, and metadata for processing. +Nested dictionary name: patientID_tumorID +Keys: +- parent: Path to the parent directory containing raw images. +- destination: Path to the output directory for processed images. +- times_nested: Number of nested timepoint directories. +- well_position: Index of the well position in the filename when split by underscores. +- channel_position: Index of the channel position in the filename when split by underscores. +""" + +dir_mapping = { + "NF0014_T1": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0014-Thawed 3 (Raw image files)-Combined/NF0014-Thawed 3 (Raw image files)-Combined copy" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0014_T1/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "NF0014_T2": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0014-T2 Cell Painting/NF0014-T2 Combined/" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0014_T2/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "NF0016_T1": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0016 Cell Painting-Pilot Drug Screening-selected/NF0016-Cell Painting Images/NF0016-images copy" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0016_T1/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "NF0017": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0017-T3-P7 (AGP, Mito Parameter optimization)/Acquisition 03-07-2025" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0017/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "NF0018_T6": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0018 (T6) Cell Painting-Pilot Drug Screeining/NF0018-Cell Painting Images/NF0018-All Acquisitions" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0018_T6/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "NF0021_T1": { + "parent": pathlib.Path(f"{raw_image_dir}/NF0021-T1/NF0021-T1 Combined").resolve( + strict=True + ), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0021_T1/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "NF0030_T1": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0030 Cell Painting/NF0030 Cell Painting/NF0030-Cell Painting Images/Combined" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0030_T1/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "NF0031_T1_part_I": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0031-T1 Combined 1_2/NF0031-T1 Combined 1:2" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0031_T1/zstack_images" + ).resolve(), + "times_nested": 0, + "well_position": -1, + "channel_position": -3, + }, + "NF0031_T1_part_II": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0031-T1 Combined 2_2/NF0031-T1 Combined 2:2" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0031_T1/zstack_images" + ).resolve(), + "times_nested": 0, + "well_position": -1, + "channel_position": -3, + }, + "NF0035_T1_part_I": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0035-T1-Part-1/NF0035-T1-Combined Part-1" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0035_T1/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "NF0035_T1_part_II": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0035-T1-Part-2/NF0035-T1-Combined Part-2" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0035_T1/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "NF0037-T1-Z-1": { + "parent": pathlib.Path(f"{raw_image_dir}/NF0037-T1-Z-1/NF0037-T1-Z-1").resolve( + strict=True + ), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0037_T1-Z-1/zstack_images" + ).resolve(), + "times_nested": 0, + "well_position": -1, + "channel_position": -3, + }, + "NF0037-T1-Z-0.5": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0037-T1-Z-0.5/NF0037-T1-ZTEST2-0.5" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0037_T1-Z-0.5/zstack_images" + ).resolve(), + "times_nested": 0, + "well_position": -1, + "channel_position": -3, + }, + "NF0037-T1-Z-0.2": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0037-T1-Z-0.2/NF0037-T1-Z-0.2/NF0037-T1-ZTEST2-0.2" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0037_T1-Z-0.2/zstack_images" + ).resolve(), + "times_nested": 0, + "well_position": -1, + "channel_position": -3, + }, + "NF0037-T1-Z-0.1": { + "parent": pathlib.Path( + f"{raw_image_dir}/NF0037-T1-Z-0.1/NF0037-T1-Z-0.1/NF0037-T1-ZTEST2" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0037_T1-Z-0.1/zstack_images" + ).resolve(), + "times_nested": 0, + "well_position": -1, + "channel_position": -3, + }, + "NF0040_T1": { + "parent": pathlib.Path( + f"{raw_image_dir}/SARC0376 (NF0040) Cell Painting/SARC0376 (NF0040) Cell Painting/SARC0376 (NF0040)-Cell Painting Images/Combined" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/NF0040_T1/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "SACRO219_T1": { + "parent": pathlib.Path( + f"{raw_image_dir}/SARC0219-T2 Cell Painting-selected/SARC0219-T2 Combined Cell Painting images/SARC0219-T2 Combined/" + ).resolve(strict=True), + "destination": pathlib.Path( + f"{output_base_dir}/data/SARCO219_T1/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, + "SARCO361_T1": { + "parent": pathlib.Path(f"{raw_image_dir}/SARC0361/SARC0361 Combined/").resolve( + strict=True + ), + "destination": pathlib.Path( + f"{output_base_dir}/data/SARCO361_T1/zstack_images" + ).resolve(), + "times_nested": 2, + "well_position": 0, + "channel_position": 1, + }, +} + + +# ## Copy and zstack images + +# In[6]: + + +# image channel names and extensions +image_extensions = {".tif", ".tiff"} +channel_names = ["405", "488", "555", "640", "TRANS"] +# make a dictionary that contains a list for each channel name, storing both filepath and filename +channel_images = { + channel_name: {"filename": [], "filepath": []} for channel_name in channel_names +} + + +# In[7]: + + +# loop through each patient +for patient in tqdm.tqdm(dir_mapping.keys(), desc="Processing patients", leave=True): + # get a list of all well directories and loop through them + all_well_dirs = get_well_fov_dirs(parent_dir=dir_mapping[patient]["parent"]) + + for well_dir in tqdm.tqdm( + all_well_dirs, desc=f"Processing wells for patient {patient}", leave=False + ): + # ensure that the dir follows the alpha numeric - numeric format + well_dir_name = well_dir.name + if not check_well_dir_name_format(well_dir_name): + print(f"Skipping directory with unexpected name format: {well_dir_name}") + continue + # step through the nested directories to get to the most branched child directory + most_branched_child_dir = get_to_the_unested_dir( + nested_dir=pathlib.Path(well_dir), + times_nested=dir_mapping[patient]["times_nested"], + ) + if most_branched_child_dir is None: + print(f"Could not un-nest directory: {well_dir}") + continue + + # get a list of files, sort thenm, and loop through them to categorize by channel + files = [f for f in most_branched_child_dir.glob("*") if f.is_file()] + files.sort() + channel_images = { + channel_name: {"filename": [], "filepath": []} + for channel_name in channel_names + } + for file in files: + if file.suffix in image_extensions: + if "tile" in file.stem.lower(): + channel = file.stem.split("_")[ + dir_mapping[patient]["channel_position"] - 1 + ] + else: + channel = file.stem.split("_")[ + dir_mapping[patient]["channel_position"] + ] + if channel in channel_images: + channel_images[channel]["filename"].append(file.name) + channel_images[channel]["filepath"].append(str(file)) + + # loop through each channel and create z-stack tiffs + for channel_name in tqdm.tqdm( + channel_names, + desc=f"Processing channels for well {well_dir_name} of patient {patient}", + leave=False, + ): + channel_images[channel_name]["filename"] = sorted( + channel_images[channel_name]["filename"] + ) + if not channel_images[channel_name]["filename"]: + print(f"No images found for channel {channel_name}") + continue + + # generate filename below + filepath = channel_images[channel_name]["filepath"][0] + well = pathlib.Path(filepath).stem.split("_")[ + dir_mapping[patient]["well_position"] + ] + + output_path = ( + dir_mapping[patient]["destination"] + / f"{well}" + / f"{well}_{channel_name}.tif" + ) + # generate output directory if it doesn't exist + # and check if output file already exists + output_path.parent.mkdir(exist_ok=True, parents=True) + if output_path.exists(): + continue + + # create z-stack tiff by reading in each 2D image and stacking them + images_to_stack = np.array( + [ + read_2D_image_for_zstacking(file) + for file in channel_images[channel_name]["filepath"] + ] + ) + # write the stacked images to a tiff file + tifffile.imwrite(output_path, images_to_stack) diff --git a/0.preprocessing_data/scripts/1.update_file_structure.py b/0.preprocessing_data/scripts/1.update_file_structure.py deleted file mode 100644 index 24666ed..0000000 --- a/0.preprocessing_data/scripts/1.update_file_structure.py +++ /dev/null @@ -1,434 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # Copy raw images into one folder to use for CellProfiler processing -# -# Currently, the images are located nest deep within multiple folders. -# For best practices, we will copy the images (preserving metadata) to one folder that can be used for CellProfiler processing. -# This file is modified from its original version: https://github.com/WayScience/GFF_2D_organoid_prototyping . - -# ## Import libraries - -# In[ ]: - - -import argparse -import multiprocessing -import os -import pathlib -import shutil -import sys -from concurrent.futures import ProcessPoolExecutor, as_completed - -import tqdm - -# Get the current working directory -cwd = pathlib.Path.cwd() - -if (cwd / ".git").is_dir(): - root_dir = cwd -else: - root_dir = None - for parent in cwd.parents: - if (parent / ".git").is_dir(): - root_dir = parent - break - -sys.path.append(str(root_dir / "utils")) -from notebook_init_utils import avoid_path_crash_bandicoot, init_notebook - -root_dir, in_notebook = init_notebook() - -if in_notebook: - import tqdm.notebook as tqdm -else: - import tqdm - - -# ## Set paths and variables - -# In[ ]: - - -argparse = argparse.ArgumentParser( - description="Copy files from one directory to another" -) -argparse.add_argument("--HPC", action="store_true", help="Type of compute to run on") -# Parse arguments -args = argparse.parse_args(args=sys.argv[1:] if "ipykernel" not in sys.argv[0] else []) -HPC = args.HPC - -print(f"HPC: {HPC}") - - -# In[ ]: - - -# check if bandicoot is set -# check if bandicoot is set -bandicoot_path = pathlib.Path(os.path.expanduser("~/mnt/bandicoot")).resolve() -if not HPC and bandicoot_path.exists(): - bandicoot = True -else: - bandicoot = False - - -# In[ ]: - - -if HPC: - raw_image_dir = pathlib.Path("/pl/active/koala/GFF_Data/GFF-Raw/").resolve( - strict=True - ) - output_base_dir = root_dir -elif bandicoot: - # comment out depending on whose computer you are on - # mike's computer - bandicoot_path = pathlib.Path( - os.path.expanduser("~/mnt/bandicoot/NF1_organoid_data") - ).resolve(strict=True) - # Jenna's computer - # bandicoot_path = pathlib.Path("/media/18tbdrive/GFF_organoid_data/") - raw_image_dir = pathlib.Path(f"{bandicoot_path}/Raw_patient_files").resolve( - strict=True - ) - output_base_dir = bandicoot_path -else: - # comment out depending on whose computer you are on - # mike's computer - raw_image_dir = pathlib.Path("~/Desktop/20TB_A/NF1_Patient_organoids").resolve( - strict=True - ) - # Jenna's computer - # raw_image_dir_local = pathlib.Path("/media/18tbdrive/GFF_organoid_data/") - output_base_dir = root_dir - - -# In[ ]: - - -output_base_dir = root_dir - - -# In[ ]: - - -# Define parent and destination directories in a single dictionary -dir_mapping = { - "NF0014_T1": { - "parent": pathlib.Path( - f"{raw_image_dir}/NF0014-Thawed 3 (Raw image files)-Combined/NF0014-Thawed 3 (Raw image files)-Combined copy" - ).resolve(strict=True), - "destination": pathlib.Path( - f"{output_base_dir}/data/NF0014_T1/raw_images" - ).resolve(), - }, - "NF0014_T2": { - "parent": pathlib.Path( - f"{raw_image_dir}/NF0014-T2 Cell Painting/NF0014-T2 Combined/" - ).resolve(strict=True), - "destination": pathlib.Path( - f"{output_base_dir}/data/NF0014_T2/raw_images" - ).resolve(), - }, - "NF0016_T1": { - "parent": pathlib.Path( - f"{raw_image_dir}/NF0016 Cell Painting-Pilot Drug Screening-selected/NF0016-Cell Painting Images/NF0016-images copy" - ).resolve(strict=True), - "destination": pathlib.Path( - f"{output_base_dir}/data/NF0016_T1/raw_images" - ).resolve(), - }, - "NF0017": { - "parent": pathlib.Path( - f"{raw_image_dir}/NF0017-T3-P7 (AGP, Mito Parameter optimization)/Acquisition 03-07-2025" - ).resolve(strict=True), - "destination": pathlib.Path( - f"{output_base_dir}/data/NF0017/raw_images" - ).resolve(), - }, - "NF0018_T6": { - "parent": pathlib.Path( - f"{raw_image_dir}/NF0018 (T6) Cell Painting-Pilot Drug Screeining/NF0018-Cell Painting Images/NF0018-All Acquisitions" - ).resolve(strict=True), - "destination": pathlib.Path( - f"{output_base_dir}/data/NF0018_T6/raw_images" - ).resolve(), - }, - "NF0021_T1": { - "parent": pathlib.Path(f"{raw_image_dir}/NF0021-T1/NF0021-T1 Combined").resolve( - strict=True - ), - "destination": pathlib.Path( - f"{output_base_dir}/data/NF0021_T1/raw_images" - ).resolve(), - }, - "NF0030_T1": { - "parent": pathlib.Path( - f"{raw_image_dir}/NF0030 Cell Painting/NF0030 Cell Painting/NF0030-Cell Painting Images/Combined" - ).resolve(strict=True), - "destination": pathlib.Path( - f"{output_base_dir}/data/NF0030_T1/raw_images" - ).resolve(), - }, - "NF0040_T1": { - "parent": pathlib.Path( - f"{raw_image_dir}/SARC0376 (NF0040) Cell Painting/SARC0376 (NF0040) Cell Painting/SARC0376 (NF0040)-Cell Painting Images/Combined" - ).resolve(strict=True), - "destination": pathlib.Path( - f"{output_base_dir}/data/NF0040_T1/raw_images" - ).resolve(), - }, - "SACRO219_T1": { - "parent": pathlib.Path( - f"{raw_image_dir}/SARC0219-T2 Cell Painting-selected/SARC0219-T2 Combined Cell Painting images/SARC0219-T2 Combined/" - ).resolve(strict=True), - "destination": pathlib.Path( - f"{output_base_dir}/data/SARCO219_T1/raw_images" - ).resolve(), - }, - "SARCO361_T1": { - "parent": pathlib.Path(f"{raw_image_dir}/SARC0361/SARC0361 Combined/").resolve( - strict=True - ), - "destination": pathlib.Path( - f"{output_base_dir}/data/SARCO361_T1/raw_images" - ).resolve(), - }, -} - -# Image extensions that we are looking to copy -image_extensions = {".tif", ".tiff"} - - -# ## Reach the nested images and copy to one folder - -# ### Set QC functions that determine if a well/site is of good quality to process based on file structure - -# In[ ]: - - -def has_consistent_naming(well_dir: pathlib.Path) -> bool: - """Check that all nested folders within a well directory have the same names as the well directory itself. - - Args: - well_dir (pathlib.Path): Path to a single well directory. - - Returns: - bool: True if all nested folders inside this well directory have the same name as the well directory, False otherwise. - """ - # Get the name of the well directory (this will be the expected folder name) - well_name = well_dir.name - - # Get the immediate subdirectories in the well directory (e.g., Field_1, Field_2) - sub_dirs = [d for d in well_dir.iterdir() if d.is_dir()] - - if not sub_dirs: - return False # No nested folders found, treat as inconsistent - - # Check if each subdirectory contains a nested folder with the same name as the well directory - for sub in sub_dirs: - nested_folders = [d.name for d in sub.iterdir() if d.is_dir()] - if well_name not in nested_folders: - return False # Inconsistent folder structure found - - return True # All subdirectories have a nested folder with the same name as the well directory - - -def is_image_folder_empty(nested_dir: pathlib.Path) -> bool: - """Check if a nested directory contains any images. - - Args: - nested_dir (pathlib.Path): Path to a directory nested within the well directory - - Returns: - bool: Boolean indicating whether the nested directory contains any images - """ - return not any( - image.suffix.lower() in image_extensions for image in nested_dir.rglob("*") - ) - - -def has_equal_images_per_channel( - nested_dir: pathlib.Path, channel_names: list[str] -) -> bool: - """Check if all specified channels have the same number of images by looking for the channel name in the filenames. - - Args: - nested_dir (pathlib.Path): Path to a directory nested within the well directory. - channel_names (list[str]): List of strings of the channel names found in the nested directory. - - Returns: - bool: Boolean indicating whether all specified channels have the same number of images. - """ - # Initialize counts for each channel - channel_counts = {channel: 0 for channel in channel_names} - - # Count images for each channel based on the channel name in the filename - for image in nested_dir.rglob("*"): # Search for all files recursively - if image.suffix.lower() in image_extensions: # Ensure it's an image file - for channel in channel_names: - if ( - channel in image.name - ): # If the channel name is found in the image filename - channel_counts[channel] += 1 - - # Get the unique set of image counts (if all counts are equal, there should be only one unique value) - image_counts = set(channel_counts.values()) - - # If all counts are equal and non-zero, return True; otherwise, return False - return len(image_counts) == 1 and 0 not in image_counts - - -# Run this cell through the script - -# In[ ]: - - -# Function to process a single nested directory - - -def process_nested_dir(nested_dir, dest_well_dir, channel_names, image_extensions): - if not nested_dir.is_dir(): - return f"Skipping {nested_dir}: Not a directory" - - if is_image_folder_empty(nested_dir): - return f"Skipping {nested_dir}: No images found" - - if not has_equal_images_per_channel(nested_dir, channel_names): - return f"Skipping {nested_dir}: Unequal images per channel" - - # Copy images to destination, skipping files with 'Tile' in their name - for image in nested_dir.rglob("*"): - if image.suffix.lower() in image_extensions and "Tile" not in image.name: - shutil.copy2(image, dest_well_dir) - - return f"Processed {nested_dir}" - - -# Function to process a single well directory -def process_well_dir(well_dir, dest_dir, channel_names, image_extensions): - if not has_consistent_naming(well_dir): - return f"Skipping {well_dir.stem}: Inconsistent nested folder names within well" - - dest_well_dir = dest_dir / well_dir.name - dest_well_dir.mkdir(parents=True, exist_ok=True) - - nested_dirs = list(well_dir.iterdir()) - for nested_dir in nested_dirs: - process_nested_dir( - nested_dir, - dest_well_dir, - channel_names, - image_extensions, - ) - - -# Set channel names -channel_names = {"405", "488", "555", "640", "TRANS", "Merge"} - -# Loop through each key in the mapping to copy data from the parent to the destination -for key, paths in dir_mapping.items(): - parent_dir = paths["parent"] - dest_dir = paths["destination"] - - print(f"Processing {key}: {parent_dir} -> {dest_dir}") - - # Ensure the destination directory exists - dest_dir.mkdir(parents=True, exist_ok=True) - - # Get all well-level directories - well_dirs = [d for d in parent_dir.iterdir() if d.is_dir()] - - if not well_dirs: - print(f"Skipping {key}: No well directories found") - continue - # Process well directories in parallel - with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count() - 2) as executor: - futures = [ - executor.submit( - process_well_dir, well_dir, dest_dir, channel_names, image_extensions - ) - for well_dir in well_dirs - ] - for future in tqdm.tqdm( - as_completed(futures), - desc=f"Processing {key}", - leave=False, - total=len(well_dirs), - ): - pass - - print(f"Completed processing {key}: {parent_dir} -> {dest_dir}") - - -# ## NF0016 specific preprocessing - -# In[ ]: - - -parent_dir_NF0016 = pathlib.Path( - f"{output_base_dir}/data/NF0016_T1/raw_images" -).resolve(strict=True) -# get all dirs in the parent dir -parent_dir_NF0016 = list(parent_dir_NF0016.glob("*/")) -parent_dir_NF0016 = [x for x in parent_dir_NF0016 if x.is_dir()] -# get all child files in the parent dir -file_dir_NF0016 = [] -for parent_dir in parent_dir_NF0016: - file_dir_NF0016.extend(list(parent_dir.glob("*"))) - - -# In[ ]: - - -# rename the files in the parent dir -for file in file_dir_NF0016: - new_file_dir = pathlib.Path( - f"{file.parent}/{str(file.stem).replace(' (60X)', '')}.{file.suffix}" - ) - file.rename(new_file_dir) - -# rename the parent dir -for parent_dir in parent_dir_NF0016: - new_parent_dir = pathlib.Path( - f"{parent_dir.parent}/{str(parent_dir.stem).replace(' (60X)', '')}" - ) - # rename the parent dir - os.rename(parent_dir, new_parent_dir) - - -# ## NF0018 specific preprocessing - -# In[ ]: - - -parent_dir_NF0018 = pathlib.Path( - f"{output_base_dir}/data/NF0018_T6/raw_images" -).resolve(strict=True) -# get all dirs in the parent dir -parent_dir_NF0018 = list(parent_dir_NF0018.glob("*/")) -parent_dir_NF0018 = [x for x in parent_dir_NF0018 if x.is_dir()] -# get all child files in the parent dir -file_dir_NF0018 = [] -for parent_dir in parent_dir_NF0018: - file_dir_NF0018.extend(list(parent_dir.glob("*"))) - - -# In[ ]: - - -# rename the files in the parent dir -for file in file_dir_NF0018: - new_file_dir = pathlib.Path( - f"{file.parent}/{str(file.stem).replace(' (60X)', '')}{file.suffix}" - ) - file.rename(new_file_dir) - -# rename the parent dir -for parent_dir in parent_dir_NF0018: - new_parent_dir = pathlib.Path( - f"{parent_dir.parent}/{str(parent_dir.stem).replace(' (60X)', '')}" - ) - # rename the parent dir - os.rename(parent_dir, new_parent_dir) diff --git a/0.preprocessing_data/scripts/2b.perform_file_corruption_checks.py b/0.preprocessing_data/scripts/2.perform_file_corruption_checks.py similarity index 83% rename from 0.preprocessing_data/scripts/2b.perform_file_corruption_checks.py rename to 0.preprocessing_data/scripts/2.perform_file_corruption_checks.py index e950986..4c70b54 100644 --- a/0.preprocessing_data/scripts/2b.perform_file_corruption_checks.py +++ b/0.preprocessing_data/scripts/2.perform_file_corruption_checks.py @@ -5,33 +5,20 @@ # ## Import libraries -# In[ ]: +# In[1]: import os import pathlib import pprint -import sys import numpy as np import pandas as pd import tifffile as tiff - -# Get the current working directory -cwd = pathlib.Path.cwd() - -if (cwd / ".git").is_dir(): - root_dir = cwd -else: - root_dir = None - for parent in cwd.parents: - if (parent / ".git").is_dir(): - root_dir = parent - break -sys.path.append(str(root_dir / "utils")) from notebook_init_utils import avoid_path_crash_bandicoot, init_notebook root_dir, in_notebook = init_notebook() + if in_notebook: import tqdm.notebook as tqdm else: @@ -106,33 +93,53 @@ def max_z_projection(patient: str, well_fov: str) -> None: # ## Set input and output directories -# In[ ]: +# In[3]: +# check if bandicoot is set bandicoot_path = pathlib.Path(os.path.expanduser("~/mnt/bandicoot")).resolve() -raw_image_dir, output_base_dir = avoid_path_crash_bandicoot(bandicoot_path) +bandicoot = True -# In[4]: +# In[4]: -# patient_ids -# patient_id_file_path = pathlib.Path(f"{raw_image_dir}/data/patient_IDs.txt").resolve( -# strict=True -# ) -# list_of_patients = pd.read_csv(patient_id_file_path, header=None)[0].tolist() -list_of_patients = ["NF0035_T1"] +if bandicoot: + # comment out depending on whose computer you are on + # mike's computer + bandicoot_path = pathlib.Path( + os.path.expanduser("~/mnt/bandicoot/NF1_organoid_data") + ).resolve(strict=True) + output_base_dir = bandicoot_path +else: + # comment out depending on whose computer you are on + # mike's computer + raw_image_dir = pathlib.Path( + os.path.expanduser("~/Desktop/20TB_A/NF1_Patient_organoids") + ).resolve(strict=True) + # Jenna's computer + # raw_image_dir_local = pathlib.Path("/media/18tbdrive/GFF_organoid_data/") + output_base_dir = raw_image_dir # In[5]: +patient_id_file_path = pathlib.Path(f"{root_dir}/data/patient_IDs.txt").resolve( + strict=True +) +list_of_patients = pd.read_csv(patient_id_file_path, header=None)[0].tolist() + + +# In[6]: + + patient_input_dict = {} for patient in list_of_patients: patient_input_dict[patient] = { "raw_images": pathlib.Path( - f"{raw_image_dir}/data/{patient}/raw_images" + f"{output_base_dir}/data/{patient}/raw_images" ).resolve(), "zstack_output": pathlib.Path( f"{output_base_dir}/data/{patient}/zstack_images" @@ -148,7 +155,7 @@ def max_z_projection(patient: str, well_fov: str) -> None: # This is done by checking if the size of the channel images for a given well-fov is the same as the size of the channel images for the other well-fovs. # If the size is different, then the file is corrupted. -# In[6]: +# In[7]: patient_well_fovs_to_fix = [] @@ -179,14 +186,3 @@ def max_z_projection(patient: str, well_fov: str) -> None: f"""Need to check and fix a total of {len(patient_well_fovs_to_fix)} patient well_fovs:""" ) pprint.pprint(patient_well_fovs_to_fix) - - -# ## With the list of corrupted files, recreate the z-stack images -# This is the point where the z-stack images are created from the individual z-slice images for each FOV per well. - -# In[7]: - - -for patient_well_fov in patient_well_fovs_to_fix: - patient, well_fov = patient_well_fov.split() - max_z_projection(patient, well_fov) diff --git a/0.preprocessing_data/scripts/2a.make_z-stack_images.py b/0.preprocessing_data/scripts/2a.make_z-stack_images.py deleted file mode 100644 index 47c317b..0000000 --- a/0.preprocessing_data/scripts/2a.make_z-stack_images.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # Create z-stack images from the individual z-slice images for each FOV per well - -# ## Import libraries - -# In[ ]: - - -import os -import pathlib -import pprint -import sys - -import numpy as np -import pandas as pd -import tifffile as tiff - -# Get the current working directory -cwd = pathlib.Path.cwd() - -if (cwd / ".git").is_dir(): - root_dir = cwd -else: - root_dir = None - for parent in cwd.parents: - if (parent / ".git").is_dir(): - root_dir = parent - break -sys.path.append(str(root_dir / "utils")) -from notebook_init_utils import avoid_path_crash_bandicoot, init_notebook - -root_dir, in_notebook = init_notebook() - -if in_notebook: - import tqdm.notebook as tqdm -else: - import tqdm - - -# ## Set input and output directories - -# In[ ]: - - -bandicoot_path = pathlib.Path(os.path.expanduser("~/mnt/bandicoot")).resolve() -if bandicoot_path.exists(): - bandicoot = True -else: - bandicoot = False - - -# In[ ]: - - -raw_image_dir, output_base_dir = avoid_path_crash_bandicoot(bandicoot_path) - - -# In[ ]: - - -# # patient_ids -# patient_id_file_path = pathlib.Path(f"{root_dir}/data/patient_IDs.txt").resolve( -# strict=True -# ) -# list_of_patients = pd.read_csv(patient_id_file_path, header=None)[0].tolist() - -list_of_patients = ["NF0035_T1"] - - -# In[ ]: - - -patient_input_dict = {} -for patient in list_of_patients: - patient_input_dict[patient] = { - "raw_images": pathlib.Path( - f"{raw_image_dir}/data/{patient}/raw_images" - ).resolve(), - "zstack_output": pathlib.Path( - f"{raw_image_dir}/data/{patient}/zstack_images" - ).resolve(), - } -pprint.pprint(patient_input_dict) - - -# In[ ]: - - -# Image extensions that we are looking to copy -image_extensions = {".tif", ".tiff"} - - -# In[ ]: - - -unlisted_images = {"patient": ["NF0014_T1"], "image_set": ["F11-3"]} - - -# In[ ]: - - -image_extensions = {".tif", ".tiff"} -channel_names = ["405", "488", "555", "640", "TRANS"] -# make a dictionary that contains a list for each channel name, storing both filepath and filename -channel_images = { - channel_name: {"filename": [], "filepath": []} for channel_name in channel_names -} -channel_images - - -# ## Create list of the well-site folders - -# In[ ]: - - -# loop through patients, well_fovs, and each channel -# outer loop through patients -for patient in tqdm.tqdm(patient_input_dict.keys(), desc="Processing patients"): - input_dirs = [ - x for x in patient_input_dict[patient]["raw_images"].iterdir() if x.is_dir() - ] - input_dirs.sort() - - for well_dir in tqdm.tqdm( - input_dirs, desc=f"Processing wells for patient {patient}", leave=False - ): - channel_images = { - channel_name: {"filename": [], "filepath": []} - for channel_name in channel_names - } - if ( - patient in unlisted_images["patient"] - and well_dir.name in unlisted_images["image_set"] - ): - print(f"Skipping {well_dir.name} because it is not listed.") - continue - - for filename in well_dir.glob("*"): - if filename.suffix in image_extensions: - for channel_name in channel_names: - if channel_name in filename.name: - channel_images[channel_name]["filepath"].append(filename) - - for channel_name in tqdm.tqdm( - channel_names, desc=f"Processing channels in {well_dir.name}", leave=False - ): - channel_images[channel_name]["filepath"] = sorted( - channel_images[channel_name]["filepath"] - ) - if not channel_images[channel_name]["filepath"]: - print( - f"No files found for channel {channel_name} in {well_dir}. Skipping..." - ) - continue - - images_to_stack = np.array( - [ - tiff.imread(filepath) - for filepath in channel_images[channel_name]["filepath"] - ] - ) - filepath = channel_images[channel_name]["filepath"][0] - well = str(filepath.parent).split("/")[-1] - output_path = ( - patient_input_dict[patient]["zstack_output"] - / f"{well}" - / f"{well}_{channel_name}.tif" - ) - output_path.parent.mkdir(exist_ok=True, parents=True) - tiff.imwrite(output_path, images_to_stack) diff --git a/environments/preprocessing_env.yml b/environments/preprocessing_env.yml index acebc78..7647f60 100644 --- a/environments/preprocessing_env.yml +++ b/environments/preprocessing_env.yml @@ -2,16 +2,16 @@ name: gff_preprocessing_env channels: - conda-forge dependencies: - - conda-forge::python + - conda-forge::python>=3.11 - conda-forge::jupyter=1.0.0 - conda-forge::pip - conda-forge::tifffile - conda-forge::jupyterlab - - conda-forge::pandas=1.4.4 + - conda-forge::pandas - conda-forge::ipykernel - conda-forge::nb_conda_kernels - - conda-forge::scipy=1.10.0 - - conda-forge::numpy=1.22 + - conda-forge::scipy + - conda-forge::numpy - conda-forge::scikit-learn - conda-forge::seaborn - conda-forge::umap-learn @@ -20,3 +20,4 @@ dependencies: - pip: - Cytotable - coSMicQC>=0.1.2 + - -e ../utils diff --git a/utils/preprocessing_funcs.py b/utils/preprocessing_funcs.py new file mode 100644 index 0000000..29649ba --- /dev/null +++ b/utils/preprocessing_funcs.py @@ -0,0 +1,108 @@ +import pathlib +import re + +import numpy as np +import tifffile + + +def read_2D_image_for_zstacking( + file_path: str, +) -> np.ndarray: + """ + Description + ----------- + Reads in a 2D image from a given file path and returns it as a numpy array. + + Parameters + + ---------- + file_path : str + The path to the 2D image file. + Returns + ------- + np.ndarray + The 2D image as a numpy array. + + Raises + ------- + ValueError + If the image has more than 2 dimensions. + """ + + img = tifffile.imread(file_path) + + if len(img.shape) > 2 and img.shape[2] == 3: + # If the image has 3 channels (e.g., RGB), convert to grayscale + img = img[:, :, 0] + + return img + + +def get_well_fov_dirs(parent_dir: pathlib.Path) -> list[pathlib.Path]: + """ + Retrieve all well fov dirs in a given parent dir + + Parameters + ---------- + parent_dir : pathlib.Path + Patient parent dir + + Returns + ------- + List[pathlib.Path] + List of well fov dirs in _parent_dir + """ + well_dirs = [d for d in parent_dir.iterdir() if d.is_dir()] + + return well_dirs + + +def get_to_the_unested_dir( + nested_dir: pathlib.Path, times_nested: int +) -> pathlib.Path | None: + """ + Unest the dir given the number of time the directories are nested. + + Parameters + ---------- + nested_dir : pathlib.Path + The parent directory containing the nested dirs + times_nested : int + The number of times that a dir is nested + + Returns + ------- + pathlib.Path | None + The output file path of the least nested parent dir or None + """ + for _ in range(times_nested): + nested_dir = nested_dir.glob("*").__next__() + nested_dir = [d for d in nested_dir.parent.iterdir() if d.is_dir()] + if len(nested_dir) != 1: + return None + nested_dir = nested_dir[0] + return nested_dir + + +def check_well_dir_name_format(dir_name: str) -> bool: + """ + Check if a well directory name matches the expected format. + Accepts formats like: + - A1-1 + - A01-01 + - A1-1 (60X) + - A12-34 (any trailing parenthetical/metadata) + + Parameters + ---------- + dir_name : str + The name of the directory to check. + Returns + ------- + bool + True if the directory name matches the expected format, False otherwise. + + """ + dir_name = dir_name.strip() + pattern = re.compile(r"^[A-Z]\d{1,2}-\d{1,2}(?:\s*\(.*\))?$") + return bool(pattern.match(dir_name)) diff --git a/utils/pyproject.toml b/utils/pyproject.toml new file mode 100644 index 0000000..29d96eb --- /dev/null +++ b/utils/pyproject.toml @@ -0,0 +1,49 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "preprocessing_module" +version = "0.0.0" +description = "Utility package for GFF 3D organoid profiling pipeline" +requires-python = ">=3.11" +authors = [ { name = "Your Name", email = "you@example.com" } ] +license = { text = "MIT" } + +dependencies = [ +] + +[project.optional-dependencies] +dev = [ + "pytest", + "black", + "ruff", + "pre-commit", + "ipykernel", + +] + +[project.scripts] +arg_parsing = "arg_parsing_utils:main" +file_checking = "file_checking:main" +file_reading = "file_reading:main" +notebook_init_utils = "notebook_init_utils:main" +segmentation_decoupling = "segmentation_decoupling:main" +exceptions = "errors.exceptions:main" +preprocessing_funcs = "preprocessing_funcs:main" + +[tool.setuptools] +py-modules = [ +"arg_parsing_utils", +"file_checking", +"file_reading", +"notebook_init_utils", +"segmentation_decoupling", +"errors.exceptions", +"preprocessing_funcs" +] + + +[tool.setuptools.packages.find] +where = ["."] +include = ["errors*"]