diff --git a/docs/PREPROCESSING.md b/docs/PREPROCESSING.md
index 5b50e0f..96c3254 100644
--- a/docs/PREPROCESSING.md
+++ b/docs/PREPROCESSING.md
@@ -52,7 +52,7 @@ src
 
 # Polymorphic structures: Generate SDFs
 
-Use the segmentation data for polymorphic structures as input to the SDF generation step. 
+Use the segmentation data for polymorphic structures as input to the SDF generation step.
 
 ```
 src
@@ -65,8 +65,8 @@ src
          └── pc_sdfs.py <- Sample point clouds from scaled meshes
 ```
 
-The scale factors can be computed using the `get_max_bounding_box` script. Alternatively, the pre-computed scale factors can be downloaded along with the rest of the preprocessed data. The following scale factors are available for download 
+The scale factors can be computed using the `get_max_bounding_box` script. Alternatively, the pre-computed scale factors can be downloaded along with the rest of the preprocessed data. The following scale factors are available for download:
 1. [WTC-11 hIPSc single cell image dataset v1 nucleolus (NPM1)](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/npm1/scale_factor.npz)
 2. [WTC-11 hIPSc single cell image dataset v1 nucleolus (NPM1) 64 resolution](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/npm1_64_res/scale_factor.npz)
-3. [WTC-11 hIPSc single cell image dataset v1 polymorphic structures](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/other_polymorphic/scale_factor.npz)
\ No newline at end of file
+3. [WTC-11 hIPSc single cell image dataset v1 polymorphic structures](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/other_polymorphic/scale_factor.npz)
diff --git a/docs/USAGE.md b/docs/USAGE.md
index 20ec5a6..dd0b575 100644
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@@ -226,4 +226,3 @@ To run a baseline LDA on the DMSO subset of the drug perturbation dataset, run
 ```
 python src/br/analysis/run_drugdata_LDA.py --save_path "./outputs_npm1_perturb/" --embeddings_path "./morphology_appropriate_representation_learning/model_embeddings/npm1_perturb/" --dataset_name "npm1_perturb" --raw_path "./NPM1_single_cell_drug_perturbations/" --baseline True
 ```
-
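Note: the scale-factor archives linked in `docs/PREPROCESSING.md` above are plain NumPy `.npz` archives. A minimal loading sketch follows; the local filename mirrors the download, but the key layout inside the archive is an assumption, so the code lists `data.files` rather than hard-coding a key.

```python
# Sketch: inspect a downloaded scale_factor.npz archive.
# The key names inside the archive are assumptions -- list data.files
# instead of guessing a key.
import numpy as np

data = np.load("scale_factor.npz")
print(data.files)            # names of the arrays stored in the archive
scale = data[data.files[0]]  # take the first stored array as the scale factor(s)
print("scale factor(s):", scale)
```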
diff --git a/src/br/analysis/analysis_utils.py b/src/br/analysis/analysis_utils.py
index b28bd74..ab79be2 100644
--- a/src/br/analysis/analysis_utils.py
+++ b/src/br/analysis/analysis_utils.py
@@ -634,6 +634,56 @@ def archetypes_polymorphic(this_save_path, archetypes_df, all_ret, all_features)
     arch_dict.to_csv(this_save_path / "archetypes.csv")
 
 
+def make_pca_score_plot(this_save_path, all_ret, stratify_key, dataset_name):
+    cols = [i for i in all_ret.columns if "mu" in i]
+    feats = all_ret[cols].values
+
+    pca = PCA()
+    pca_feats = pca.fit_transform(feats)
+    pca_feats = pd.DataFrame(pca_feats, columns=[f"PC_{i+1}" for i in range(len(cols))])
+    pca_feats = pd.concat([pca_feats, all_ret], axis=1)
+
+    n_colors = len(all_ret[stratify_key].unique())
+    colors = sns.color_palette("Paired", n_colors)
+
+    hue_order = None
+    if dataset_name == "pcna":
+        hue_order = [
+            "G1",
+            "earlyS",
+            "earlyS-midS",
+            "midS",
+            "midS-lateS",
+            "lateS",
+            "lateS-G2",
+            "G2",
+        ]
+        colors = (
+            sns.color_palette("Greens", 2)
+            + sns.color_palette("Reds", 4)
+            + sns.color_palette("Purples", 2)
+        )
+    elif dataset_name == "other_polymorphic":
+        hue_order = ["1.0", "2.0", "3.0", "4.0", ">=5"]
+
+    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
+    g = sns.scatterplot(
+        ax=ax,
+        data=pca_feats,
+        x="PC_1",
+        y="PC_2",
+        hue=stratify_key,
+        hue_order=hue_order,
+        palette=colors,
+    )
+    g.legend(loc="center left", bbox_to_anchor=(1, 0.5), ncol=1)
+
+    fig.savefig(this_save_path / f"pcs_{dataset_name}.png", bbox_inches="tight", dpi=300)
+    fig.savefig(this_save_path / f"pcs_{dataset_name}.pdf", bbox_inches="tight", dpi=300)
+
+    pca_feats.to_csv(this_save_path / f"pcs_{dataset_name}.csv")
+
+
 def make_pacmap(this_save_path, all_ret, feats_archs):
     cols = [i for i in all_ret.columns if "mu" in i]
 
@@ -642,8 +692,36 @@ def make_pacmap(this_save_path, all_ret, feats_archs):
     X_transformed = embedding.fit_transform(feats, init="pca")
     archs_transform = embedding.transform(feats_archs, init="pca", basis=feats)
 
+    gene_to_struct = {
+        "NUP153": "Nuclear Pores",
+        "FBL": "Nucleoli (DFC)",
+        "LAMP1": "Lysosomes",
+        "ST6GAL1": "Golgi",
+        "SON": "Nuclear Speckles",
+        "HIST1H2BJ": "Histones",
+        "SMC1A": "Cohesins",
+        "CETN2": "Centrioles",
+        "SLC25A17": "Peroxisomes",
+        "RAB5A": "Endosomes",
+        "NPM1": "Nucleoli (GC)",
+    }
+    all_ret["structure_name"] = all_ret["structure_name"].apply(lambda x: gene_to_struct[x])
+
     labels = all_ret["structure_name"].values
-    colors = sns.color_palette("Paired", len(np.unique(labels)))
+    if "Golgi" in labels:
+        colors = sns.color_palette("Set2", len(np.unique(labels)))
+    else:
+        colors = sns.color_palette("Paired", len(np.unique(labels)))
+
+    tmp = pd.DataFrame(
+        X_transformed, columns=[f"PACMAP_{i}" for i in range(X_transformed.shape[1])]
+    )
+    tmp2 = pd.DataFrame(
+        archs_transform, columns=[f"Archetype_{i}" for i in range(archs_transform.shape[1])]
+    )
+    tmp = pd.concat([tmp, tmp2, all_ret[["structure_name"]]], axis=1)
+    tmp.to_csv(this_save_path / "pacmap_archetypes.csv")
 
     cdict = {i: colors[j] for j, i in enumerate(np.unique(labels))}
 
@@ -659,7 +737,10 @@ def make_pacmap(this_save_path, all_ret, feats_archs):
             alpha=0.6,
         )
     ax.legend()
-    lgnd = plt.legend(loc="upper right", numpoints=1, fontsize=10)
+    if "Golgi" in labels:
+        lgnd = plt.legend(loc="upper left", numpoints=1, fontsize=10)
+    else:
+        lgnd = plt.legend(loc="upper right", numpoints=1, fontsize=10)
 
     # change the marker size manually for both lines
     for handle in lgnd.legend_handles:
diff --git a/src/br/analysis/run_analysis.py b/src/br/analysis/run_analysis.py
index 91d98b9..483f724 100644
--- a/src/br/analysis/run_analysis.py
+++ b/src/br/analysis/run_analysis.py
@@ -12,6 +12,7 @@
     latent_walk_polymorphic,
     latent_walk_save_recons,
     make_pacmap,
+    make_pca_score_plot,
     pseudo_time_analysis,
     setup_gpu,
     str2bool,
@@ -48,6 +49,8 @@ def main(args):
         this_save_path = Path(args.save_path) / Path("latent_walks")
         this_save_path.mkdir(parents=True, exist_ok=True)
 
+        make_pca_score_plot(this_save_path, all_ret, stratify_key, args.dataset_name)
+
         if args.sdf:
             latent_walk_polymorphic(stratify_key, all_ret, this_save_path, latent_dim)
         else:
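Note: the new `make_pca_score_plot` helper wired into `run_analysis.py` above boils down to a PCA score plot over the latent `mu` columns. A self-contained sketch with synthetic data follows; the `mu_*` column naming matches the convention in the diff, while the `cell_stage` stratify key is a hypothetical stand-in.

```python
# Standalone sketch of the PCA score plot added in make_pca_score_plot:
# project latent "mu" columns onto principal components and color PC_1/PC_2
# by a stratification key. Synthetic data; "cell_stage" is hypothetical.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 8)), columns=[f"mu_{i}" for i in range(8)])
df["cell_stage"] = rng.choice(["G1", "G2"], size=len(df))

cols = [c for c in df.columns if "mu" in c]
pcs = pd.DataFrame(
    PCA().fit_transform(df[cols].values),
    columns=[f"PC_{i+1}" for i in range(len(cols))],
)
pcs["cell_stage"] = df["cell_stage"].values

ax = sns.scatterplot(data=pcs, x="PC_1", y="PC_2", hue="cell_stage")
ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.savefig("pcs_example.png", bbox_inches="tight", dpi=300)
```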
diff --git a/src/br/analysis/run_classification.py b/src/br/analysis/run_classification.py
index 4a21767..03aafcd 100644
--- a/src/br/analysis/run_classification.py
+++ b/src/br/analysis/run_classification.py
@@ -2,16 +2,18 @@
 import os
 import sys
 from pathlib import Path
-import pandas as pd
+
+import matplotlib.pyplot as plt
 import numpy as np
-from br.models.compute_features import get_embeddings
-from br.models.utils import get_all_configs_per_dataset
+import pandas as pd
+import seaborn as sns
 from skimage import measure as skmeasure
 from skimage import morphology as skmorpho
 from tqdm import tqdm
+
 from br.features.classification import get_classification_df
-import matplotlib.pyplot as plt
-import seaborn as sns
+from br.models.compute_features import get_embeddings
+from br.models.utils import get_all_configs_per_dataset
 
 
 def get_surface_area(input_img):
@@ -20,9 +22,9 @@ def get_surface_area(input_img):
     input_img[:, :, [0, -1]] = 0
     input_img[:, [0, -1], :] = 0
     input_img[[0, -1], :, :] = 0
-    input_img_surface = np.logical_xor(
-        input_img, skmorpho.binary_erosion(input_img)
-    ).astype(np.uint8)
+    input_img_surface = np.logical_xor(input_img, skmorpho.binary_erosion(input_img)).astype(
+        np.uint8
+    )
     # Loop through the boundary voxels to calculate the number of
     # boundary faces. Using 6-neighborhod.
     pxl_z, pxl_y, pxl_x = np.nonzero(input_img_surface)
@@ -113,20 +115,14 @@ def main(args):
         feats["CellId"] = row["CellId"]
         all_feats.append(pd.DataFrame(feats, index=[0]))
     all_feats = pd.concat(all_feats, axis=0)
-    all_feats = all_feats.merge(
-        orig_df[["CellId", "vol_bins", "vol_bins_inds"]], on="CellId"
-    )
+    all_feats = all_feats.merge(orig_df[["CellId", "vol_bins", "vol_bins_inds"]], on="CellId")
     all_feats["mean_volume"] = all_feats["shape_volume"] / all_feats["connectivity_cc"]
     all_feats["mean_surface_area"] = (
         all_feats["roundness_surface_area"] / all_feats["connectivity_cc"]
     )
-    all_feats = all_feats.merge(
-        orig_df[["CellId", "STR_connectivity_cc_thresh"]], on="CellId"
-    )
-    all_feats = all_feats.loc[all_feats["CellId"] != 724520].reset_index(
-        drop=True
-    )  # nan row
+    all_feats = all_feats.merge(orig_df[["CellId", "STR_connectivity_cc_thresh"]], on="CellId")
+    all_feats = all_feats.loc[all_feats["CellId"] != 724520].reset_index(drop=True)  # nan row
     all_ret = all_ret.loc[all_ret["CellId"] != 724520].reset_index(drop=True)  # nan row
 
     assert not all_feats["mean_surface_area"].isna().any()
@@ -134,7 +130,6 @@ def main(args):
         orig_df[["CellId", "vol_bins", "vol_bins_inds"]],
         on="CellId",
     )
-    from br.features.classification import get_classification_df
 
     all_baseline = []
     all_feats["model"] = "baseline"
@@ -175,9 +170,7 @@ def main(args):
         "(676.015, 818.646)",
         "(818.646, 961.277)",
     ]
-    g = sns.boxplot(
-        ax=ax, data=plot, x="vol_bin", y="top_1_acc", hue="features", order=x_order
-    )
+    g = sns.boxplot(ax=ax, data=plot, x="vol_bin", y="top_1_acc", hue="features", order=x_order)
     plt.xticks(rotation=30)
     ax.set_xticklabels(
         ["0", "1", "2", "3", "4"], rotation=0
    )
@@ -191,6 +184,7 @@ def main(args):
         bbox_inches="tight",
         dpi=300,
     )
+    plot.to_csv(args.save_path + "classification_number_pieces.csv")
 
     # fig.savefig("classification_number_pieces_nogrouping.png", bbox_inches="tight", dpi=300)
@@ -198,18 +192,14 @@
     parser = argparse.ArgumentParser(
         description="Script for computing perturbation detection metrics"
     )
-    parser.add_argument(
-        "--save_path", type=str, required=True, help="Path to save the results."
-    )
+    parser.add_argument("--save_path", type=str, required=True, help="Path to save the results.")
     parser.add_argument(
         "--embeddings_path",
         type=str,
         required=True,
         help="Path to the saved embeddings.",
     )
-    parser.add_argument(
-        "--dataset_name", type=str, required=True, help="Name of the dataset."
-    )
+    parser.add_argument("--dataset_name", type=str, required=True, help="Name of the dataset.")
 
     args = parser.parse_args()
 
     # Validate that required paths are provided
@@ -221,6 +211,6 @@
 
     """
     Example run:
-    
+
     python src/br/analysis/run_classification.py --save_path "./outputs_npm1/" --embeddings_path "./morphology_appropriate_representation_learning/model_embeddings/npm1/" --dataset_name "npm1"
     """
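Note: the `get_surface_area` reformatting above leaves the 6-neighborhood face-counting logic untouched. For intuition, here is a toy sketch of the same idea (a simplified equivalent, not the script's exact erosion-based implementation): a voxel face counts toward the surface when the neighbor across that face is background.

```python
# Toy illustration of 6-neighborhood surface area: count exposed voxel faces.
# Simplified equivalent of get_surface_area's idea, not its exact code.
import numpy as np

img = np.zeros((5, 5, 5), dtype=np.uint8)
img[1:4, 1:4, 1:4] = 1  # a 3x3x3 solid cube; the border is zero, as in the script

faces = 0
for axis in range(3):
    for shift in (1, -1):
        neighbor = np.roll(img, shift, axis=axis)  # safe: zeroed border wraps zeros
        faces += np.logical_and(img == 1, neighbor == 0).sum()

print(faces)  # 6 cube faces x 9 exposed voxels each = 54
```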
diff --git a/src/br/analysis/run_drugdata_LDA.py b/src/br/analysis/run_drugdata_LDA.py
index 8e10a91..175a80c 100644
--- a/src/br/analysis/run_drugdata_LDA.py
+++ b/src/br/analysis/run_drugdata_LDA.py
@@ -2,18 +2,21 @@
 import os
 import sys
 from pathlib import Path
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
-from sklearn.decomposition import PCA
-from tqdm import tqdm
-import numpy as np
-from skimage.io import imread
+
+import matplotlib as mpl
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
-from br.models.compute_features import get_embeddings
+import seaborn as sns
+from skimage import measure
+from skimage.io import imread
+from sklearn.decomposition import PCA
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
+from tqdm import tqdm
+
 from br.analysis.analysis_utils import str2bool
+from br.models.compute_features import get_embeddings
 from br.models.utils import get_all_configs_per_dataset
-from skimage import measure
-import seaborn as sns
@@ -39,9 +42,7 @@ def pad_to_size(array, target_shape, padding_value=0):
 
     padding = [(padding_before[i], padding_after[i]) for i in range(len(target_shape))]
 
-    padded_array = np.pad(
-        array, pad_width=padding, mode="constant", constant_values=padding_value
-    )
+    padded_array = np.pad(array, pad_width=padding, mode="constant", constant_values=padding_value)
 
     return padded_array
@@ -53,9 +54,8 @@ def get_image(cell_id, raw_df):
 
 
 def sort_by_second_element_with_index(data):
-    """
-    Sorts a list of lists based on the second element of each sublist
-    and returns a list of original indices in the sorted order.
+    """Sorts a list of lists based on the second element of each sublist and returns a list of
+    original indices in the sorted order.
 
     Args:
       data: A list of lists.
@@ -92,10 +92,8 @@ def merge_contours(contours, distance_threshold):
 
             # Calculate distances between all pairs of points
             distances = np.sqrt(
-                (
-                    (contour1[:, 0, None] - contour2[:, 0]) ** 2
-                    + (contour1[:, 1, None] - contour2[:, 1]) ** 2
-                )
+                (contour1[:, 0, None] - contour2[:, 0]) ** 2
+                + (contour1[:, 1, None] - contour2[:, 1]) ** 2
             )
 
             # Find the minimum distance
@@ -136,12 +134,8 @@ def main(args):
         args.embeddings_path,
     )
     raw_df = pd.read_csv(Path(args.raw_path) / "manifest.csv")
-    raw_df["crop_raw"] = raw_df["crop_raw"].apply(
-        lambda x: Path(args.raw_path) / Path(x)
-    )
-    raw_df["crop_seg_nuc"] = raw_df["crop_seg_nuc"].apply(
-        lambda x: Path(args.raw_path) / Path(x)
-    )
+    raw_df["crop_raw"] = raw_df["crop_raw"].apply(lambda x: Path(args.raw_path) / Path(x))
+    raw_df["crop_seg_nuc"] = raw_df["crop_seg_nuc"].apply(lambda x: Path(args.raw_path) / Path(x))
 
     map_ = {
         "Actinomyocin D 0.5ug per mL": "Actinomyocin D",
@@ -163,7 +157,7 @@ def main(args):
         "Brefeldin 5uM": "Brefeldin",
     }
     all_ret["condition"] = all_ret["condition"].replace(map_)
-    all_ret = all_ret.merge(raw_df[['CellId', 'plate_id']], on='CellId')
+    all_ret = all_ret.merge(raw_df[["CellId", "plate_id"]], on="CellId")
 
     cols = [i for i in all_ret.columns if "mu" in i]
     hits = [
@@ -196,11 +190,11 @@ def main(args):
     sns.set_context("talk")
 
     if args.baseline:
-        all_ret = all_ret.loc[all_ret['condition'] == 'DMSO (control)']
-        hits = [[215, 214, 231, 213, 232, 230, 233, 216]] # random sample of plates
+        all_ret = all_ret.loc[all_ret["condition"] == "DMSO (control)"]
+        hits = [[215, 214, 231, 213, 232, 230, 233, 216]]  # random sample of plates
         merge_thresh = [11]
-        scale_lows = [i*2 for i in scale_lows]
-        scale_highs = [i*3 for i in scale_highs]
+        scale_lows = [i * 2 for i in scale_lows]
+        scale_highs = [i * 3 for i in scale_highs]
 
     for j, hit in enumerate(hits):
         print("Analysis for", hit)
@@ -210,8 +204,8 @@ def main(args):
             tmp1 = all_ret.loc[all_ret["condition"] == "DMSO (control)"]
             tmp2 = all_ret.loc[all_ret["condition"] == hit]
         else:
-            tmp1 = all_ret.loc[all_ret['plate_id'].isin(hit[:4])]
-            tmp2 = all_ret.loc[all_ret['plate_id'].isin(hit[4:])]
+            tmp1 = all_ret.loc[all_ret["plate_id"].isin(hit[:4])]
+            tmp2 = all_ret.loc[all_ret["plate_id"].isin(hit[4:])]
         tmp1["class"] = 0
         tmp2["class"] = 1
         tmp = pd.concat([tmp1, tmp2], axis=0).reset_index(drop=True)
@@ -222,6 +216,8 @@ def main(args):
         preds = clf.fit_transform(X, y)
         lda_direction = clf.coef_[0]
         lda_line = np.array([-lda_direction * scale_low, lda_direction * scale_high])
+
+        mpl.rcParams["font.size"] = 17
         fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
         colors = plt.cm.Set2(np.linspace(0, 1, 8))
         # PCA Projection plot
@@ -296,6 +292,8 @@ def main(args):
         count = 0
         max_x, max_y = 0, 0
         seen = set()
+        tmp["LDA_line"] = np.nan
+
         for w in tqdm(walk, desc="Traversing PC-LDA line"):
             dist = np.linalg.norm(X - w, axis=1)
             dist_argsort = np.argsort(dist)
@@ -318,7 +316,7 @@ def main(args):
         classes = []
         from scipy import ndimage
 
-        for w in tqdm(walk, desc="Traversing PC-LDA line"):
+        for j_index, w in enumerate(tqdm(walk, desc="Traversing PC-LDA line")):
             dist = np.linalg.norm(X - w, axis=1)
             dist_argsort = np.argsort(dist)
             examples = []
@@ -337,6 +335,7 @@ def main(args):
 
                 this_class = tmp.iloc[idx]["class"]
                 classes.append(this_class)
+                tmp.loc[tmp["CellId"] == this_id, "LDA_line"] = j_index
 
                 ax1.scatter(
                     tmp.iloc[idx]["pc_0"],
@@ -362,6 +361,7 @@ def main(args):
             count += 1
         movie = np.hstack(movie)
         movie2 = np.hstack(movie2)
+        tmp.to_csv(save_path + f"LDA_{hit}.csv")
 
         contours = measure.find_contours(movie2, 0.5)
@@ -390,20 +390,18 @@ def main(args):
     parser = argparse.ArgumentParser(
         description="Script for computing perturbation detection metrics"
     )
-    parser.add_argument(
-        "--save_path", type=str, required=True, help="Path to save the results."
-    )
+    parser.add_argument("--save_path", type=str, required=True, help="Path to save the results.")
     parser.add_argument(
         "--embeddings_path",
         type=str,
         required=True,
         help="Path to the saved embeddings.",
     )
+    parser.add_argument("--dataset_name", type=str, required=True, help="Name of the dataset.")
+    parser.add_argument("--raw_path", type=str, required=True, help="Path to raw data")
     parser.add_argument(
-        "--dataset_name", type=str, required=True, help="Name of the dataset."
+        "--baseline", type=str2bool, default=False, help="Perform LDA baseline only"
     )
-    parser.add_argument("--raw_path", type=str, required=True, help="Path to raw data")
-    parser.add_argument("--baseline", type=str2bool, default=False, help="Perform LDA baseline only")
 
     args = parser.parse_args()
 
     # Validate that required paths are provided
@@ -419,6 +417,6 @@
     For all drugs:
     python src/br/analysis/run_drugdata_LDA.py --save_path "./outputs_npm1_perturb/" --embeddings_path "./morphology_appropriate_representation_learning/model_embeddings/npm1_perturb/" --dataset_name "npm1_perturb" --raw_path "./NPM1_single_cell_drug_perturbations/"
 
-    For baseline (DMSO subset 1 -> DMSO subset 2): 
+    For baseline (DMSO subset 1 -> DMSO subset 2):
     python src/br/analysis/run_drugdata_LDA.py --save_path "./outputs_npm1_perturb/" --embeddings_path "./morphology_appropriate_representation_learning/model_embeddings/npm1_perturb/" --dataset_name "npm1_perturb" --raw_path "./NPM1_single_cell_drug_perturbations/" --baseline True
     """
diff --git a/src/br/features/classification.py b/src/br/features/classification.py
index ad63f17..992a342 100644
--- a/src/br/features/classification.py
+++ b/src/br/features/classification.py
@@ -91,8 +91,6 @@ def get_classification(this_mo, target_col, cols=None):
 
     if cols is None:
         cols = [i for i in this_mo.columns if "mu" in i]
-
-
     clf = proba_logreg(
         random_state=20,
         class_weight=class_weight,
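Closing note: the core pattern in `run_drugdata_LDA.py` is a walk along the LDA discriminant direction (`clf.coef_[0]`), picking the nearest real cell at each step, which is what the `LDA_line` bookkeeping added above records. A minimal sketch follows, with synthetic two-class data standing in for the real embeddings.

```python
# Minimal sketch of the PC-LDA walk pattern from run_drugdata_LDA.py:
# fit an LDA between two conditions, then step along the discriminant
# direction and grab the nearest sample at each step. Synthetic data.
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0.0, 1.0, (100, 8)), rng.normal(1.5, 1.0, (100, 8))])
y = np.array([0] * 100 + [1] * 100)

clf = LDA(n_components=1)
clf.fit(X, y)
direction = clf.coef_[0] / np.linalg.norm(clf.coef_[0])

for step in np.linspace(-2, 2, 9):  # walk the discriminant axis
    point = step * direction
    nearest = int(np.argmin(np.linalg.norm(X - point, axis=1)))
    print(f"step {step:+.1f} -> nearest sample {nearest} (class {y[nearest]})")
```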