6 changes: 3 additions & 3 deletions docs/PREPROCESSING.md
@@ -52,7 +52,7 @@ src

# Polymorphic structures: Generate SDFs

Use the segmentation data for polymorphic structures as input to the SDF generation step.

```
src
@@ -65,8 +65,8 @@ src
         └── pc_sdfs.py <- Sample point clouds from scaled meshes
```

The scale factors can be computed using the `get_max_bounding_box` script. Alternatively, the pre-computed scale factors can be downloaded along with the rest of the preprocessed data. The following scale factors are available for download:

1. [WTC-11 hIPSc single cell image dataset v1 nucleolus (NPM1)](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/npm1/scale_factor.npz)
2. [WTC-11 hIPSc single cell image dataset v1 nucleolus (NPM1) 64 resolution](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/npm1_64_res/scale_factor.npz)
3. [WTC-11 hIPSc single cell image dataset v1 polymorphic structures](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/other_polymorphic/scale_factor.npz)
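
As a quick sanity check after downloading, the contents of a `scale_factor.npz` archive can be listed with NumPy. This is a minimal sketch; the local path and the key names inside the archive depend on where the data was downloaded and are not specified here:

```
import numpy as np

# Hypothetical local path to a downloaded scale factor archive; adjust to your copy
scale_factors = np.load("preprocessed_data/npm1/scale_factor.npz")

# Print every array stored in the archive along with its shape
for key in scale_factors.files:
    print(key, scale_factors[key].shape)
```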
1 change: 0 additions & 1 deletion docs/USAGE.md
@@ -226,4 +226,3 @@ To run a baseline LDA on the DMSO subset of the drug perturbation dataset, run
```
python src/br/analysis/run_drugdata_LDA.py --save_path "./outputs_npm1_perturb/" --embeddings_path "./morphology_appropriate_representation_learning/model_embeddings/npm1_perturb/" --dataset_name "npm1_perturb" --raw_path "./NPM1_single_cell_drug_perturbations/" --baseline True
```

85 changes: 83 additions & 2 deletions src/br/analysis/analysis_utils.py
@@ -634,6 +634,56 @@ def archetypes_polymorphic(this_save_path, archetypes_df, all_ret, all_features)
    arch_dict.to_csv(this_save_path / "archetypes.csv")


def make_pca_score_plot(this_save_path, all_ret, stratify_key, dataset_name):
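    # Project the latent "mu" features onto principal components, scatter PC1 vs PC2
    # colored by `stratify_key`, and save the figure (png/pdf) plus the PC table (csv).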
    cols = [i for i in all_ret.columns if "mu" in i]
    feats = all_ret[cols].values

    pca = PCA()
    pca_feats = pca.fit_transform(feats)
    pca_feats = pd.DataFrame(pca_feats, columns=[f"PC_{i+1}" for i in range(len(cols))])
    pca_feats = pd.concat([pca_feats, all_ret], axis=1)

    n_colors = len(all_ret[stratify_key].unique())
    colors = sns.color_palette("Paired", n_colors)

    hue_order = None
    if dataset_name == "pcna":
        hue_order = [
            "G1",
            "earlyS",
            "earlyS-midS",
            "midS",
            "midS-lateS",
            "lateS",
            "lateS-G2",
            "G2",
        ]
        colors = (
            sns.color_palette("Greens", 2)
            + sns.color_palette("Reds", 4)
            + sns.color_palette("Purples", 2)
        )
    elif dataset_name == "other_polymorphic":
        hue_order = ["1.0", "2.0", "3.0", "4.0", ">=5"]

    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    g = sns.scatterplot(
        ax=ax,
        data=pca_feats,
        x="PC_1",
        y="PC_2",
        hue=stratify_key,
        hue_order=hue_order,
        palette=colors,
    )
    g.legend(loc="center left", bbox_to_anchor=(1, 0.5), ncol=1)

    fig.savefig(this_save_path / f"pcs_{dataset_name}.png", bbox_inches="tight", dpi=300)
    fig.savefig(this_save_path / f"pcs_{dataset_name}.pdf", bbox_inches="tight", dpi=300)

    pca_feats.to_csv(this_save_path / f"pcs_{dataset_name}.csv")


def make_pacmap(this_save_path, all_ret, feats_archs):

    cols = [i for i in all_ret.columns if "mu" in i]
@@ -642,8 +692,36 @@ def make_pacmap(this_save_path, all_ret, feats_archs):
    X_transformed = embedding.fit_transform(feats, init="pca")

    archs_transform = embedding.transform(feats_archs, init="pca", basis=feats)

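    # Map gene names to the intracellular structures they tag, for readable legend labels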
    gene_to_struct = {
        "NUP153": "Nuclear Pores",
        "FBL": "Nucleoli (DFC)",
        "LAMP1": "Lysosomes",
        "ST6GAL1": "Golgi",
        "SON": "Nuclear Speckles",
        "HIST1H2BJ": "Histones",
        "SMC1A": "Cohesins",
        "CETN2": "Centrioles",
        "SLC25A17": "Peroxisomes",
        "RAB5A": "Endosomes",
        "NPM1": "Nucleoli (GC)",
    }
    all_ret["structure_name"] = all_ret["structure_name"].apply(lambda x: gene_to_struct[x])

    labels = all_ret["structure_name"].values
    colors = sns.color_palette("Paired", len(np.unique(labels)))
    if "Golgi" in labels:
        colors = sns.color_palette("Set2", len(np.unique(labels)))
    else:
        colors = sns.color_palette("Paired", len(np.unique(labels)))

    tmp = pd.DataFrame(
        X_transformed, columns=[f"PACMAP_{i}" for i in range(X_transformed.shape[1])]
    )
    tmp2 = pd.DataFrame(
        archs_transform, columns=[f"Archetype_{i}" for i in range(archs_transform.shape[1])]
    )
    tmp = pd.concat([tmp, tmp2, all_ret[["structure_name"]]], axis=1)
    tmp.to_csv(this_save_path / "pacmap_archetypes.csv")

    cdict = {i: colors[j] for j, i in enumerate(np.unique(labels))}

@@ -659,7 +737,10 @@ def make_pacmap(this_save_path, all_ret, feats_archs):
alpha=0.6,
)
ax.legend()
lgnd = plt.legend(loc="upper right", numpoints=1, fontsize=10)
if "Golgi" in labels:
lgnd = plt.legend(loc="upper left", numpoints=1, fontsize=10)
else:
lgnd = plt.legend(loc="upper right", numpoints=1, fontsize=10)

# change the marker size manually for both lines
for handle in lgnd.legend_handles:
3 changes: 3 additions & 0 deletions src/br/analysis/run_analysis.py
@@ -12,6 +12,7 @@
    latent_walk_polymorphic,
    latent_walk_save_recons,
    make_pacmap,
    make_pca_score_plot,
    pseudo_time_analysis,
    setup_gpu,
    str2bool,
@@ -48,6 +49,8 @@ def main(args):
    this_save_path = Path(args.save_path) / Path("latent_walks")
    this_save_path.mkdir(parents=True, exist_ok=True)

    make_pca_score_plot(this_save_path, all_ret, stratify_key, args.dataset_name)

    if args.sdf:
        latent_walk_polymorphic(stratify_key, all_ret, this_save_path, latent_dim)
    else:
46 changes: 18 additions & 28 deletions src/br/analysis/run_classification.py
@@ -2,16 +2,18 @@
import os
import sys
from pathlib import Path
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
from br.models.compute_features import get_embeddings
from br.models.utils import get_all_configs_per_dataset
import pandas as pd
import seaborn as sns
from skimage import measure as skmeasure
from skimage import morphology as skmorpho
from tqdm import tqdm

from br.features.classification import get_classification_df
import matplotlib.pyplot as plt
import seaborn as sns
from br.models.compute_features import get_embeddings
from br.models.utils import get_all_configs_per_dataset


def get_surface_area(input_img):
@@ -20,9 +22,9 @@ def get_surface_area(input_img):
input_img[:, :, [0, -1]] = 0
input_img[:, [0, -1], :] = 0
input_img[[0, -1], :, :] = 0
input_img_surface = np.logical_xor(
input_img, skmorpho.binary_erosion(input_img)
).astype(np.uint8)
input_img_surface = np.logical_xor(input_img, skmorpho.binary_erosion(input_img)).astype(
np.uint8
)
# Loop through the boundary voxels to calculate the number of
# boundary faces. Using 6-neighborhod.
pxl_z, pxl_y, pxl_x = np.nonzero(input_img_surface)
@@ -113,28 +115,21 @@ def main(args):
feats["CellId"] = row["CellId"]
all_feats.append(pd.DataFrame(feats, index=[0]))
all_feats = pd.concat(all_feats, axis=0)
all_feats = all_feats.merge(
orig_df[["CellId", "vol_bins", "vol_bins_inds"]], on="CellId"
)
all_feats = all_feats.merge(orig_df[["CellId", "vol_bins", "vol_bins_inds"]], on="CellId")
all_feats["mean_volume"] = all_feats["shape_volume"] / all_feats["connectivity_cc"]
all_feats["mean_surface_area"] = (
all_feats["roundness_surface_area"] / all_feats["connectivity_cc"]
)

all_feats = all_feats.merge(
orig_df[["CellId", "STR_connectivity_cc_thresh"]], on="CellId"
)
all_feats = all_feats.loc[all_feats["CellId"] != 724520].reset_index(
drop=True
) # nan row
all_feats = all_feats.merge(orig_df[["CellId", "STR_connectivity_cc_thresh"]], on="CellId")
all_feats = all_feats.loc[all_feats["CellId"] != 724520].reset_index(drop=True) # nan row
all_ret = all_ret.loc[all_ret["CellId"] != 724520].reset_index(drop=True) # nan row
assert not all_feats["mean_surface_area"].isna().any()

all_ret = all_ret.merge(
orig_df[["CellId", "vol_bins", "vol_bins_inds"]],
on="CellId",
)
from br.features.classification import get_classification_df

all_baseline = []
all_feats["model"] = "baseline"
@@ -175,9 +170,7 @@
"(676.015, 818.646)",
"(818.646, 961.277)",
]
g = sns.boxplot(
ax=ax, data=plot, x="vol_bin", y="top_1_acc", hue="features", order=x_order
)
g = sns.boxplot(ax=ax, data=plot, x="vol_bin", y="top_1_acc", hue="features", order=x_order)
plt.xticks(rotation=30)
ax.set_xticklabels(
["0", "1", "2", "3", "4"], rotation=0
@@ -191,25 +184,22 @@ def main(args):
bbox_inches="tight",
dpi=300,
)
plot.to_csv(args.save_path + "classification_number_pieces.csv")
# fig.savefig("classification_number_pieces_nogrouping.png", bbox_inches="tight", dpi=300)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Script for computing perturbation detection metrics"
)
parser.add_argument(
"--save_path", type=str, required=True, help="Path to save the results."
)
parser.add_argument("--save_path", type=str, required=True, help="Path to save the results.")
parser.add_argument(
"--embeddings_path",
type=str,
required=True,
help="Path to the saved embeddings.",
)
parser.add_argument(
"--dataset_name", type=str, required=True, help="Name of the dataset."
)
parser.add_argument("--dataset_name", type=str, required=True, help="Name of the dataset.")
args = parser.parse_args()

# Validate that required paths are provided
@@ -221,6 +211,6 @@ def main(args):

"""
Example run:

python src/br/analysis/run_classification.py --save_path "./outputs_npm1/" --embeddings_path "./morphology_appropriate_representation_learning/model_embeddings/npm1/" --dataset_name "npm1"
"""