6 changes: 3 additions & 3 deletions docs/PREPROCESSING.md
@@ -52,7 +52,7 @@ src

# Polymorphic structures: Generate SDFs

Use the segmentation data for polymorphic structures as input to the SDF generation step.

```
src
@@ -65,8 +65,8 @@ src
         └── pc_sdfs.py <- Sample point clouds from scaled meshes
```

The scale factors can be computed using the `get_max_bounding_box` script. Alternatively, the pre-computed scale factors can be downloaded along with the rest of the preprocessed data. The following scale factors are available for download:

1. [WTC-11 hIPSc single cell image dataset v1 nucleolus (NPM1)](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/npm1/scale_factor.npz)
2. [WTC-11 hIPSc single cell image dataset v1 nucleolus (NPM1) 64 resolution](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/npm1_64_res/scale_factor.npz)
3. [WTC-11 hIPSc single cell image dataset v1 polymorphic structures](https://open.quiltdata.com/b/allencell/tree/aics/morphology_appropriate_representation_learning/preprocessed_data/other_polymorphic/scale_factor.npz)
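
As a quick sanity check after downloading, the contents of a `scale_factor.npz` archive can be listed with NumPy. This is a minimal sketch; the local path and the key names inside the archive depend on where the data was downloaded and are not specified here:

```
import numpy as np

# Hypothetical local path to a downloaded scale factor archive; adjust to your copy
scale_factors = np.load("preprocessed_data/npm1/scale_factor.npz")

# Print every array stored in the archive along with its shape
for key in scale_factors.files:
    print(key, scale_factors[key].shape)
```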
1 change: 0 additions & 1 deletion docs/USAGE.md
@@ -226,4 +226,3 @@ To run a baseline LDA on the DMSO subset of the drug perturbation dataset, run
```
python src/br/analysis/run_drugdata_LDA.py --save_path "./outputs_npm1_perturb/" --embeddings_path "./morphology_appropriate_representation_learning/model_embeddings/npm1_perturb/" --dataset_name "npm1_perturb" --raw_path "./NPM1_single_cell_drug_perturbations/" --baseline True
```

85 changes: 83 additions & 2 deletions src/br/analysis/analysis_utils.py
@@ -634,6 +634,56 @@ def archetypes_polymorphic(this_save_path, archetypes_df, all_ret, all_features)
    arch_dict.to_csv(this_save_path / "archetypes.csv")


def make_pca_score_plot(this_save_path, all_ret, stratify_key, dataset_name):
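    # Project the latent "mu" features onto principal components, scatter PC1 vs PC2
    # colored by `stratify_key`, and save the figure (png/pdf) plus the PC table (csv).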
    cols = [i for i in all_ret.columns if "mu" in i]
    feats = all_ret[cols].values

    pca = PCA()
    pca_feats = pca.fit_transform(feats)
    pca_feats = pd.DataFrame(pca_feats, columns=[f"PC_{i+1}" for i in range(len(cols))])
    pca_feats = pd.concat([pca_feats, all_ret], axis=1)

    n_colors = len(all_ret[stratify_key].unique())
    colors = sns.color_palette("Paired", n_colors)

    hue_order = None
    if dataset_name == "pcna":
        hue_order = [
            "G1",
            "earlyS",
            "earlyS-midS",
            "midS",
            "midS-lateS",
            "lateS",
            "lateS-G2",
            "G2",
        ]
        colors = (
            sns.color_palette("Greens", 2)
            + sns.color_palette("Reds", 4)
            + sns.color_palette("Purples", 2)
        )
    elif dataset_name == "other_polymorphic":
        hue_order = ["1.0", "2.0", "3.0", "4.0", ">=5"]

    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    g = sns.scatterplot(
        ax=ax,
        data=pca_feats,
        x="PC_1",
        y="PC_2",
        hue=stratify_key,
        hue_order=hue_order,
        palette=colors,
    )
    g.legend(loc="center left", bbox_to_anchor=(1, 0.5), ncol=1)

    fig.savefig(this_save_path / f"pcs_{dataset_name}.png", bbox_inches="tight", dpi=300)
    fig.savefig(this_save_path / f"pcs_{dataset_name}.pdf", bbox_inches="tight", dpi=300)

    pca_feats.to_csv(this_save_path / f"pcs_{dataset_name}.csv")


def make_pacmap(this_save_path, all_ret, feats_archs):

    cols = [i for i in all_ret.columns if "mu" in i]
@@ -642,8 +692,36 @@ def make_pacmap(this_save_path, all_ret, feats_archs):
    X_transformed = embedding.fit_transform(feats, init="pca")

    archs_transform = embedding.transform(feats_archs, init="pca", basis=feats)

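    # Map gene names to the intracellular structures they tag, for readable legend labels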
    gene_to_struct = {
        "NUP153": "Nuclear Pores",
        "FBL": "Nucleoli (DFC)",
        "LAMP1": "Lysosomes",
        "ST6GAL1": "Golgi",
        "SON": "Nuclear Speckles",
        "HIST1H2BJ": "Histones",
        "SMC1A": "Cohesins",
        "CETN2": "Centrioles",
        "SLC25A17": "Peroxisomes",
        "RAB5A": "Endosomes",
        "NPM1": "Nucleoli (GC)",
    }
    all_ret["structure_name"] = all_ret["structure_name"].apply(lambda x: gene_to_struct[x])

    labels = all_ret["structure_name"].values
    colors = sns.color_palette("Paired", len(np.unique(labels)))
    if "Golgi" in labels:
        colors = sns.color_palette("Set2", len(np.unique(labels)))
    else:
        colors = sns.color_palette("Paired", len(np.unique(labels)))

    tmp = pd.DataFrame(
        X_transformed, columns=[f"PACMAP_{i}" for i in range(X_transformed.shape[1])]
    )
    tmp2 = pd.DataFrame(
        archs_transform, columns=[f"Archetype_{i}" for i in range(archs_transform.shape[1])]
    )
    tmp = pd.concat([tmp, tmp2, all_ret[["structure_name"]]], axis=1)
    tmp.to_csv(this_save_path / "pacmap_archetypes.csv")

    cdict = {i: colors[j] for j, i in enumerate(np.unique(labels))}

@@ -659,7 +737,10 @@ def make_pacmap(this_save_path, all_ret, feats_archs):
alpha=0.6,
)
ax.legend()
lgnd = plt.legend(loc="upper right", numpoints=1, fontsize=10)
if "Golgi" in labels:
lgnd = plt.legend(loc="upper left", numpoints=1, fontsize=10)
else:
lgnd = plt.legend(loc="upper right", numpoints=1, fontsize=10)

# change the marker size manually for both lines
for handle in lgnd.legend_handles:
3 changes: 3 additions & 0 deletions src/br/analysis/run_analysis.py
@@ -12,6 +12,7 @@
    latent_walk_polymorphic,
    latent_walk_save_recons,
    make_pacmap,
    make_pca_score_plot,
    pseudo_time_analysis,
    setup_gpu,
    str2bool,
@@ -48,6 +49,8 @@ def main(args):
    this_save_path = Path(args.save_path) / Path("latent_walks")
    this_save_path.mkdir(parents=True, exist_ok=True)

    make_pca_score_plot(this_save_path, all_ret, stratify_key, args.dataset_name)

    if args.sdf:
        latent_walk_polymorphic(stratify_key, all_ret, this_save_path, latent_dim)
    else:
46 changes: 18 additions & 28 deletions src/br/analysis/run_classification.py
@@ -2,16 +2,18 @@
import os
import sys
from pathlib import Path
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
from br.models.compute_features import get_embeddings
from br.models.utils import get_all_configs_per_dataset
import pandas as pd
import seaborn as sns
from skimage import measure as skmeasure
from skimage import morphology as skmorpho
from tqdm import tqdm

from br.features.classification import get_classification_df
import matplotlib.pyplot as plt
import seaborn as sns
from br.models.compute_features import get_embeddings
from br.models.utils import get_all_configs_per_dataset


def get_surface_area(input_img):
@@ -20,9 +22,9 @@ def get_surface_area(input_img):
input_img[:, :, [0, -1]] = 0
input_img[:, [0, -1], :] = 0
input_img[[0, -1], :, :] = 0
input_img_surface = np.logical_xor(
input_img, skmorpho.binary_erosion(input_img)
).astype(np.uint8)
input_img_surface = np.logical_xor(input_img, skmorpho.binary_erosion(input_img)).astype(
np.uint8
)
# Loop through the boundary voxels to calculate the number of
# boundary faces. Using 6-neighborhod.
pxl_z, pxl_y, pxl_x = np.nonzero(input_img_surface)
@@ -113,28 +115,21 @@ def main(args):
feats["CellId"] = row["CellId"]
all_feats.append(pd.DataFrame(feats, index=[0]))
all_feats = pd.concat(all_feats, axis=0)
all_feats = all_feats.merge(
orig_df[["CellId", "vol_bins", "vol_bins_inds"]], on="CellId"
)
all_feats = all_feats.merge(orig_df[["CellId", "vol_bins", "vol_bins_inds"]], on="CellId")
all_feats["mean_volume"] = all_feats["shape_volume"] / all_feats["connectivity_cc"]
all_feats["mean_surface_area"] = (
all_feats["roundness_surface_area"] / all_feats["connectivity_cc"]
)

all_feats = all_feats.merge(
orig_df[["CellId", "STR_connectivity_cc_thresh"]], on="CellId"
)
all_feats = all_feats.loc[all_feats["CellId"] != 724520].reset_index(
drop=True
) # nan row
all_feats = all_feats.merge(orig_df[["CellId", "STR_connectivity_cc_thresh"]], on="CellId")
all_feats = all_feats.loc[all_feats["CellId"] != 724520].reset_index(drop=True) # nan row
all_ret = all_ret.loc[all_ret["CellId"] != 724520].reset_index(drop=True) # nan row
assert not all_feats["mean_surface_area"].isna().any()

all_ret = all_ret.merge(
orig_df[["CellId", "vol_bins", "vol_bins_inds"]],
on="CellId",
)
from br.features.classification import get_classification_df

all_baseline = []
all_feats["model"] = "baseline"
@@ -175,9 +170,7 @@
"(676.015, 818.646)",
"(818.646, 961.277)",
]
g = sns.boxplot(
ax=ax, data=plot, x="vol_bin", y="top_1_acc", hue="features", order=x_order
)
g = sns.boxplot(ax=ax, data=plot, x="vol_bin", y="top_1_acc", hue="features", order=x_order)
plt.xticks(rotation=30)
ax.set_xticklabels(
["0", "1", "2", "3", "4"], rotation=0
@@ -191,25 +184,22 @@ def main(args):
bbox_inches="tight",
dpi=300,
)
plot.to_csv(args.save_path + "classification_number_pieces.csv")
# fig.savefig("classification_number_pieces_nogrouping.png", bbox_inches="tight", dpi=300)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Script for computing perturbation detection metrics"
)
parser.add_argument(
"--save_path", type=str, required=True, help="Path to save the results."
)
parser.add_argument("--save_path", type=str, required=True, help="Path to save the results.")
parser.add_argument(
"--embeddings_path",
type=str,
required=True,
help="Path to the saved embeddings.",
)
parser.add_argument(
"--dataset_name", type=str, required=True, help="Name of the dataset."
)
parser.add_argument("--dataset_name", type=str, required=True, help="Name of the dataset.")
args = parser.parse_args()

# Validate that required paths are provided
@@ -221,6 +211,6 @@ def main(args):

"""
Example run:

python src/br/analysis/run_classification.py --save_path "./outputs_npm1/" --embeddings_path "./morphology_appropriate_representation_learning/model_embeddings/npm1/" --dataset_name "npm1"
"""