From 9dfa487b0ce79c52e12adb24e972ad2069ec2fd1 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 17 Sep 2024 11:42:01 -0700 Subject: [PATCH 01/68] add mean and std over growth phase --- .../lib/preprocessing/add_features.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index fca6c9be..4d47d501 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -140,6 +140,63 @@ def add_feature_at(df, frame_column, feature, feature_column, multiplier=1): ) return df +def add_mean_feature_over_trajectory(df, feature_list, multiplier_list): + """ + Add the mean of a given feature over the growth trajectory + from transition to frame breakdown. + + Parameters + ---------- + df : DataFrame + The dataframe + feature_list : list + List of column names + multiplier_list : list + List of scale to multiply the mean by + + Returns + ------- + df : DataFrame + The dataframe with the added mean feature columns + """ + for feature, multiplier in zip(feature_list, multiplier_list): + for tid, dft in df.groupby("track_id"): + start = dft.frame_transition.unique()[0] + stop = dft.Fb.unique()[0] + df_mean = dft[(dft['index_sequence'] >= start) & (dft['index_sequence'] <= stop)] + mean = df_mean[feature].mean() * multiplier + df.loc[df.track_id == tid, f"mean_{feature}"] = mean + return df + + +def add_std_feature_over_trajectory(df, feature_list, multiplier_list): + """ + Add the standard deviation of a given feature over the growth trajectory + from transition to frame breakdown. 
+ + Parameters + ---------- + df : DataFrame + The dataframe + feature_list : list + List of column names + multiplier_list : list + List of scale to multiply the mean by + + Returns + ------- + df : DataFrame + The dataframe with the added deviation feature columns + """ + for feature, multiplier in zip(feature_list, multiplier_list): + for tid, dft in df.groupby("track_id"): + start = dft.frame_transition.unique()[0] + stop = dft.Fb.unique()[0] + df_std = dft[(dft['index_sequence'] >= start) & (dft['index_sequence'] <= stop)] + std = df_std[feature].std() * multiplier + df.loc[df.track_id == tid, f"std_{feature}"] = std + return df + def add_volume_at(df, pixel_size, frame_column): """ From 7680ba1b6f27c181bab74ec61cae9adca254720d Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 17 Sep 2024 12:03:11 -0700 Subject: [PATCH 02/68] add features for LRM --- .../lib/preprocessing/global_dataset_filtering.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 23b9b8af..d7a4b582 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -233,7 +233,11 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_location_at(df_full, frame, "y") df_full = add_features.add_time_at(df_full, frame, interval) df_full = add_features.add_colony_time_at(df_full, frame, interval) - + # For LRM + df_full = add_features.add_feature_at(df_full, frame, 'height', 'height_percentile', pix_size) + df_full = add_features.add_feature_at(df_full, frame, 'density', 'density', pix_size) + df_full = add_features.add_feature_at(df_full, frame, 'aspect_ratio', 'aspect_ratio') + df_full = add_features.add_duration_in_frames(df_full, "Ff", "frame_transition") df_full = 
add_features.add_duration_in_frames(df_full, "frame_transition", "Fb") df_full = add_features.add_duration_in_frames(df_full, "Ff", "Fb") @@ -244,6 +248,12 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_growth_features.add_early_growth_rate(df_full, interval) df_full = add_growth_features.add_late_growth_rate_by_endpoints(df_full) df_full = add_growth_features.fit_tracks_to_time_powerlaw(df_full, "volume", interval) + + # For LRM + ft_list = ['height', 'density', 'volume', 'mesh_sa'] + multiplier_list = [pix_size, 1 / pix_size**2, pix_size**3, pix_size**2] + df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) + df_full = add_features.add_std_feature_over_trajectory(df_full, ft_list, multiplier_list) # Add flag for use after merging back to main manifest df_full = add_features.add_full_track_flag(df_full) From e92a4aa20125308eecd1f0676d0d2636a169e019 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 17 Sep 2024 12:57:03 -0700 Subject: [PATCH 03/68] update added features --- .../lib/preprocessing/global_dataset_filtering.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index d7a4b582..739afbe5 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -236,7 +236,9 @@ def process_full_tracks(df_all, thresh, pix_size, interval): # For LRM df_full = add_features.add_feature_at(df_full, frame, 'height', 'height_percentile', pix_size) df_full = add_features.add_feature_at(df_full, frame, 'density', 'density', pix_size) - df_full = add_features.add_feature_at(df_full, frame, 'aspect_ratio', 'aspect_ratio') + df_full = add_features.add_feature_at(df_full, frame, 'xy_aspect', 'xy_aspect') + df_full = add_features.add_feature_at(df_full, frame, 
'SA_vol_ratio', 'SA_vol_ratio') + df_full = add_features.add_duration_in_frames(df_full, "Ff", "frame_transition") df_full = add_features.add_duration_in_frames(df_full, "frame_transition", "Fb") @@ -250,7 +252,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_growth_features.fit_tracks_to_time_powerlaw(df_full, "volume", interval) # For LRM - ft_list = ['height', 'density', 'volume', 'mesh_sa'] + ft_list = ['height', 'density', 'volume', 'mesh_sa', 'xy_aspect', 'SA_vol_ratio'] multiplier_list = [pix_size, 1 / pix_size**2, pix_size**3, pix_size**2] df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) df_full = add_features.add_std_feature_over_trajectory(df_full, ft_list, multiplier_list) From 99a9f26b588434d54d30396e72b5020d660f36f4 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 17 Sep 2024 13:01:44 -0700 Subject: [PATCH 04/68] update documentation --- nuc_morph_analysis/lib/preprocessing/add_features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 4d47d501..41b068ec 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -181,12 +181,12 @@ def add_std_feature_over_trajectory(df, feature_list, multiplier_list): feature_list : list List of column names multiplier_list : list - List of scale to multiply the mean by + List of scale to multiply the std by Returns ------- df : DataFrame - The dataframe with the added deviation feature columns + The dataframe with the added standard deviation feature columns """ for feature, multiplier in zip(feature_list, multiplier_list): for tid, dft in df.groupby("track_id"): From 7ace64f96990558e6e21eccee217c3280301f6be Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 17 Sep 2024 13:14:15 -0700 Subject: [PATCH 05/68] update 
features, simplify --- .../lib/preprocessing/global_dataset_filtering.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 739afbe5..f7fcd61d 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -233,13 +233,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_location_at(df_full, frame, "y") df_full = add_features.add_time_at(df_full, frame, interval) df_full = add_features.add_colony_time_at(df_full, frame, interval) - # For LRM - df_full = add_features.add_feature_at(df_full, frame, 'height', 'height_percentile', pix_size) - df_full = add_features.add_feature_at(df_full, frame, 'density', 'density', pix_size) - df_full = add_features.add_feature_at(df_full, frame, 'xy_aspect', 'xy_aspect') - df_full = add_features.add_feature_at(df_full, frame, 'SA_vol_ratio', 'SA_vol_ratio') - - + df_full = add_features.add_duration_in_frames(df_full, "Ff", "frame_transition") df_full = add_features.add_duration_in_frames(df_full, "frame_transition", "Fb") df_full = add_features.add_duration_in_frames(df_full, "Ff", "Fb") @@ -252,6 +246,13 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_growth_features.fit_tracks_to_time_powerlaw(df_full, "volume", interval) # For LRM + df_full = add_features.add_feature_at(df_full, "frame_transition", 'height', 'height_percentile', pix_size) + df_full = add_features.add_feature_at(df_full, "frame_transition", 'density', 'density', pix_size) + df_full = add_features.add_feature_at(df_full, "frame_transition", 'xy_aspect', 'xy_aspect') + df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') + df_full = add_features.add_feature_at(df_full, "frame_transition", 
'SA_vol_ratio', 'SA_vol_ratio') + df_full = add_features.add_feature_at(df_full, "frame_transition", 'transient_gr_whole_colony', 'neighbor_avg_dxdt_48_volume_whole_colony') + ft_list = ['height', 'density', 'volume', 'mesh_sa', 'xy_aspect', 'SA_vol_ratio'] multiplier_list = [pix_size, 1 / pix_size**2, pix_size**3, pix_size**2] df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) From 94d7cf692cb26825e4bff0c0f1cf55d6dd31901d Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 17 Sep 2024 13:35:04 -0700 Subject: [PATCH 06/68] Add transient gr of colony at tp, mean, std --- .../lib/preprocessing/global_dataset_filtering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index f7fcd61d..95faf26c 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -253,7 +253,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') df_full = add_features.add_feature_at(df_full, "frame_transition", 'transient_gr_whole_colony', 'neighbor_avg_dxdt_48_volume_whole_colony') - ft_list = ['height', 'density', 'volume', 'mesh_sa', 'xy_aspect', 'SA_vol_ratio'] + ft_list = ['height', 'density', 'volume', 'mesh_sa', 'xy_aspect', 'SA_vol_ratio', 'neighbor_avg_dxdt_48_volume_whole_colony'] multiplier_list = [pix_size, 1 / pix_size**2, pix_size**3, pix_size**2] df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) df_full = add_features.add_std_feature_over_trajectory(df_full, ft_list, multiplier_list) From 845b3fe965d5b218e3da02afce61c030dd8de159 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 17 Sep 2024 14:08:41 -0700 Subject: [PATCH 07/68] update 
features and scale --- .../lib/preprocessing/global_dataset_filtering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 95faf26c..b5f05172 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -251,10 +251,10 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_feature_at(df_full, "frame_transition", 'xy_aspect', 'xy_aspect') df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') - df_full = add_features.add_feature_at(df_full, "frame_transition", 'transient_gr_whole_colony', 'neighbor_avg_dxdt_48_volume_whole_colony') + df_full = add_features.add_feature_at(df_full, "frame_transition", 'neighbor_avg_dxdt_48_volume_whole_colony', 'neighbor_avg_dxdt_48_volume_whole_colony') ft_list = ['height', 'density', 'volume', 'mesh_sa', 'xy_aspect', 'SA_vol_ratio', 'neighbor_avg_dxdt_48_volume_whole_colony'] - multiplier_list = [pix_size, 1 / pix_size**2, pix_size**3, pix_size**2] + multiplier_list = [pix_size, 1 / pix_size**2, pix_size**3, pix_size**2, 1, 1, 1] df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) df_full = add_features.add_std_feature_over_trajectory(df_full, ft_list, multiplier_list) From 097546fbf5ca86017ad91912442ff6ae5334f367 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 17 Sep 2024 14:40:05 -0700 Subject: [PATCH 08/68] update label tables --- .../lib/visualization/label_tables.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py 
index e1f27b2b..7df604ac 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -131,8 +131,10 @@ def get_scale_factor_table(dataset="all_baseline"): ("Ff", "frame_formation"): "Formation", ("frame_inflection", "frame_transition"): "Transtion", ("Fb", "frame_breakdown"): "Breakdown", + "time_at_A": "Time at Formation", "time_at_B": "Movie Time at Transition", "time_at_C": "Movie Time at Breakdown", + "colony_time_at_A": "Aligned Colony Time at Formation", "colony_time_at_B": "Aligned Colony Time at Transition", "colony_time_at_C": "Aligned Colony Time at Breakdown", "duration_AB": "Rapid Expansion Duration", @@ -216,8 +218,28 @@ def get_scale_factor_table(dataset="all_baseline"): "colony_area": "area of colony (brightfield)", "nucleus_colony_area_ratio": "ratio of nuclear area to colony area", "seg_twoD_zMIP_area": "total projected nuclear area", + # LRM feats + "height_at_B": "Height at start of growth", + "density_at_B": "Density at start of growth", + "xy_aspect_at_B": "XY aspect ratio at start of growth", + "SA_vol_ratio_at_B": "SA/Volume ratio at start of growth", + "neighbor_avg_dxdt_48_volume_whole_colony_at_B": "Transient growth rate of whole colony at start of growth", + "mean_volume": "Mean volume during growth", + "mean_height": "Mean height during growth", + "mean_density": "Mean density during growth", + "mean_mesh_sa": "Mean surface area during growth", + "mean_xy_aspect": "Mean XY aspect ratio during growth", + "mean_SA_vol_ratio": "Mean SA/Volume ratio during growth", + "mean_neighbor_avg_dxdt_48_volume_whole_colony": "Mean transient growth rate of whole colony during growth", + "std_volume": "Standard deviation volume during growth", + "std_height": "Standard deviation height during growth", + "std_density": "Standard deviation density during growth", + "std_mesh_sa": "Standard deviation surface area during growth", + "std_xy_aspect": "Standard deviation XY aspect ratio during 
growth", + "std_SA_vol_ratio": "Standard deviation SA/Volume ratio during growth", + "std_neighbor_avg_dxdt_48_volume_whole_colony": "Standard deviation transient growth rate of whole colony during growth", } -# now add the dxdt columns + def convert_to_hr(bin_interval, dataset="all_baseline"): From d6fd7abaa072f46b7903a2974d30b642a206fd02 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 20 Sep 2024 11:29:35 -0700 Subject: [PATCH 09/68] Use .values instead of .unique() --- nuc_morph_analysis/lib/preprocessing/add_features.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 41b068ec..56bf9903 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -161,8 +161,8 @@ def add_mean_feature_over_trajectory(df, feature_list, multiplier_list): """ for feature, multiplier in zip(feature_list, multiplier_list): for tid, dft in df.groupby("track_id"): - start = dft.frame_transition.unique()[0] - stop = dft.Fb.unique()[0] + start = dft.frame_transition.values[0] + stop = dft.Fb.values[0] df_mean = dft[(dft['index_sequence'] >= start) & (dft['index_sequence'] <= stop)] mean = df_mean[feature].mean() * multiplier df.loc[df.track_id == tid, f"mean_{feature}"] = mean @@ -190,8 +190,8 @@ def add_std_feature_over_trajectory(df, feature_list, multiplier_list): """ for feature, multiplier in zip(feature_list, multiplier_list): for tid, dft in df.groupby("track_id"): - start = dft.frame_transition.unique()[0] - stop = dft.Fb.unique()[0] + start = dft.frame_transition.values[0] + stop = dft.Fb.values[0] df_std = dft[(dft['index_sequence'] >= start) & (dft['index_sequence'] <= stop)] std = df_std[feature].std() * multiplier df.loc[df.track_id == tid, f"std_{feature}"] = std From 567d19ab738eb0b9219181a8bfd91203470215b6 Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: 
Mon, 23 Sep 2024 11:39:26 -0700 Subject: [PATCH 10/68] linear model changes + label table updates --- .../linear_regression_workflow.py | 73 +++++++++++-------- .../linear_regression/scripts/added_volume.sh | 5 +- .../linear_regression/scripts/duration_BC.sh | 5 +- .../scripts/ending_volume.sh | 5 +- .../lib/visualization/label_tables.py | 22 +++--- 5 files changed, 63 insertions(+), 47 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index bbadf2f6..6dd19847 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -18,7 +18,7 @@ from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data from sklearn.model_selection import permutation_test_score from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric - +import imageio pd.options.mode.chained_assignment = None # default='warn' warnings.simplefilter(action="ignore", category=FutureWarning) @@ -57,6 +57,9 @@ def fit_linear_regression( all_coef_alpha = [] all_perms = {'score': [], 'perm_score_mean': [], 'perm_score_std': [], 'p_value': [], 'alpha': []} + import ipdb + ipdb.set_trace() + # find best alpha for Lasso model for alpha_ind, this_alpha in enumerate(alpha): print("fitting alpha", this_alpha) @@ -141,7 +144,9 @@ def fit_linear_regression( # Get test scores for all alpha all_test_sc = pd.concat(all_test_sc, axis=0).reset_index(drop=True) all_test_sc["Test MSE"] = -all_test_sc["Test MSE"] - all_test_sc.to_csv(save_path / "mse.csv") + save_path = save_path / Path(f"{target}_{alpha[alpha_ind - 1]}") + save_path.mkdir(parents=True, exist_ok=True) + all_test_sc.to_csv(save_path / f"mse.csv") # Get coeffs for all alpha all_coef_alpha = pd.concat(all_coef_alpha, axis=0).reset_index(drop=True) @@ -150,11 +155,11 @@ 
def fit_linear_regression( var_name="Column", value_name="Coefficient Importance", ).reset_index(drop=True) - all_coef_alpha.to_csv(save_path / "coefficients.csv") + all_coef_alpha.to_csv(save_path / f"coefficients.csv") # Get permutation scores and p values for all alpha all_perms = pd.DataFrame(all_perms).reset_index(drop=True) - all_perms.to_csv(save_path / "perm_scores.csv") + all_perms.to_csv(save_path / f"perm_scores.csv") # Save coefficient plot for max alpha value save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path) @@ -163,32 +168,40 @@ def fit_linear_regression( def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): - # subset to max alpha - max_alpha = all_coef_alpha['alpha'].max() - all_coef_alpha = all_coef_alpha.loc[all_coef_alpha['alpha'] == max_alpha].reset_index(drop=True) - all_test_sc = all_test_sc.loc[all_test_sc['alpha'] == max_alpha].reset_index(drop=True) - all_perms = all_perms.loc[all_perms['alpha'] == max_alpha].reset_index(drop=True) - - p_value = round(all_perms['p_value'].item(), 3) - test_r2_mean = round(all_test_sc['Test r$^2$'].mean(), 2) - test_r2_std = round(all_test_sc['Test r$^2$'].std()/2, 2) - - - g = sns.catplot( - data=all_coef_alpha, - x='Column', - y="Coefficient Importance", - kind="bar", - errorbar="sd", - aspect=1.5, - height=4, - ) - g.fig.subplots_adjust(top=0.8) # adjust the Figure in rp - g.fig.suptitle(f'p-value {p_value}, test r^2 {test_r2_mean}+-{test_r2_std}') - label_list = [get_plot_labels_for_metric(col)[1] for col in all_coef_alpha['Column'].unique()] - g.set_xticklabels(label_list, rotation=90) - print(f'Saving coefficients_{target}_alpha_{max_alpha}.png') - g.savefig(save_path / f'coefficients_{target}_alpha_{max_alpha}.png') + files = [] + for alpha in all_coef_alpha['alpha'].unique(): + all_coef_alpha = all_coef_alpha.loc[all_coef_alpha['alpha'] == alpha].reset_index(drop=True) + all_test_sc = all_test_sc.loc[all_test_sc['alpha'] == alpha].reset_index(drop=True) + 
all_perms = all_perms.loc[all_perms['alpha'] == alpha].reset_index(drop=True) + + p_value = round(all_perms['p_value'].item(), 3) + test_r2_mean = round(all_test_sc['Test r$^2$'].mean(), 2) + test_r2_std = round(all_test_sc['Test r$^2$'].std()/2, 2) + + + g = sns.catplot( + data=all_coef_alpha, + y='Column', + x="Coefficient Importance", + kind="bar", + errorbar="sd", + aspect=1.5, + height=5, + ) + g.fig.subplots_adjust(top=0.8) # adjust the Figure in rp + g.fig.suptitle(f'p-value {p_value}, test r^2 {test_r2_mean}+-{test_r2_std}') + label_list = [get_plot_labels_for_metric(col)[1] for col in all_coef_alpha['Column'].unique()] + g.set_yticklabels(label_list) + print(f'Saving coefficients_{target}_alpha_{max_alpha}.png') + this_path = str(save_path / Path(f'coefficients_{target}_alpha_{max_alpha}.png')) + files.append(this_path) + g.savefig(this_path) + + # save movie of pngs + writer = imageio.get_writer('test.mp4', fps=20) + for im in files: + writer.append_data(imageio.imread(im)) + writer.close() def list_of_strings(arg): return arg.split(",") diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh index 2b6da4d1..e3376001 100755 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh +++ b/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh @@ -1,5 +1,6 @@ python linear_regression_workflow.py \ ---cols 'volume_at_A','volume_at_B','time_at_A','time_at_B','colony_time_at_A','colony_time_at_B','SA_at_B' \ +--cols 'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony' \ --alpha_range 0,0.1,0.5,1,1.3,1.5,2,2.5,5,10,11,12,13 \ --target 'delta_volume_BC' \ 
---save_path "../../figures/" \ \ No newline at end of file +--save_path "../../figures/" \ +--tolerance 0.05 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh index ac7c53d7..fcb1163c 100755 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh +++ b/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh @@ -1,5 +1,6 @@ python linear_regression_workflow.py \ ---cols 'volume_at_A','volume_at_B','time_at_A','time_at_B','colony_time_at_A','colony_time_at_B','SA_at_B' \ +--cols 'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_volume','mean_mesh_sa','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_volume','std_mesh_sa','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony' \ --alpha_range 0,0.1,0.5,1,1.3,1.5,2,2.5,5,10 \ --target 'duration_BC' \ ---save_path "../../figures/" \ \ No newline at end of file +--save_path "../../figures/" \ +--tolerance 0.05 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh index bc098992..419bdfb1 100755 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh +++ b/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh @@ -1,5 +1,6 @@ python linear_regression_workflow.py \ ---cols 'volume_at_A','volume_at_B','time_at_A','time_at_B','colony_time_at_A','colony_time_at_B','SA_at_B' \ +--cols 
'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony' \ --alpha_range 0,0.1,0.5,1,1.3,1.5,2,2.5,5,10,11,12,13 \ --target 'volume_at_C' \ ---save_path "../../figures/" \ \ No newline at end of file +--save_path "../../figures/" \ +--tolerance 0.05 \ No newline at end of file diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index 7df604ac..b3ec7ee9 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -135,8 +135,8 @@ def get_scale_factor_table(dataset="all_baseline"): "time_at_B": "Movie Time at Transition", "time_at_C": "Movie Time at Breakdown", "colony_time_at_A": "Aligned Colony Time at Formation", - "colony_time_at_B": "Aligned Colony Time at Transition", - "colony_time_at_C": "Aligned Colony Time at Breakdown", + "colony_time_at_B": "Starting Aligned Colony Time", + "colony_time_at_C": "Ending Aligned Colony Time", "duration_AB": "Rapid Expansion Duration", ("duration_BC", "duration_BC_hr"): "Growth Duration", # Volume @@ -169,8 +169,8 @@ def get_scale_factor_table(dataset="all_baseline"): # Surface Area "mesh_sa": "Surface Area", "SA_at_A": "Surface Area at A", - "SA_at_B": "Surface Area at B", - "SA_at_C": "Surface Area at C", + "SA_at_B": "Starting Surface Area", + "SA_at_C": "Ending Surface Area", "delta_SA_BC": "\u0394Surface Area B to C", "SA_fold_change_BC": "Surface Area Fold-Change B to C", "SA_fold_change_fromB": "Surface Area Fold-Change", @@ -231,13 +231,13 @@ def get_scale_factor_table(dataset="all_baseline"): "mean_xy_aspect": "Mean XY aspect ratio during growth", "mean_SA_vol_ratio": "Mean SA/Volume ratio during growth", 
"mean_neighbor_avg_dxdt_48_volume_whole_colony": "Mean transient growth rate of whole colony during growth", - "std_volume": "Standard deviation volume during growth", - "std_height": "Standard deviation height during growth", - "std_density": "Standard deviation density during growth", - "std_mesh_sa": "Standard deviation surface area during growth", - "std_xy_aspect": "Standard deviation XY aspect ratio during growth", - "std_SA_vol_ratio": "Standard deviation SA/Volume ratio during growth", - "std_neighbor_avg_dxdt_48_volume_whole_colony": "Standard deviation transient growth rate of whole colony during growth", + "std_volume": "Stdev. volume during growth", + "std_height": "Stdev. height during growth", + "std_density": "Stdev. density during growth", + "std_mesh_sa": "Stdev. surface area during growth", + "std_xy_aspect": "Stdev. XY aspect ratio during growth", + "std_SA_vol_ratio": "Stdev. SA/Volume ratio during growth", + "std_neighbor_avg_dxdt_48_volume_whole_colony": "Stdev. transient growth rate of whole colony during growth", } From bfff7c2a3df7d722393677d23864b4452204ca26 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 23 Sep 2024 11:52:22 -0700 Subject: [PATCH 11/68] use label tables to get scale --- .../lib/preprocessing/global_dataset_filtering.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index b5f05172..950bd75d 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -17,6 +17,7 @@ add_fov_touch_timepoint_for_colonies, ) from nuc_morph_analysis.analyses.height.add_colony_time import add_colony_time_all_datasets +from nuc_morph_analysis.lib.visualization.label_tables import get_plot_labels_for_metric def load_dataset_with_features( @@ -254,7 +255,7 @@ def process_full_tracks(df_all, 
thresh, pix_size, interval): df_full = add_features.add_feature_at(df_full, "frame_transition", 'neighbor_avg_dxdt_48_volume_whole_colony', 'neighbor_avg_dxdt_48_volume_whole_colony') ft_list = ['height', 'density', 'volume', 'mesh_sa', 'xy_aspect', 'SA_vol_ratio', 'neighbor_avg_dxdt_48_volume_whole_colony'] - multiplier_list = [pix_size, 1 / pix_size**2, pix_size**3, pix_size**2, 1, 1, 1] + multiplier_list = [get_plot_labels_for_metric(x)[0] for x in ft_list] df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) df_full = add_features.add_std_feature_over_trajectory(df_full, ft_list, multiplier_list) From 36a0d6358fe8f8bff5fd64393e1a360afa501ea4 Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: Mon, 23 Sep 2024 12:08:41 -0700 Subject: [PATCH 12/68] remove stray pdb + make movie --- .../linear_regression_workflow.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index 6dd19847..d4e750f9 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -57,9 +57,6 @@ def fit_linear_regression( all_coef_alpha = [] all_perms = {'score': [], 'perm_score_mean': [], 'perm_score_std': [], 'p_value': [], 'alpha': []} - import ipdb - ipdb.set_trace() - # find best alpha for Lasso model for alpha_ind, this_alpha in enumerate(alpha): print("fitting alpha", this_alpha) @@ -168,19 +165,19 @@ def fit_linear_regression( def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): + xlim = None + ylim = None files = [] for alpha in all_coef_alpha['alpha'].unique(): - all_coef_alpha = all_coef_alpha.loc[all_coef_alpha['alpha'] == alpha].reset_index(drop=True) - all_test_sc = all_test_sc.loc[all_test_sc['alpha'] == 
alpha].reset_index(drop=True) - all_perms = all_perms.loc[all_perms['alpha'] == alpha].reset_index(drop=True) - - p_value = round(all_perms['p_value'].item(), 3) - test_r2_mean = round(all_test_sc['Test r$^2$'].mean(), 2) - test_r2_std = round(all_test_sc['Test r$^2$'].std()/2, 2) - + this_coef_alpha = all_coef_alpha.loc[all_coef_alpha['alpha'] == alpha].reset_index(drop=True) + this_test_sc = all_test_sc.loc[all_test_sc['alpha'] == alpha].reset_index(drop=True) + this_perms = all_perms.loc[all_perms['alpha'] == alpha].reset_index(drop=True) + p_value = round(this_perms['p_value'].item(), 3) + test_r2_mean = round(this_test_sc['Test r$^2$'].mean(), 2) + test_r2_std = round(this_test_sc['Test r$^2$'].std()/2, 2) g = sns.catplot( - data=all_coef_alpha, + data=this_coef_alpha, y='Column', x="Coefficient Importance", kind="bar", @@ -189,16 +186,20 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): height=5, ) g.fig.subplots_adjust(top=0.8) # adjust the Figure in rp - g.fig.suptitle(f'p-value {p_value}, test r^2 {test_r2_mean}+-{test_r2_std}') + g.fig.suptitle(f'alpha {alpha} test r^2 {test_r2_mean}+-{test_r2_std} p-value {p_value}') label_list = [get_plot_labels_for_metric(col)[1] for col in all_coef_alpha['Column'].unique()] g.set_yticklabels(label_list) - print(f'Saving coefficients_{target}_alpha_{max_alpha}.png') - this_path = str(save_path / Path(f'coefficients_{target}_alpha_{max_alpha}.png')) + print(f'Saving coefficients_{target}_alpha_{alpha}.png') + this_path = str(save_path / Path(f'coefficients_{target}_alpha_{alpha}.png')) files.append(this_path) + + if not xlim: + xlim = g.fig.axes[0].get_xlim() + g.set(xlim=xlim) g.savefig(this_path) # save movie of pngs - writer = imageio.get_writer('test.mp4', fps=20) + writer = imageio.get_writer(save_path / 'test.mp4', fps=2) for im in files: writer.append_data(imageio.imread(im)) writer.close() From 09dd626940232e5e341c3a4f9065571c8ca26fb0 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille 
Date: Mon, 23 Sep 2024 13:29:42 -0700 Subject: [PATCH 13/68] fix import, add function --- .../lib/preprocessing/global_dataset_filtering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 950bd75d..c63c5963 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -17,7 +17,7 @@ add_fov_touch_timepoint_for_colonies, ) from nuc_morph_analysis.analyses.height.add_colony_time import add_colony_time_all_datasets -from nuc_morph_analysis.lib.visualization.label_tables import get_plot_labels_for_metric +from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric def load_dataset_with_features( @@ -252,7 +252,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_feature_at(df_full, "frame_transition", 'xy_aspect', 'xy_aspect') df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') - df_full = add_features.add_feature_at(df_full, "frame_transition", 'neighbor_avg_dxdt_48_volume_whole_colony', 'neighbor_avg_dxdt_48_volume_whole_colony') + df_full = add_features.get_early_transient_gr_of_whole_colony(df_full) ft_list = ['height', 'density', 'volume', 'mesh_sa', 'xy_aspect', 'SA_vol_ratio', 'neighbor_avg_dxdt_48_volume_whole_colony'] multiplier_list = [get_plot_labels_for_metric(x)[0] for x in ft_list] From 5a71cf6b3d564ee30dc6660d9cab856aef8d6e6e Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 23 Sep 2024 13:49:15 -0700 Subject: [PATCH 14/68] get_early_transient_gr_of_whole_colony --- .../lib/preprocessing/add_features.py | 30 +++++++++++++++++++ .../preprocessing/global_dataset_filtering.py | 2 +- 
.../lib/visualization/label_tables.py | 2 +- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 56bf9903..802927b4 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -169,6 +169,36 @@ def add_mean_feature_over_trajectory(df, feature_list, multiplier_list): return df +def get_early_transient_gr_of_whole_colony(df, scale, time_shift=25): + """ + Get the transient growth rate of the colony 2 hours into the growth trajectory. + + This time shift of two hours into the growth trajectory is necessary because the metric + is calculated as the average of a 4 hour rolling window. The middle of a four hour window + does not occur until two hours into the timelapse. To calculate this feature equivalently + for each trajectory, two hours was used for all tracks to get a metric for the transient + growth rate of the colony early in the growth trajectory. 
+ + Parameters + ---------- + df : DataFrame + The dataframe + time_shift : int + The time shift in frames to calculate the transient growth rate in frames + + Returns + ------- + df : DataFrame + The dataframe with the added transient growth rate feature columns + """ + for tid, dft in df.groupby("track_id"): + t_calculate = dft.index_sequence.min() + time_shift + transient_gr_whole_colony = df.loc[df.index_sequence == t_calculate, "neighbor_avg_dxdt_48_volume_whole_colony"].values[0] + df.loc[df.track_id == tid, "early_transient_gr_whole_colony"] = transient_gr_whole_colony * scale + + return df + + def add_std_feature_over_trajectory(df, feature_list, multiplier_list): """ Add the standard deviation of a given feature over the growth trajectory diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index c63c5963..b03b9bde 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -252,7 +252,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_feature_at(df_full, "frame_transition", 'xy_aspect', 'xy_aspect') df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') - df_full = add_features.get_early_transient_gr_of_whole_colony(df_full) + df_full = add_features.get_early_transient_gr_of_whole_colony(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_whole_colony')[0]) ft_list = ['height', 'density', 'volume', 'mesh_sa', 'xy_aspect', 'SA_vol_ratio', 'neighbor_avg_dxdt_48_volume_whole_colony'] multiplier_list = [get_plot_labels_for_metric(x)[0] for x in ft_list] diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index 
b3ec7ee9..9c2f6281 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -223,7 +223,7 @@ def get_scale_factor_table(dataset="all_baseline"): "density_at_B": "Density at start of growth", "xy_aspect_at_B": "XY aspect ratio at start of growth", "SA_vol_ratio_at_B": "SA/Volume ratio at start of growth", - "neighbor_avg_dxdt_48_volume_whole_colony_at_B": "Transient growth rate of whole colony at start of growth", + "early_transient_gr_whole_colony": "Transient growth rate of whole colony around start of growth", "mean_volume": "Mean volume during growth", "mean_height": "Mean height during growth", "mean_density": "Mean density during growth", From 2beea44c5349808b3524f7c45fd9a3de3d0b84d2 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 23 Sep 2024 17:09:23 -0700 Subject: [PATCH 15/68] add lineage features for lrm --- .../lib/preprocessing/add_features.py | 57 +++++++++++++++++++ .../preprocessing/global_dataset_filtering.py | 2 + 2 files changed, 59 insertions(+) diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 802927b4..d2c3b8d0 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -476,3 +476,60 @@ def add_non_interphase_size_shape_flag(df): [f"non_interphase_{filter_key}" for filter_key in NON_INTERPHASE_FILTER_THRESHOLDS.keys()] ].any(axis=1) return df + +def get_sister(df, pid, current_tid): + """ + Gets the track_id of the sibling + + Parameters + ---------- + df: Dataframe + The dataset dataframe + track_id: int + The track_id of the cell + + Returns + ------- + sister_id: List + List containing the track_id of the sibling cell + """ + df_sisters = df.loc[df.parent_id == pid] + tids = df_sisters.track_id.unique() + sister_id = [tid for tid in tids if tid != current_tid] + return sister_id + +def add_lineage_features(df, 
feature_list): + """ + If the full track has a full track sister or mother, add the given relative's feature as a single track feature column in the dataframe. + + Paramaters + ---------- + df: DataFrame + The dataframe + feature_list: list + List of column names + + Returns + ------- + df: DataFrame + The dataframe with new columns (ie mothers_vol_at_B, sisters_duration) + """ + + for feature in feature_list: + df[f"mothers_{feature}"] = np.nan + df[f"sisters_{feature}"] = np.nan + + df_lineage = df[df['colony'].isin(['small', 'medium'])] + + for tid, dft in df_lineage.groupby("track_id"): + parent_id = dft.parent_id.values[0] + if parent_id != -1 and parent_id in df_lineage.track_id.unique(): + for feature in feature_list: + df.loc[df.track_id == tid, f"mothers_{feature}"] = df_lineage.loc[df_lineage.track_id == parent_id, feature].values[0] + if parent_id != -1: + sister_id = get_sister(df_lineage, parent_id, tid) + if len(sister_id) > 0: + for feature in feature_list: + df.loc[df.track_id == tid, f"sisters_{feature}"] = df_lineage.loc[df_lineage.track_id == sister_id[0], feature].values[0] + + return df \ No newline at end of file diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index b03b9bde..e0c812f7 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -247,6 +247,8 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_growth_features.fit_tracks_to_time_powerlaw(df_full, "volume", interval) # For LRM + df_full = add_features.add_lineage_features(df_full, feature_list=['volume_at_B', 'duration_BC']) + df_full = add_features.add_feature_at(df_full, "frame_transition", 'height', 'height_percentile', pix_size) df_full = add_features.add_feature_at(df_full, "frame_transition", 'density', 'density', pix_size) df_full = 
add_features.add_feature_at(df_full, "frame_transition", 'xy_aspect', 'xy_aspect') From df9e859f6a98c1c142cbd661c6a46ac9a51a7144 Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: Tue, 24 Sep 2024 08:22:38 -0700 Subject: [PATCH 16/68] add sisters + reorient plots --- .../linear_regression_workflow.py | 106 +++++++++++------- .../linear_regression/scripts/added_volume.sh | 7 +- .../scripts/added_volume_sisters.sh | 5 + .../linear_regression/scripts/duration_BC.sh | 7 +- .../scripts/duration_BC_sisters.sh | 5 + .../scripts/ending_volume.sh | 5 +- .../scripts/ending_volume_sisters.sh | 5 + 7 files changed, 90 insertions(+), 50 deletions(-) create mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts/added_volume_sisters.sh create mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC_sisters.sh create mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume_sisters.sh diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index d4e750f9..4a58f15d 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -17,12 +17,16 @@ from tqdm import tqdm from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data from sklearn.model_selection import permutation_test_score -from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric +from nuc_morph_analysis.lib.visualization.plotting_tools import ( + get_plot_labels_for_metric, +) import imageio + pd.options.mode.chained_assignment = None # default='warn' warnings.simplefilter(action="ignore", category=FutureWarning) + def main(cols, target, alpha_range, tolerance, save_path, cached_dataframe=None): save_path = Path(save_path) @@ -36,11 +40,12 @@ def main(cols, target, alpha_range, tolerance, 
save_path, cached_dataframe=None) else: df_track_level_features = pd.read_csv(cached_dataframe) - fit_linear_regression(df_track_level_features, cols, target, alpha_range, tolerance, save_path) + fit_linear_regression( + df_track_level_features, cols, target, alpha_range, tolerance, save_path + ) -def fit_linear_regression( - data, cols, target, alpha, tol, save_path -): + +def fit_linear_regression(data, cols, target, alpha, tol, save_path): """ data - track level features cols - input features @@ -55,7 +60,17 @@ def fit_linear_regression( # init empty dicts and lists all_test_sc = [] all_coef_alpha = [] - all_perms = {'score': [], 'perm_score_mean': [], 'perm_score_std': [], 'p_value': [], 'alpha': []} + all_perms = { + "score": [], + "perm_score_mean": [], + "perm_score_std": [], + "p_value": [], + "alpha": [], + } + + # remove 0 alpha due to convergence errors + alpha = [i for i in alpha if i != 0] + alpha = [round(i, 1) for i in alpha] # find best alpha for Lasso model for alpha_ind, this_alpha in enumerate(alpha): @@ -80,7 +95,12 @@ def fit_linear_regression( # run permutation test score, permutation_scores, pvalue = permutation_test_score( - model, all_input, all_target, random_state=random_state, cv=5, n_permutations=500, + model, + all_input, + all_target, + random_state=random_state, + cv=5, + n_permutations=500, ) # break if permutation score is less than linear regression value (max possible) @@ -94,11 +114,11 @@ def fit_linear_regression( # if relatively equal to linear regression value, then continue # save permutation score and p_value to dictionary - all_perms['score'].append(score) - all_perms['perm_score_mean'].append(permutation_scores.mean()) - all_perms['perm_score_std'].append(permutation_scores.std()) - all_perms['p_value'].append(pvalue) - all_perms['alpha'].append(this_alpha) + all_perms["score"].append(score) + all_perms["perm_score_mean"].append(permutation_scores.mean()) + all_perms["perm_score_std"].append(permutation_scores.std()) + 
all_perms["p_value"].append(pvalue) + all_perms["alpha"].append(this_alpha) # run cross validate to get model coefficients cv_model = cross_validate( @@ -121,9 +141,7 @@ def fit_linear_regression( # Save test r^2 and test MSE to dataframe range_test_scores = [round(i, 2) for i in cv_model["test_r2"]] - range_errors = [ - round(i, 2) for i in cv_model["test_neg_mean_squared_error"] - ] + range_errors = [round(i, 2) for i in cv_model["test_neg_mean_squared_error"]] test_sc = pd.DataFrame() test_sc[r"Test r$^2$"] = range_test_scores test_sc["Test MSE"] = range_errors @@ -141,9 +159,9 @@ def fit_linear_regression( # Get test scores for all alpha all_test_sc = pd.concat(all_test_sc, axis=0).reset_index(drop=True) all_test_sc["Test MSE"] = -all_test_sc["Test MSE"] - save_path = save_path / Path(f"{target}_{alpha[alpha_ind - 1]}") + save_path = save_path / Path(f"{target}") save_path.mkdir(parents=True, exist_ok=True) - all_test_sc.to_csv(save_path / f"mse.csv") + all_test_sc.to_csv(save_path / "mse.csv") # Get coeffs for all alpha all_coef_alpha = pd.concat(all_coef_alpha, axis=0).reset_index(drop=True) @@ -152,45 +170,54 @@ def fit_linear_regression( var_name="Column", value_name="Coefficient Importance", ).reset_index(drop=True) - all_coef_alpha.to_csv(save_path / f"coefficients.csv") + all_coef_alpha.to_csv(save_path / "coefficients.csv") # Get permutation scores and p values for all alpha all_perms = pd.DataFrame(all_perms).reset_index(drop=True) - all_perms.to_csv(save_path / f"perm_scores.csv") + all_perms.to_csv(save_path / "perm_scores.csv") # Save coefficient plot for max alpha value save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path) return all_coef_alpha, all_test_sc, all_perms + def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): xlim = None - ylim = None files = [] - for alpha in all_coef_alpha['alpha'].unique(): - this_coef_alpha = all_coef_alpha.loc[all_coef_alpha['alpha'] == alpha].reset_index(drop=True) - 
this_test_sc = all_test_sc.loc[all_test_sc['alpha'] == alpha].reset_index(drop=True) - this_perms = all_perms.loc[all_perms['alpha'] == alpha].reset_index(drop=True) - p_value = round(this_perms['p_value'].item(), 3) - test_r2_mean = round(this_test_sc['Test r$^2$'].mean(), 2) - test_r2_std = round(this_test_sc['Test r$^2$'].std()/2, 2) + for alpha in all_coef_alpha["alpha"].unique(): + this_coef_alpha = all_coef_alpha.loc[ + all_coef_alpha["alpha"] == alpha + ].reset_index(drop=True) + this_test_sc = all_test_sc.loc[all_test_sc["alpha"] == alpha].reset_index( + drop=True + ) + this_perms = all_perms.loc[all_perms["alpha"] == alpha].reset_index(drop=True) + p_value = round(this_perms["p_value"].item(), 3) + test_r2_mean = round(this_test_sc["Test r$^2$"].mean(), 2) + test_r2_std = round(this_test_sc["Test r$^2$"].std() / 2, 2) g = sns.catplot( data=this_coef_alpha, - y='Column', + y="Column", x="Coefficient Importance", kind="bar", errorbar="sd", - aspect=1.5, - height=5, + aspect=2, + height=6, ) - g.fig.subplots_adjust(top=0.8) # adjust the Figure in rp - g.fig.suptitle(f'alpha {alpha} test r^2 {test_r2_mean}+-{test_r2_std} p-value {p_value}') - label_list = [get_plot_labels_for_metric(col)[1] for col in all_coef_alpha['Column'].unique()] + g.fig.subplots_adjust(top=0.8) # adjust the Figure in rp + g.fig.suptitle( + f"alpha {alpha} test r^2 {test_r2_mean}+-{test_r2_std} p-value {p_value}" + ) + label_list = [ + get_plot_labels_for_metric(col)[1] + for col in all_coef_alpha["Column"].unique() + ] g.set_yticklabels(label_list) - print(f'Saving coefficients_{target}_alpha_{alpha}.png') - this_path = str(save_path / Path(f'coefficients_{target}_alpha_{alpha}.png')) + print(f"Saving coefficients_{target}_alpha_{alpha}.png") + this_path = str(save_path / Path(f"coefficients_{target}_alpha_{alpha}.png")) files.append(this_path) if not xlim: @@ -199,11 +226,12 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): g.savefig(this_path) # save movie 
of pngs - writer = imageio.get_writer(save_path / 'test.mp4', fps=2) + writer = imageio.get_writer(save_path / "coefficients_over_time.mp4", fps=2) for im in files: writer.append_data(imageio.imread(im)) writer.close() + def list_of_strings(arg): return arg.split(",") @@ -236,7 +264,7 @@ def str2bool(v): parser.add_argument( "--cols", type=list_of_strings, - default=['volume_at_B', 'time_at_B', 'colony_time_at_B', 'SA_at_B'], + default=["volume_at_B", "time_at_B", "colony_time_at_B", "SA_at_B"], help="Supply a list of column names to use as independent variables in the linear regression analysis.", ) parser.add_argument( @@ -248,7 +276,7 @@ def str2bool(v): parser.add_argument( "--alpha_range", type=list_of_floats, - default=[1.3], + default=np.arange(0, 15, 0.1, dtype=float), help="Supply a list of alpha values to use in lasso regression", ) parser.add_argument( @@ -270,5 +298,5 @@ def str2bool(v): alpha_range=args.alpha_range, tolerance=args.tolerance, save_path=args.save_path, - cached_dataframe=args.cached_dataframe + cached_dataframe=args.cached_dataframe, ) diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh index e3376001..d1b40635 100755 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh +++ b/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh @@ -1,6 +1,5 @@ python linear_regression_workflow.py \ ---cols 'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony' \ ---alpha_range 0,0.1,0.5,1,1.3,1.5,2,2.5,5,10,11,12,13 \ +--cols 
'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ --target 'delta_volume_BC' \ ---save_path "../../figures/" \ ---tolerance 0.05 \ No newline at end of file +--save_path "./" \ +--tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume_sisters.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume_sisters.sh new file mode 100755 index 00000000..3e56e4bb --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume_sisters.sh @@ -0,0 +1,5 @@ +python linear_regression_workflow.py \ +--cols 'volume_at_B','mothers_volume_at_B','sisters_volume_at_B','mothers_duration_BC','sisters_duration_BC','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ +--target 'delta_volume_BC' \ +--save_path "./" \ +--tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh index fcb1163c..bd012bf8 100755 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh +++ b/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh @@ -1,6 +1,5 @@ python linear_regression_workflow.py \ ---cols 
'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_volume','mean_mesh_sa','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_volume','std_mesh_sa','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony' \ ---alpha_range 0,0.1,0.5,1,1.3,1.5,2,2.5,5,10 \ +--cols 'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_volume','mean_mesh_sa','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_volume','std_mesh_sa','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ --target 'duration_BC' \ ---save_path "../../figures/" \ ---tolerance 0.05 \ No newline at end of file +--save_path "./" \ +--tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC_sisters.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC_sisters.sh new file mode 100755 index 00000000..42b7c704 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC_sisters.sh @@ -0,0 +1,5 @@ +python linear_regression_workflow.py \ +--cols 'volume_at_B','mothers_volume_at_B','sisters_volume_at_B','mothers_duration_BC','sisters_duration_BC','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_volume','mean_mesh_sa','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_volume','std_mesh_sa','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ +--target 'duration_BC' \ +--save_path "./" \ +--tolerance 0.08 \ No newline at 
end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh index 419bdfb1..97e76194 100755 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh +++ b/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh @@ -1,6 +1,5 @@ python linear_regression_workflow.py \ ---cols 'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony' \ ---alpha_range 0,0.1,0.5,1,1.3,1.5,2,2.5,5,10,11,12,13 \ +--cols 'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ --target 'volume_at_C' \ ---save_path "../../figures/" \ +--save_path "./" \ --tolerance 0.05 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume_sisters.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume_sisters.sh new file mode 100755 index 00000000..01d44aed --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume_sisters.sh @@ -0,0 +1,5 @@ +python linear_regression_workflow.py \ +--cols 
'volume_at_B','mothers_volume_at_B','sisters_volume_at_B','mothers_duration_BC','sisters_duration_BC','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ +--target 'volume_at_C' \ +--save_path "./" \ +--tolerance 0.05 \ No newline at end of file From 140641cadbfc28097d8a771deb239ec49345ea4b Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 24 Sep 2024 16:18:43 -0700 Subject: [PATCH 17/68] update figure formatting + label tables --- .../linear_regression_workflow.py | 6 +-- .../lib/visualization/label_tables.py | 50 ++++++++++--------- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index 4a58f15d..f3c000a0 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -205,11 +205,11 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): kind="bar", errorbar="sd", aspect=2, - height=6, + height=10, ) - g.fig.subplots_adjust(top=0.8) # adjust the Figure in rp + g.fig.subplots_adjust(top=.9) # adjust the Figure in rp g.fig.suptitle( - f"alpha {alpha} test r^2 {test_r2_mean}+-{test_r2_std} p-value {p_value}" + f"Prediction of {get_plot_labels_for_metric(target)[1]}\nalpha={alpha}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" ) label_list = [ get_plot_labels_for_metric(col)[1] diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index ddf865bf..fcebbdfc 100644 --- 
a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -132,7 +132,7 @@ def get_scale_factor_table(dataset="all_baseline"): ("frame_inflection", "frame_transition"): "Transtion", ("Fb", "frame_breakdown"): "Breakdown", "time_at_A": "Time at Formation", - "time_at_B": "Movie Time at Transition", + "time_at_B": "Starting movie time", "time_at_C": "Movie Time at Breakdown", "colony_time_at_A": "Aligned Colony Time at Formation", "colony_time_at_B": "Starting Aligned Colony Time", @@ -142,7 +142,7 @@ def get_scale_factor_table(dataset="all_baseline"): # Volume "volume": "Volume", "volume_at_A": "Volume at Formation", - "volume_at_B": "Starting Volume", + "volume_at_B": "Starting volume", "volume_at_C": "Ending Volume", "Volume_C": "Volume at Breakdown", "volume_fold_change_BC": "Volume Fold-Change", @@ -164,7 +164,7 @@ def get_scale_factor_table(dataset="all_baseline"): "height": "Height", "avg_height": "Average Height", "height_fold_change_BC": "Growth Height Fold-Change", - "height_at_B": "Starting Height", + "height_at_B": "Starting height", "height_at_C": "Ending Height", # Surface Area "mesh_sa": "Surface Area", @@ -186,7 +186,7 @@ def get_scale_factor_table(dataset="all_baseline"): "zy_aspect": "YZ Aspect Ratio", "xz_aspect_fold_change_BC": "XZ Aspect Ratio Fold-Change B to C", "avg_xz_aspect_ratio": "Average XZ Aspect Ratio", - "xz_aspect_at_B": "XZ Aspect Ratio at B", + "xz_aspect_at_B": "Starting XZ aspect ratio", # Colony Position "distance": "Distance", "distance_from_centroid": "Distance From Centroid", @@ -206,6 +206,8 @@ def get_scale_factor_table(dataset="all_baseline"): # Lineage "parent_id": "Parent ID", "family_id": "Family ID", + "sisters_volume_at_B": "Sisters starting volume", + "sisters_duration_BC": "Sisters growth duration", # Flags "is_outlier": "Outlier Flag", "is_tp_outlier": "Single Timepoint Outlier", @@ -219,29 +221,31 @@ def get_scale_factor_table(dataset="all_baseline"): 
"nucleus_colony_area_ratio": "ratio of nuclear area to colony area", "seg_twoD_zMIP_area": "total projected nuclear area", # LRM feats - "height_at_B": "Height at start of growth", - "density_at_B": "Density at start of growth", - "xy_aspect_at_B": "XY aspect ratio at start of growth", - "SA_vol_ratio_at_B": "SA/Volume ratio at start of growth", - "early_transient_gr_whole_colony": "Transient growth rate of whole colony around start of growth", - "mean_volume": "Mean volume during growth", - "mean_height": "Mean height during growth", - "mean_density": "Mean density during growth", - "mean_mesh_sa": "Mean surface area during growth", - "mean_xy_aspect": "Mean XY aspect ratio during growth", - "mean_SA_vol_ratio": "Mean SA/Volume ratio during growth", - "mean_neighbor_avg_dxdt_48_volume_whole_colony": "Mean transient growth rate of whole colony during growth", - "std_volume": "Stdev. volume during growth", - "std_height": "Stdev. height during growth", - "std_density": "Stdev. density during growth", - "std_mesh_sa": "Stdev. surface area during growth", - "std_xy_aspect": "Stdev. XY aspect ratio during growth", - "std_SA_vol_ratio": "Stdev. SA/Volume ratio during growth", - "std_neighbor_avg_dxdt_48_volume_whole_colony": "Stdev. transient growth rate of whole colony during growth", + "height_at_B": "Starting height", + "density_at_B": "Starting density", + "xy_aspect_at_B": "Starting XY aspect ratio", + "SA_vol_ratio_at_B": "Starting SA/Volume ratio", + "early_transient_gr_whole_colony": "~Starting transient growth rate of whole colony", + "mean_volume": "Mean volume", + "mean_height": "Mean height", + "mean_density": "Mean density", + "mean_mesh_sa": "Mean surface area", + "mean_xy_aspect": "Mean XY aspect ratio", + "mean_SA_vol_ratio": "Mean SA/Volume ratio", + "mean_neighbor_avg_dxdt_48_volume_whole_colony": "Mean transient growth rate of whole colony", + "std_volume": "Stdev. volume", + "std_height": "Stdev. height", + "std_density": "Stdev. 
density", + "std_mesh_sa": "Stdev. surface area", + "std_xy_aspect": "Stdev. XY aspect ratio", + "std_SA_vol_ratio": "Stdev. SA/Volume ratio", + "std_neighbor_avg_dxdt_48_volume_whole_colony": "Stdev. transient growth rate of whole colony", # mitotic and apoptotic neighbor columns "number_of_frame_of_breakdown_neighbors": "# of neighboring cells undergoing breakdown", "number_of_frame_of_formation_neighbors": "# of neighboring cells undergoing formation", "number_of_frame_of_death_neighbors": "# of neighboring cells undergoing death", + "sum_has_mitotic_neighbor": "# mitotic neighbors", + "sum_has_dying_neighbor": "# dying neighbors", } From 3ae21bd4fd65c0f1ee62b7e3e8d395d5c9e1d008 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 25 Sep 2024 21:15:26 -0700 Subject: [PATCH 18/68] update_plot_axis --- .../analyses/linear_regression/linear_regression_workflow.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index f3c000a0..049a9aec 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -207,6 +207,9 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): aspect=2, height=10, ) + + g.set(ylabel='') + g.fig.subplots_adjust(top=.9) # adjust the Figure in rp g.fig.suptitle( f"Prediction of {get_plot_labels_for_metric(target)[1]}\nalpha={alpha}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" @@ -223,7 +226,7 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): if not xlim: xlim = g.fig.axes[0].get_xlim() g.set(xlim=xlim) - g.savefig(this_path) + g.savefig(this_path, dpi=300) # save movie of pngs writer = imageio.get_writer(save_path / "coefficients_over_time.mp4", fps=2) From 
8cb3d395a0d72091a592fbac38d61c57ead9ddbe Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 25 Sep 2024 21:15:52 -0700 Subject: [PATCH 19/68] update label tables --- .../lib/visualization/label_tables.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index fcebbdfc..2966a9c5 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -240,12 +240,24 @@ def get_scale_factor_table(dataset="all_baseline"): "std_xy_aspect": "Stdev. XY aspect ratio", "std_SA_vol_ratio": "Stdev. SA/Volume ratio", "std_neighbor_avg_dxdt_48_volume_whole_colony": "Stdev. transient growth rate of whole colony", + 'neighbor_avg_lrm_volume_90um_at_B': "Starting avg. volume in 90um radius", + 'neighbor_avg_lrm_height_90um_at_B': "Starting avg. height in 90um radius", + 'neighbor_avg_lrm_density_90um_at_B': "Starting avg. density in 90um radius", + 'neighbor_avg_lrm_xy_aspect_90um_at_B': "Starting avg. XY aspect ratio in 90um radius", + 'neighbor_avg_lrm_mesh_sa_90um_at_B': "Starting avg. surface area in 90um radius", + + 'mean_neighbor_avg_lrm_volume_90um': " Avg. mean volume in 90um radius", + 'mean_neighbor_avg_lrm_height_90um': "Avg. mean height in 90um radius", + 'mean_neighbor_avg_lrm_density_90um': "Avg. mean density in 90um radius", + 'mean_neighbor_avg_lrm_xy_aspect_90um': "Avg. mean XY aspect ratio in 90um radius", + 'mean_neighbor_avg_lrm_mesh_sa_90um': "Avg. 
mean surface area in 90um radius", + # mitotic and apoptotic neighbor columns "number_of_frame_of_breakdown_neighbors": "# of neighboring cells undergoing breakdown", "number_of_frame_of_formation_neighbors": "# of neighboring cells undergoing formation", "number_of_frame_of_death_neighbors": "# of neighboring cells undergoing death", - "sum_has_mitotic_neighbor": "# mitotic neighbors", - "sum_has_dying_neighbor": "# dying neighbors", + "sum_has_mitotic_neighbor": "Sum of mitotic neighbors", + "sum_has_dying_neighbor": "Sum of dying neighbors", } From 2202666978f05f0e49e6bdb2817ac70ba1067048 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 25 Sep 2024 22:09:54 -0700 Subject: [PATCH 20/68] add extrinsic neighborhood feats --- .../preprocessing/global_dataset_filtering.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index fd7bfcc4..9d07c754 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -9,6 +9,7 @@ is_tp_outlier, add_features, add_neighborhood_avg_features, + add_neighborhood_avg_features_lrm, compute_change_over_time, ) from nuc_morph_analysis.analyses.volume import add_growth_features @@ -199,6 +200,9 @@ def process_all_tracks(df, dataset, remove_growth_outliers, num_workers): df = add_features.add_non_interphase_size_shape_flag(df) df = add_change_over_time(df) df = add_neighborhood_avg_features.run_script(df, num_workers=num_workers) + df = add_neighborhood_avg_features_lrm.run_script(df, num_workers=num_workers, + feature_list=["volume", "height", "density", "xy_aspect", "mesh_sa"], + exclude_outliers=False) if dataset == "all_baseline": df = add_colony_time_all_datasets(df) @@ -261,17 +265,23 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = 
add_features.add_feature_at(df_full, "frame_transition", 'height', 'height_percentile', pix_size) df_full = add_features.add_feature_at(df_full, "frame_transition", 'density', 'density', pix_size) - df_full = add_features.add_feature_at(df_full, "frame_transition", 'xy_aspect', 'xy_aspect') - df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') - df_full = add_features.add_feature_at(df_full, "frame_transition", 'SA_vol_ratio', 'SA_vol_ratio') + for feature in ['xy_aspect', 'SA_vol_ratio', 'neighbor_avg_lrm_volume_90um', 'neighbor_avg_lrm_height_90um', + 'neighbor_avg_lrm_density_90um','neighbor_avg_lrm_xy_aspect_90um','neighbor_avg_lrm_mesh_sa_90um']: + df_full = add_features.add_feature_at(df_full, "frame_transition", feature, feature) + df_full = add_features.get_early_transient_gr_of_whole_colony(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_whole_colony')[0]) + df_full = add_features.sum_mitotic_events_along_full_track(df_full) - ft_list = ['height', 'density', 'volume', 'mesh_sa', 'xy_aspect', 'SA_vol_ratio', 'neighbor_avg_dxdt_48_volume_whole_colony'] + ft_list = ['neighbor_avg_dxdt_48_volume_whole_colony', + 'neighbor_avg_lrm_volume_90um', + 'neighbor_avg_lrm_height_90um', + 'neighbor_avg_lrm_density_90um', + 'neighbor_avg_lrm_xy_aspect_90um', + 'neighbor_avg_lrm_mesh_sa_90um'] multiplier_list = [get_plot_labels_for_metric(x)[0] for x in ft_list] df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) - df_full = add_features.add_std_feature_over_trajectory(df_full, ft_list, multiplier_list) - - df_full = add_features.sum_mitotic_events_along_full_track(df_full) + for feat in ft_list: + df_full = add_features.add_feature_at(df_full, "frame_transition", feat, feat) # Add flag for use after merging back to main manifest df_full = add_features.add_full_track_flag(df_full) From 656b292b6f53fd7dfdd51f9738e476f235add387 Mon Sep 17 00:00:00 2001 From: 
Chantelle Leveille Date: Wed, 25 Sep 2024 22:10:16 -0700 Subject: [PATCH 21/68] add neighborhood feats for lrm --- .../add_neighborhood_avg_features_lrm.py | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 nuc_morph_analysis/lib/preprocessing/add_neighborhood_avg_features_lrm.py diff --git a/nuc_morph_analysis/lib/preprocessing/add_neighborhood_avg_features_lrm.py b/nuc_morph_analysis/lib/preprocessing/add_neighborhood_avg_features_lrm.py new file mode 100644 index 00000000..36525e5f --- /dev/null +++ b/nuc_morph_analysis/lib/preprocessing/add_neighborhood_avg_features_lrm.py @@ -0,0 +1,168 @@ +# %% +import pandas as pd +from tqdm import tqdm +import numpy as np +from multiprocessing import Pool, cpu_count +from nuc_morph_analysis.lib.preprocessing.load_data import get_dataset_pixel_size +from nuc_morph_analysis.lib.preprocessing import filter_data +from nuc_morph_analysis.lib.preprocessing.filter_data import all_timepoints_minimal_filtering + +LOCAL_RADIUS_LIST = [90, -1] +LOCAL_RADIUS_STR_LIST = ["90um", "whole_colony"] +NEIGHBOR_FEATURE_LIST = ["volume"] +NEIGHBOR_PREFIX = "neighbor_avg_lrm_" + +def run_in_parallel(args): + return get_neighbor_avg_at_t(*args) + + +def get_neighbor_avg_at_t(dft, local_radius_list, columns, min_neighbor_thresh=5): + """ + Compute the average value of the columns for each CellId at a given timepoint within a given colony + by finding its neighbors within a given radius=local_radius + + Parameters + ---------- + dft : pd.DataFrame + dataframe that minimally has the following columns: + ['index_sequence','track_id','centroid_x','centroid_y'] + with index = "CellId" + local_radius_list : list + list of integers that represent the radius in microns to use for finding neighbors + -1 signifies the whole colony + columns : list + list of column names to compute the average value for + min_neighbor_thresh : int + minimum number of neighbors required to compute the average value + + Returns + ------- + dft : 
pd.DataFrame + dataframe with the average value of the columns for each CellId at a given timepoint within a given colony + """ + + # now get the centroid values for all CellIds at that index sequence + centroid_xy = dft[["centroid_x", "centroid_y"]].values + # now compute pair-wise distances between all centroid_xy values + dists = np.linalg.norm(centroid_xy[:, None] - centroid_xy[None], axis=2) + + # initialize the new columns to be added to the dataframe + # to avoid the PerformanceWarning: DataFrame is highly fragmented occurring in the last line of this function + data = {} + for li, local_radius in enumerate(local_radius_list): + local_radius_str = LOCAL_RADIUS_STR_LIST[li] + for column in columns: + data[f"{NEIGHBOR_PREFIX}{column}_{local_radius_str}"] = np.nan + + dftnew = pd.DataFrame(data, index=dft.index) + + for li, local_radius in enumerate(local_radius_list): + local_radius_str = LOCAL_RADIUS_STR_LIST[li] + # now for each CellId find the neighbors within a circle with radius = radius + pix_size = get_dataset_pixel_size(dft.colony.values[0]) + radius = local_radius / pix_size # pixels + if local_radius < 0: + neighbors = np.ones(dists.shape, dtype=bool) + else: + neighbors = dists < radius + + # don't allow "self" to be a neighbor + np.fill_diagonal(neighbors, False) + + for column in columns: + col_vals1d = dft[column].values + col_vals2d = np.tile(col_vals1d, (col_vals1d.size, 1)).astype(float) + # now to make the diagonal values nan so that the cell of interest is not included in the average, but only the neighbors + col_vals2d[~neighbors] = np.nan + + # some rows could be all nan + # so we need to track the indices of the rows that are not all nan + # and only use those indices to update the dataframe + # this is important for avoiding an annoying RuntimeWarning: Mean of empty slice + non_nan_neighbors_per_row = np.sum(~np.isnan(col_vals2d), axis=1) + + non_nan_rows = non_nan_neighbors_per_row >= min_neighbor_thresh + col_vals2d = 
col_vals2d[non_nan_rows] + + # now compute the average of the column values for each row (average of the neighbors) + # Compute the average value of the column for each neighborhood + col_vals_avg = np.nanmean(col_vals2d, axis=1) + + # Add the average values to the dataframe + # dft.loc[non_nan_rows,f'{prefix}{column}_{local_radius_str}'] = col_vals_avg + dftnew.loc[ + dft.index.values[non_nan_rows], f"{NEIGHBOR_PREFIX}{column}_{local_radius_str}" + ] = col_vals_avg + dft = dft.join(dftnew) + return dft + + +def run_script( + df, + num_workers=1, + feature_list=NEIGHBOR_FEATURE_LIST, + local_radius_list=LOCAL_RADIUS_LIST, + exclude_outliers=True, +): + """ + Determine average values of features within neighborhoods of defined radius around each cell + + Parameters + ---------- + df : pd.DataFrame + dataframe with index=CellId, and the following columns minimally: + ['index_sequence','track_id','centroid_x','centroid_y'] + feature_list + [x for x in df.columns if 'dxdt' in x] + num_workers : int, optional + number of workers to use for parallel processing. The default is 1. + feature_list : list, optional + list of column names that represent the features to process. The default is NEIGHBOR_FEATURE_LIST + local_radius_list : list, optional + list of integers that represent the radius in microns to use for finding neighbors. The default is LOCAL_RADIUS_LIST. + -1 signifies the whole colony + exclude_outliers : bool, optional + whether to exclude outliers. The default is True. 
+ + Returns + ------- + dforig : pd.DataFrame + original dataframe with the newly computed columns added + index of dataframe = CellId + """ + + dforig = df.copy() + original_columns = dforig.columns.tolist() + if exclude_outliers: + # to ensure that outlier data points are not used for the neighborhood avgs, filter out time point outliers here + df = all_timepoints_minimal_filtering(df) + + # also be sure to filter out non-interphase cells from neighborhood + # df = filter_data.filter_out_non_interphase_size_shape_flag(df) + # df = filter_data.filter_out_cells_entering_or_exiting_mitosis(df) + + for colony in df.colony.unique(): + dfi = df[df["colony"] == colony] + pass_cols = ["index_sequence", "colony", "track_id", "centroid_x", "centroid_y"] + + columns = feature_list + [x for x in dfi.columns if "dxdt" in x] + + # first find the unique index_sequence values + index_sequences = dfi["index_sequence"].unique() + dfin = dfi[ + pass_cols + columns + ] # reduce the size of the dataframe being passed in by only including necessary columns + + # run in parallel + args_list = [ + (dfin[dfin["index_sequence"] == t], local_radius_list, columns) for t in index_sequences + ] + num_workers = np.min([num_workers, cpu_count()]) + if num_workers == 1: + results = list(map(run_in_parallel, args_list)) + else: + results = list(Pool(num_workers).imap_unordered(run_in_parallel, args_list)) + dft = pd.concat(results) + new_cols = [x for x in dft.columns if x not in original_columns] + dforig.loc[dft.index, new_cols] = dft[new_cols] + return dforig + +# %% From 8e008e2354ceea6a46b7579faaa38ff6503fe371 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 25 Sep 2024 22:11:27 -0700 Subject: [PATCH 22/68] worklfow only loads data once --- .../chantelle_linear_regression_workflow.py | 39 +++++++++++ .../linear_regression/select_features.py | 68 +++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 
nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py create mode 100644 nuc_morph_analysis/analyses/linear_regression/select_features.py diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py new file mode 100644 index 00000000..ad228031 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -0,0 +1,39 @@ +#%% +import warnings +import numpy as np +import pandas as pd +from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data, add_features, add_neighborhood_avg_features_lrm +from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import fit_linear_regression +from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, TARGET_SETTINGS) +from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric + +pd.options.mode.chained_assignment = None # default='warn' +warnings.simplefilter(action="ignore", category=FutureWarning) + +#%% +df_all = global_dataset_filtering.load_dataset_with_features() +df_full = filter_data.all_timepoints_full_tracks(df_all) +df_track_level_features = filter_data.track_level_features(df_full) + +#%% +for target in ['volume_at_C', 'delta_volume_BC', 'duration_BC']: + fit_linear_regression( + df_track_level_features, + cols=get_feature_list(['features'], target), + target=target, + alpha=np.arange(0, 15, 0.1, dtype=float), + tol=TARGET_SETTINGS[target]['tolerance'], + save_path="./figures/feats/" + ) + print(f"Finished {target}") +#%% +for target in ['volume_at_C', 'delta_volume_BC', 'duration_BC']: + fit_linear_regression( + df_track_level_features, + cols=get_feature_list(['features', 'lineage_feats'], target), + target=target, + alpha=np.arange(0, 15, 0.1, dtype=float), + tol=TARGET_SETTINGS[target]['tolerance'], + 
save_path="./figures/feats_plus_lineage/" + ) + print(f"Finished {target} with lineage") diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py new file mode 100644 index 00000000..964f2c0a --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -0,0 +1,68 @@ + +#%% +FEATURE_GROUPS = { + 'features': [ + 'volume_at_B', #intrinic at start of growth + 'height_at_B', + 'SA_vol_ratio_at_B', + 'SA_at_B', + 'xy_aspect_at_B', + 'time_at_B', + 'colony_time_at_B', + 'density_at_B', + + 'neighbor_avg_lrm_volume_90um_at_B', # extrinsic at start of growth + 'neighbor_avg_lrm_height_90um_at_B', + 'neighbor_avg_lrm_density_90um_at_B', + 'neighbor_avg_lrm_xy_aspect_90um_at_B', + 'neighbor_avg_lrm_mesh_sa_90um_at_B', + 'early_transient_gr_whole_colony', + + 'sum_has_mitotic_neighbor', # extrinsic lifetime + 'sum_has_dying_neighbor', + 'mean_neighbor_avg_lrm_volume_90um', + 'mean_neighbor_avg_lrm_height_90um', + 'mean_neighbor_avg_lrm_density_90um', + 'mean_neighbor_avg_lrm_xy_aspect_90um', + 'mean_neighbor_avg_lrm_mesh_sa_90um', + 'mean_neighbor_avg_dxdt_48_volume_whole_colony', + ], + + 'lineage_feats': [ + 'sisters_volume_at_B', + 'sisters_duration_BC', + ], +} + +TARGET_CONTAINTING_FEATS = { + 'duration_BC': [ + '' + ], + 'volume_at_C': [ + '' + ], + 'delta_volume_BC': [ + '' + ] +} + +TARGET_SETTINGS = { + 'duration_BC': { + 'tolerance': 0.08, + }, + 'volume_at_C': { + 'tolerance': 0.05, + }, + 'delta_volume_BC': { + 'tolerance': 0.08, + } +} + +def get_feature_list(feature_group_list, target): + features = [] + for group in feature_group_list: + features = features + FEATURE_GROUPS[group] + + features = [feature for feature in features if feature not in TARGET_CONTAINTING_FEATS[target]] + + return features \ No newline at end of file From 1cff1380e746fccd4bd5e092beb9eaf50fdf1197 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 27 Sep 2024 
13:27:11 -0700 Subject: [PATCH 23/68] Update features and names --- .../preprocessing/global_dataset_filtering.py | 12 +++++---- .../lib/visualization/label_tables.py | 26 ++++++++++--------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 9d07c754..c79e63dd 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -270,14 +270,16 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_feature_at(df_full, "frame_transition", feature, feature) df_full = add_features.get_early_transient_gr_of_whole_colony(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_whole_colony')[0]) + df_full = add_features.get_early_transient_gr_of_neighborhood(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_90um')[0]) df_full = add_features.sum_mitotic_events_along_full_track(df_full) ft_list = ['neighbor_avg_dxdt_48_volume_whole_colony', - 'neighbor_avg_lrm_volume_90um', - 'neighbor_avg_lrm_height_90um', - 'neighbor_avg_lrm_density_90um', - 'neighbor_avg_lrm_xy_aspect_90um', - 'neighbor_avg_lrm_mesh_sa_90um'] + 'neighbor_avg_dxdt_48_volume_90um', + 'neighbor_avg_lrm_volume_90um', + 'neighbor_avg_lrm_height_90um', + 'neighbor_avg_lrm_density_90um', + 'neighbor_avg_lrm_xy_aspect_90um', + 'neighbor_avg_lrm_mesh_sa_90um'] multiplier_list = [get_plot_labels_for_metric(x)[0] for x in ft_list] df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) for feat in ft_list: diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index 2966a9c5..bd5828ec 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ 
-225,14 +225,15 @@ def get_scale_factor_table(dataset="all_baseline"): "density_at_B": "Starting density", "xy_aspect_at_B": "Starting XY aspect ratio", "SA_vol_ratio_at_B": "Starting SA/Volume ratio", - "early_transient_gr_whole_colony": "~Starting transient growth rate of whole colony", + "early_transient_gr_whole_colony": "~Starting avg. transient growth rate of whole colony", + "early_transient_gr_90um": "~Starting avg. transient growth rate in 90 \u00B5m radius", "mean_volume": "Mean volume", "mean_height": "Mean height", "mean_density": "Mean density", "mean_mesh_sa": "Mean surface area", "mean_xy_aspect": "Mean XY aspect ratio", "mean_SA_vol_ratio": "Mean SA/Volume ratio", - "mean_neighbor_avg_dxdt_48_volume_whole_colony": "Mean transient growth rate of whole colony", + "mean_neighbor_avg_dxdt_48_volume_whole_colony": "Mean avg. transient growth rate of whole colony", "std_volume": "Stdev. volume", "std_height": "Stdev. height", "std_density": "Stdev. density", @@ -240,17 +241,18 @@ def get_scale_factor_table(dataset="all_baseline"): "std_xy_aspect": "Stdev. XY aspect ratio", "std_SA_vol_ratio": "Stdev. SA/Volume ratio", "std_neighbor_avg_dxdt_48_volume_whole_colony": "Stdev. transient growth rate of whole colony", - 'neighbor_avg_lrm_volume_90um_at_B': "Starting avg. volume in 90um radius", - 'neighbor_avg_lrm_height_90um_at_B': "Starting avg. height in 90um radius", - 'neighbor_avg_lrm_density_90um_at_B': "Starting avg. density in 90um radius", - 'neighbor_avg_lrm_xy_aspect_90um_at_B': "Starting avg. XY aspect ratio in 90um radius", - 'neighbor_avg_lrm_mesh_sa_90um_at_B': "Starting avg. surface area in 90um radius", + 'neighbor_avg_lrm_volume_90um_at_B': "Starting avg. volume in 90 \u00B5m radius", + 'neighbor_avg_lrm_height_90um_at_B': "Starting avg. height in 90 \u00B5m radius", + 'neighbor_avg_lrm_density_90um_at_B': "Starting avg. density in 90 \u00B5m radius", + 'neighbor_avg_lrm_xy_aspect_90um_at_B': "Starting avg. 
XY aspect ratio in 90 \u00B5m radius", + 'neighbor_avg_lrm_mesh_sa_90um_at_B': "Starting avg. surface area in 90 \u00B5m radius", - 'mean_neighbor_avg_lrm_volume_90um': " Avg. mean volume in 90um radius", - 'mean_neighbor_avg_lrm_height_90um': "Avg. mean height in 90um radius", - 'mean_neighbor_avg_lrm_density_90um': "Avg. mean density in 90um radius", - 'mean_neighbor_avg_lrm_xy_aspect_90um': "Avg. mean XY aspect ratio in 90um radius", - 'mean_neighbor_avg_lrm_mesh_sa_90um': "Avg. mean surface area in 90um radius", + "mean_neighbor_avg_dxdt_48_volume_90um": "Mean avg. transient growth rate in 90 \u00B5m radius", + 'mean_neighbor_avg_lrm_volume_90um': " Avg. mean volume in 90 \u00B5m radius", + 'mean_neighbor_avg_lrm_height_90um': "Avg. mean height in 90 \u00B5m radius", + 'mean_neighbor_avg_lrm_density_90um': "Avg. mean density in 90 \u00B5m radius", + 'mean_neighbor_avg_lrm_xy_aspect_90um': "Avg. mean XY aspect ratio in 90 \u00B5m radius", + 'mean_neighbor_avg_lrm_mesh_sa_90um': "Avg. 
mean surface area in 90 \u00B5m radius", # mitotic and apoptotic neighbor columns "number_of_frame_of_breakdown_neighbors": "# of neighboring cells undergoing breakdown", From f0967ebd022cc1f082060d966975c53be822df3d Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 27 Sep 2024 13:27:32 -0700 Subject: [PATCH 24/68] add feature correlation plot --- .../chantelle_linear_regression_workflow.py | 11 +++-- .../linear_regression/select_features.py | 44 +++++++++++++++---- .../lib/preprocessing/add_features.py | 28 ++++++++++++ 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index ad228031..7b030e69 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -2,10 +2,11 @@ import warnings import numpy as np import pandas as pd -from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data, add_features, add_neighborhood_avg_features_lrm +from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import fit_linear_regression -from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, TARGET_SETTINGS) -from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric +from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, + plot_feature_correlations, + TARGET_SETTINGS) pd.options.mode.chained_assignment = None # default='warn' warnings.simplefilter(action="ignore", category=FutureWarning) @@ -15,6 +16,10 @@ df_full = filter_data.all_timepoints_full_tracks(df_all) df_track_level_features = 
filter_data.track_level_features(df_full) +#%% +feature_list = get_feature_list(['features', 'lineage_feats'], None) +plot_feature_correlations(df_track_level_features, feature_list, "linear_regression/figures") + #%% for target in ['volume_at_C', 'delta_volume_BC', 'duration_BC']: fit_linear_regression( diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index 964f2c0a..31160109 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -1,5 +1,8 @@ +import seaborn as sns +import matplotlib.pyplot as plt +from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric +from nuc_morph_analysis.lib.visualization.notebook_tools import save_and_show_plot -#%% FEATURE_GROUPS = { 'features': [ 'volume_at_B', #intrinic at start of growth @@ -16,7 +19,13 @@ 'neighbor_avg_lrm_density_90um_at_B', 'neighbor_avg_lrm_xy_aspect_90um_at_B', 'neighbor_avg_lrm_mesh_sa_90um_at_B', - 'early_transient_gr_whole_colony', + 'early_transient_gr_90um', + + 'duration_BC', # intrinsic lifetime + 'volume_at_C', + 'delta_volume_BC', + 'late_growth_rate_by_endpoints', + 'tscale_linearityfit_volume', 'sum_has_mitotic_neighbor', # extrinsic lifetime 'sum_has_dying_neighbor', @@ -25,7 +34,7 @@ 'mean_neighbor_avg_lrm_density_90um', 'mean_neighbor_avg_lrm_xy_aspect_90um', 'mean_neighbor_avg_lrm_mesh_sa_90um', - 'mean_neighbor_avg_dxdt_48_volume_whole_colony', + 'mean_neighbor_avg_dxdt_48_volume_90um', ], 'lineage_feats': [ @@ -36,13 +45,17 @@ TARGET_CONTAINTING_FEATS = { 'duration_BC': [ - '' + 'duration_BC', ], 'volume_at_C': [ - '' + 'volume_at_C', + 'delta_volume_BC', + 'late_growth_rate_by_endpoints', ], 'delta_volume_BC': [ - '' + 'volume_at_C', + 'delta_volume_BC', + 'late_growth_rate_by_endpoints', ] } @@ -63,6 +76,21 @@ def get_feature_list(feature_group_list, target): for group in 
feature_group_list: features = features + FEATURE_GROUPS[group] - features = [feature for feature in features if feature not in TARGET_CONTAINTING_FEATS[target]] + if target is not None: + features = [feature for feature in features if feature not in TARGET_CONTAINTING_FEATS[target]] + + return features + +def plot_feature_correlations(df_track_level_features, feature_list, figdir): - return features \ No newline at end of file + data = df_track_level_features[feature_list] + + plt.rc('font', size=22) + plt.figure(figsize=(27, 24)) + sns.heatmap(data.corr(), annot=True, fmt=".1f", cmap='BrBG', vmin=-1, vmax=1, cbar_kws={"shrink": 0.5, "pad": 0.02}) + + column_names = [get_plot_labels_for_metric(col)[1] for col in data.columns] + plt.xticks([x + 0.5 for x in range(len(column_names))], column_names) + plt.yticks([y + 0.5 for y in range(len(column_names))], column_names) + plt.tight_layout() + save_and_show_plot(f'{figdir}/feature_correlation_heatmap') \ No newline at end of file diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 60cd7da3..1cc9df39 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -197,7 +197,35 @@ def get_early_transient_gr_of_whole_colony(df, scale, time_shift=25): df.loc[df.track_id == tid, "early_transient_gr_whole_colony"] = transient_gr_whole_colony * scale return df + +def get_early_transient_gr_of_neighborhood(df, scale, time_shift=25): + """ + Get the transient growth rate of the colony 2 hours into the growth trajectory. + + This time shift of two hours into the growth trajectory is necessary because the metric + is calculated as the average of a 4 hour rolling window. The middle of a four hour window + does not occur until two hours into the timelapse. 
To calculate this feature equivalently + for each trajectory, two hours was used for all tracks to get a metric for the transient + growth rate of the neighborhood early in the growth trajectory. + Parameters + ---------- + df : DataFrame + The dataframe + time_shift : int + The time shift in frames to calculate the transient growth rate in frames + + Returns + ------- + df : DataFrame + The dataframe with the added transient growth rate feature columns + """ + for tid, dft in df.groupby("track_id"): + t_calculate = dft.index_sequence.min() + time_shift + transient_gr_whole_colony = df.loc[df.index_sequence == t_calculate, "neighbor_avg_dxdt_48_volume_90um"].values[0] + df.loc[df.track_id == tid, "early_transient_gr_90um"] = transient_gr_whole_colony * scale + + return df def add_std_feature_over_trajectory(df, feature_list, multiplier_list): """ From e7b4a6ceddbcadec47f6e3471885b92114775195 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 27 Sep 2024 16:42:13 -0700 Subject: [PATCH 25/68] update documentation --- .../linear_regression/select_features.py | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index 31160109..3048ed85 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -45,7 +45,8 @@ TARGET_CONTAINTING_FEATS = { 'duration_BC': [ - 'duration_BC', + 'duration_BC', + 'late_growth_rate_by_endpoints', ], 'volume_at_C': [ 'volume_at_C', @@ -72,6 +73,22 @@ } def get_feature_list(feature_group_list, target): + """ + Get feature list to include in linear model. + Gets full features list and excludes ones that contain target information. 
+ + Parameters + ---------- + feature_group_list: list + List of feature groups to include in the feature list + target: str + Target variable to predict + + Returns + ------- + features: list + List of features to include in the linear model + """ features = [] for group in feature_group_list: features = features + FEATURE_GROUPS[group] @@ -82,7 +99,23 @@ def get_feature_list(feature_group_list, target): return features def plot_feature_correlations(df_track_level_features, feature_list, figdir): + """ + Plot heatmap of feature correlations. + Parameters + ---------- + df_track_level_features : pd.DataFrame + DataFrame containing track level features + feature_list : list + List of features to include in the heatmap + Output from get_feature_list + figdir : str + Directory to save the figure + + Returns + ------- + Figure + """ data = df_track_level_features[feature_list] plt.rc('font', size=22) @@ -93,4 +126,5 @@ def plot_feature_correlations(df_track_level_features, feature_list, figdir): plt.xticks([x + 0.5 for x in range(len(column_names))], column_names) plt.yticks([y + 0.5 for y in range(len(column_names))], column_names) plt.tight_layout() + save_and_show_plot(f'{figdir}/feature_correlation_heatmap') \ No newline at end of file From e88accf2c7c9d78f25be4983be63a78524ffd3e3 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 30 Sep 2024 12:46:40 -0700 Subject: [PATCH 26/68] Normalize sum of mitotic and death events --- .../linear_regression/select_features.py | 4 ++-- .../lib/preprocessing/add_features.py | 20 +++++++++++++++++++ .../preprocessing/global_dataset_filtering.py | 1 + .../lib/visualization/label_tables.py | 4 ++-- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index 3048ed85..a7782ac9 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ 
b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -27,8 +27,8 @@ 'late_growth_rate_by_endpoints', 'tscale_linearityfit_volume', - 'sum_has_mitotic_neighbor', # extrinsic lifetime - 'sum_has_dying_neighbor', + 'normalized_sum_has_mitotic_neighbor', # extrinsic lifetime + 'normalized_sum_has_dying_neighbor', 'mean_neighbor_avg_lrm_volume_90um', 'mean_neighbor_avg_lrm_height_90um', 'mean_neighbor_avg_lrm_density_90um', diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 1cc9df39..d94a71dc 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -639,3 +639,23 @@ def sum_mitotic_events_along_full_track(df0, feature_list=[]): return sum_events_along_full_track(df0, feature_list) +def normalize_sum_events(df_full, event_cols): + """ + Normalize sum of mitotic and death events by growth duration + + Parameters + ---------- + df_full: DataFrame + The dataframe of full tracks + event_cols: list + ie. 
'sum_has_mitotic_neighbor', 'sum_has_dying_neighbor' + + Returns + ------- + df_full: DataFrame + The dataframe with the normalized sum of events columns + """ + for col in event_cols: + df_full[f"normalized_{col}"] = df_full[col] / df_full['duration_BC'] + return df_full + diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index c79e63dd..fa35f97a 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -272,6 +272,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.get_early_transient_gr_of_whole_colony(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_whole_colony')[0]) df_full = add_features.get_early_transient_gr_of_neighborhood(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_90um')[0]) df_full = add_features.sum_mitotic_events_along_full_track(df_full) + df_full = add_features.normalize_sum_events(df_full, ['sum_has_mitotic_neighbor', 'sum_has_dying_neighbor']) ft_list = ['neighbor_avg_dxdt_48_volume_whole_colony', 'neighbor_avg_dxdt_48_volume_90um', diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index bd5828ec..5eae3a97 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -258,8 +258,8 @@ def get_scale_factor_table(dataset="all_baseline"): "number_of_frame_of_breakdown_neighbors": "# of neighboring cells undergoing breakdown", "number_of_frame_of_formation_neighbors": "# of neighboring cells undergoing formation", "number_of_frame_of_death_neighbors": "# of neighboring cells undergoing death", - "sum_has_mitotic_neighbor": "Sum of mitotic neighbors", - "sum_has_dying_neighbor": "Sum of dying neighbors", + "normalized_sum_has_mitotic_neighbor": 
"Norm. sum of mitotic neighbors", + "normalized_sum_has_dying_neighbor": "Norm. sum of dying neighbors", } From 5fa046274e8f5c1bdb043651bf584c89c9753627 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 1 Oct 2024 12:09:07 -0700 Subject: [PATCH 27/68] fix string --- .../analyses/linear_regression/linear_regression_workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index 049a9aec..b473868d 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -229,7 +229,7 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): g.savefig(this_path, dpi=300) # save movie of pngs - writer = imageio.get_writer(save_path / "coefficients_over_time.mp4", fps=2) + writer = imageio.get_writer(save_path / f"{target}_coefficients_over_time.mp4", fps=2) for im in files: writer.append_data(imageio.imread(im)) writer.close() From 3d870fb8f6c34ab8806e82f581f0f6c209ab7b70 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 1 Oct 2024 12:10:12 -0700 Subject: [PATCH 28/68] add delta volume as a lineage feature --- .../analyses/linear_regression/select_features.py | 1 + .../lib/preprocessing/global_dataset_filtering.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index a7782ac9..c7e8073c 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -40,6 +40,7 @@ 'lineage_feats': [ 'sisters_volume_at_B', 'sisters_duration_BC', + 'sisters_delta_volume_BC' ], } diff --git 
a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index fa35f97a..b96353c3 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -261,7 +261,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_growth_features.fit_tracks_to_time_powerlaw(df_full, "volume", interval) # For LRM - df_full = add_features.add_lineage_features(df_full, feature_list=['volume_at_B', 'duration_BC']) + df_full = add_features.add_lineage_features(df_full, feature_list=['volume_at_B', 'duration_BC', 'volume_at_C', 'delta_volume_BC']) df_full = add_features.add_feature_at(df_full, "frame_transition", 'height', 'height_percentile', pix_size) df_full = add_features.add_feature_at(df_full, "frame_transition", 'density', 'density', pix_size) From 37d41819d54f4f376ce108c022ba95ec62bf0f05 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 1 Oct 2024 14:04:04 -0700 Subject: [PATCH 29/68] update label tables --- nuc_morph_analysis/lib/visualization/label_tables.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index 5eae3a97..e0ded20f 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -208,6 +208,7 @@ def get_scale_factor_table(dataset="all_baseline"): "family_id": "Family ID", "sisters_volume_at_B": "Sisters starting volume", "sisters_duration_BC": "Sisters growth duration", + "sisters_delta_volume_BC": "Sisters added volume", # Flags "is_outlier": "Outlier Flag", "is_tp_outlier": "Single Timepoint Outlier", From 830aedd5ff92cc0fdd4b6156c990ee5bb84b66df Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: Tue, 1 Oct 2024 16:03:59 -0700 Subject: [PATCH 30/68] add greedy removal workflow --- 
.../linear_regression_workflow.py | 95 +++--- ...near_regression_workflow_greedy_removal.py | 291 ++++++++++++++++++ 2 files changed, 350 insertions(+), 36 deletions(-) create mode 100644 nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index 049a9aec..c15bcf89 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -1,9 +1,7 @@ import argparse -import ast import os import warnings from pathlib import Path -import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns @@ -12,27 +10,45 @@ RepeatedKFold, cross_validate, ) +from tqdm import tqdm from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler -from tqdm import tqdm from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data from sklearn.model_selection import permutation_test_score from nuc_morph_analysis.lib.visualization.plotting_tools import ( get_plot_labels_for_metric, ) import imageio +from nuc_morph_analysis.analyses.linear_regression.select_features import ( + get_feature_list, +) +from nuc_morph_analysis.analyses.linear_regression.utils import ( + list_of_strings, + list_of_floats, +) pd.options.mode.chained_assignment = None # default='warn' warnings.simplefilter(action="ignore", category=FutureWarning) -def main(cols, target, alpha_range, tolerance, save_path, cached_dataframe=None): +def main( + cols, + target, + alpha_range, + tolerance, + save_path, + cached_dataframe=None, + save_movie=False, +): save_path = Path(save_path) save_path = save_path / Path("linear_regression") save_path.mkdir(parents=True, exist_ok=True) + if len(cols) < 1: + cols = get_feature_list(["features", 
"lineage_feats"], None) + if not cached_dataframe: df_all = global_dataset_filtering.load_dataset_with_features() df_full = filter_data.all_timepoints_full_tracks(df_all) @@ -41,11 +57,19 @@ def main(cols, target, alpha_range, tolerance, save_path, cached_dataframe=None) df_track_level_features = pd.read_csv(cached_dataframe) fit_linear_regression( - df_track_level_features, cols, target, alpha_range, tolerance, save_path + df_track_level_features, + cols, + target, + alpha_range, + tolerance, + save_path, + save_movie, ) -def fit_linear_regression(data, cols, target, alpha, tol, save_path): +def fit_linear_regression( + data, cols, target, alpha, tol, save_path, save, permute_cols=[] +): """ data - track level features cols - input features @@ -53,6 +77,8 @@ def fit_linear_regression(data, cols, target, alpha, tol, save_path): alpha - hyperparameter for lasso tol - tolerance to check drop in r^2 for finding best alpha (ex. 0.02) save_path - location to save files + save - whether to save movies and pngs + permute_col - list of features to permute and replace with noise """ sns.set_context("talk") random_state = 2652124 @@ -73,13 +99,18 @@ def fit_linear_regression(data, cols, target, alpha, tol, save_path): alpha = [round(i, 1) for i in alpha] # find best alpha for Lasso model - for alpha_ind, this_alpha in enumerate(alpha): - print("fitting alpha", this_alpha) - + for alpha_ind, this_alpha in tqdm(enumerate(alpha), total=len(alpha)): # drop any nan rows dropna_cols = cols + [target] data = data.dropna(subset=dropna_cols) + # permute columns if necessary + if len(permute_cols) > 0: + for col in permute_cols: + mu, sigma = 0, 1 + noise = np.random.normal(mu, sigma, len(data)) + data[col] = noise + # make numpy array for inputs and target all_input = data[cols].reset_index(drop=True).values all_target = data[target].values @@ -161,7 +192,6 @@ def fit_linear_regression(data, cols, target, alpha, tol, save_path): all_test_sc["Test MSE"] = -all_test_sc["Test MSE"] 
save_path = save_path / Path(f"{target}") save_path.mkdir(parents=True, exist_ok=True) - all_test_sc.to_csv(save_path / "mse.csv") # Get coeffs for all alpha all_coef_alpha = pd.concat(all_coef_alpha, axis=0).reset_index(drop=True) @@ -170,14 +200,16 @@ def fit_linear_regression(data, cols, target, alpha, tol, save_path): var_name="Column", value_name="Coefficient Importance", ).reset_index(drop=True) - all_coef_alpha.to_csv(save_path / "coefficients.csv") # Get permutation scores and p values for all alpha all_perms = pd.DataFrame(all_perms).reset_index(drop=True) - all_perms.to_csv(save_path / "perm_scores.csv") - # Save coefficient plot for max alpha value - save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path) + # Save coefficient plot movie + if save: + all_test_sc.to_csv(save_path / "mse.csv") + all_coef_alpha.to_csv(save_path / "coefficients.csv") + all_perms.to_csv(save_path / "perm_scores.csv") + save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path) return all_coef_alpha, all_test_sc, all_perms @@ -207,10 +239,10 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): aspect=2, height=10, ) - - g.set(ylabel='') - - g.fig.subplots_adjust(top=.9) # adjust the Figure in rp + + g.set(ylabel="") + + g.fig.subplots_adjust(top=0.9) # adjust the Figure in rp g.fig.suptitle( f"Prediction of {get_plot_labels_for_metric(target)[1]}\nalpha={alpha}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" ) @@ -232,26 +264,10 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): writer = imageio.get_writer(save_path / "coefficients_over_time.mp4", fps=2) for im in files: writer.append_data(imageio.imread(im)) + os.remove(im) writer.close() -def list_of_strings(arg): - return arg.split(",") - - -def list_of_floats(arg): - return list(map(float, arg.split(","))) - - -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif 
v.lower() in ("no", "false", "False", "f", "n", "0"): - return False - - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run the linear regression workflow") # Optional command line argument @@ -267,7 +283,7 @@ def str2bool(v): parser.add_argument( "--cols", type=list_of_strings, - default=["volume_at_B", "time_at_B", "colony_time_at_B", "SA_at_B"], + default=[], help="Supply a list of column names to use as independent variables in the linear regression analysis.", ) parser.add_argument( @@ -294,6 +310,12 @@ def str2bool(v): default=0.02, help="Tolerace for change in regression score to determine best alpha", ) + parser.add_argument( + "--save", + type=bool, + default=False, + help="Save plots", + ) args = parser.parse_args() main( cols=args.cols, @@ -302,4 +324,5 @@ def str2bool(v): tolerance=args.tolerance, save_path=args.save_path, cached_dataframe=args.cached_dataframe, + save=args.save, ) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py new file mode 100644 index 00000000..aeaa6aa7 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py @@ -0,0 +1,291 @@ +import argparse +import warnings +from pathlib import Path +import numpy as np +import pandas as pd +from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data +from nuc_morph_analysis.analyses.linear_regression.select_features import ( + get_feature_list, +) +from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import ( + fit_linear_regression, +) +from nuc_morph_analysis.analyses.linear_regression.utils import ( + list_of_strings, + list_of_floats, +) +from nuc_morph_analysis.lib.visualization.plotting_tools import ( + get_plot_labels_for_metric, +) +import imageio +import seaborn as sns +import os +import matplotlib.pyplot as plt 
+ +pd.options.mode.chained_assignment = None # default='warn' + +warnings.simplefilter(action="ignore", category=FutureWarning) + + +def main( + cols, + target, + alpha_range, + tolerance, + save_path, + max_iterations, + cached_dataframe=None, +): + + save_path = Path(save_path) + save_path = save_path / Path("linear_regression_greedy") / Path(f"{target}") + save_path.mkdir(parents=True, exist_ok=True) + + if len(cols) < 1: + cols = get_feature_list(["features"], target) + + label_list = [get_plot_labels_for_metric(col)[1] for col in cols] + map_dict = {i: j for i, j in zip(cols, label_list)} + + if not cached_dataframe: + df_all = global_dataset_filtering.load_dataset_with_features() + df_full = filter_data.all_timepoints_full_tracks(df_all) + df_track_level_features = filter_data.track_level_features(df_full) + df_track_level_features.to_csv( + "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" + ) + else: + df_track_level_features = pd.read_csv(cached_dataframe) + + permute_cols = [] + count = 0 + + removal_coefs = [] + removal_test_sc = [] + removal_perms = [] + while (count < len(cols) - 2) and (count < max_iterations): + print( + f"Iteration {count}, Total cols {len(cols)}, Removing features {permute_cols}" + ) + try: + all_coef_alpha, all_test_sc, all_perms = fit_linear_regression( + df_track_level_features, + cols, + target, + alpha_range, + tolerance, + save_path, + False, + permute_cols, + ) + except ValueError: + break + # Save max alpha + max_alpha = all_coef_alpha["alpha"].max() + all_coef_alpha = all_coef_alpha.loc[all_coef_alpha["alpha"] == max_alpha] + all_test_sc = all_test_sc.loc[all_test_sc["alpha"] == max_alpha] + all_perms = all_perms.loc[all_perms["alpha"] == max_alpha] + all_coef_alpha["iteration"] = count + all_test_sc["iteration"] = count + all_perms["iteration"] = count + + # Get max coefficient to remove + tmp = all_coef_alpha.copy() + tmp["Coefficient Importance"] = tmp["Coefficient 
Importance"].abs() + tmp = tmp.loc[ + tmp["Coefficient Importance"] == tmp["Coefficient Importance"].max() + ] + remove_column = tmp["Column"].item() + + # add this to list of columns to permute + permute_cols.append(remove_column) + + # save out info + all_coef_alpha["iteration"] = count + all_test_sc["iteration"] = count + all_perms["iteration"] = count + + all_coef_alpha["feature_removed"] = remove_column + all_test_sc["feature_removed"] = remove_column + all_perms["feature_removed"] = remove_column + + all_coef_alpha = all_coef_alpha.groupby(["Column"]).mean().reset_index() + + label_list = [ + get_plot_labels_for_metric(col)[1] + for col in all_coef_alpha["Column"].unique() + ] + all_coef_alpha["Column"] = label_list + + all_coef_alpha.to_csv(save_path / f"removal_coefficients_{count}.csv") + all_test_sc.to_csv(save_path / f"removal_test_sc_{count}.csv") + all_perms.to_csv(save_path / f"removal_perms_{count}.csv") + + removal_coefs.append(all_coef_alpha) + removal_test_sc.append(all_test_sc) + removal_perms.append(all_perms) + + count += 1 + + removal_coefs = pd.concat(removal_coefs, axis=0).reset_index(drop=True) + removal_test_sc = pd.concat(removal_test_sc, axis=0).reset_index(drop=True) + removal_perms = pd.concat(removal_perms, axis=0).reset_index(drop=True) + + removal_coefs.to_csv(save_path / "removal_coefficients.csv") + removal_test_sc.to_csv(save_path / "removal_test_sc.csv") + removal_perms.to_csv(save_path / "removal_perms.csv") + + save_plots( + removal_coefs, removal_test_sc, removal_perms, target, save_path, map_dict + ) + + +def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path, map_dict): + sns.set_style("whitegrid") + xlim = None + files = [] + perm_cols = [] + perm_coeffs = [] + y_order = [] + + xlim = [ + all_coef_alpha["Coefficient Importance"].min(), + all_coef_alpha["Coefficient Importance"].max(), + ] + for iter in all_coef_alpha["iteration"].unique(): + this_coef_alpha = all_coef_alpha.loc[ + 
all_coef_alpha["iteration"] == iter + ].reset_index(drop=True) + this_test_sc = all_test_sc.loc[all_test_sc["iteration"] == iter].reset_index( + drop=True + ) + this_perms = all_perms.loc[all_perms["iteration"] == iter].reset_index( + drop=True + ) + if iter > 0: + prev_perms = all_perms.loc[all_perms["iteration"] == iter - 1].reset_index( + drop=True + ) + prev_coefs = all_coef_alpha.loc[ + all_coef_alpha["iteration"] == iter - 1 + ].reset_index(drop=True) + feat_removed = map_dict[prev_perms["feature_removed"].item()] + feat_coefficient = prev_coefs.loc[prev_coefs["Column"] == feat_removed][ + "Coefficient Importance" + ].item() + perm_cols.append(feat_removed) + perm_coeffs.append(feat_coefficient) + print(perm_cols, perm_coeffs) + p_value = round(this_perms["p_value"].item(), 3) + test_r2_mean = round(this_test_sc["Test r$^2$"].mean(), 2) + test_r2_std = round(this_test_sc["Test r$^2$"].std() / 2, 2) + + this_coef_alpha["removed"] = False + if len(perm_cols) > 0: + this1 = this_coef_alpha.loc[this_coef_alpha["Column"].isin(perm_cols)] + this2 = this_coef_alpha.loc[~this_coef_alpha["Column"].isin(perm_cols)] + for col in perm_cols: + ind = perm_cols.index(col) + final_coeff = perm_coeffs[ind] + this1.loc[this1["Column"] == col, "Coefficient Importance"] = ( + final_coeff + ) + this1["removed"] = True + this_coef_alpha = pd.concat([this1, this2], axis=0).reset_index(drop=True) + + if len(y_order) == 0: + y_order = this_coef_alpha["Column"].values + + g = sns.catplot( + data=this_coef_alpha, + y="Column", + x="Coefficient Importance", + hue="removed", + kind="bar", + order=y_order, + errorbar="sd", + aspect=2, + height=10, + ) + + g.set(ylabel="") + + g.fig.subplots_adjust(top=0.9) # adjust the Figure in rp + g.fig.suptitle( + f"Prediction of {get_plot_labels_for_metric(target)[1]}\niteration={iter}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" + ) + label_list = [col for col in all_coef_alpha["Column"].unique()] + g.set_yticklabels(label_list) + 
plt.grid() + this_path = str(save_path / Path(f"coefficients_{target}_iteration_{iter}.png")) + files.append(this_path) + + g.set(xlim=xlim) + g.savefig(this_path, dpi=300) + + # save movie of pngs + writer = imageio.get_writer(save_path / "coefficients_over_time.mp4", fps=2) + for im in files: + writer.append_data(imageio.imread(im)) + os.remove(im) + writer.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the linear regression workflow") + # Optional command line argument + parser.add_argument( + "--cached_dataframe", + type=str, + metavar="path", + help="Supply a path to a dataframe to skip data preprocessing. If included, dataframe " + "should match the result of linear_regression_analysis.get_data (see source code for " + "details).", + ) + + parser.add_argument( + "--cols", + type=list_of_strings, + default=[], + help="Supply a list of column names to use as independent variables in the linear regression analysis.", + ) + parser.add_argument( + "--target", + type=str, + default="duration_BC", + help="Supply a column name for a dependent variable to perform regression on", + ) + parser.add_argument( + "--alpha_range", + type=list_of_floats, + default=np.arange(0.5, 15, 0.2, dtype=float), + help="Supply a list of alpha values to use in lasso regression", + ) + parser.add_argument( + "--save_path", + type=str, + default="figures", + help="local folder name where plots will be saved", + ) + parser.add_argument( + "--tolerance", + type=float, + default=0.02, + help="Tolerace for change in regression score to determine best alpha", + ) + parser.add_argument( + "--max_iterations", + type=int, + default=100, + help="Max iterations for greedy removal", + ) + args = parser.parse_args() + main( + cols=args.cols, + target=args.target, + alpha_range=args.alpha_range, + tolerance=args.tolerance, + save_path=args.save_path, + max_iterations=args.max_iterations, + cached_dataframe=args.cached_dataframe, + ) From 
6bf43427e3d0e27dd25c79cbeb3d2e607563f597 Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: Tue, 1 Oct 2024 16:04:21 -0700 Subject: [PATCH 31/68] add scripts for greedy removal --- .../scripts_greedy_removal/added_volume.sh | 5 +++++ .../scripts_greedy_removal/duration_BC.sh | 5 +++++ .../scripts_greedy_removal/duration_BC_debug copy.sh | 7 +++++++ .../scripts_greedy_removal/duration_BC_debug.sh | 7 +++++++ .../scripts_greedy_removal/ending_volume.sh | 5 +++++ 5 files changed, 29 insertions(+) create mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/added_volume.sh create mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC.sh create mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug copy.sh create mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug.sh create mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/ending_volume.sh diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/added_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/added_volume.sh new file mode 100755 index 00000000..9015c21b --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/added_volume.sh @@ -0,0 +1,5 @@ +python linear_regression_workflow_greedy_removal.py \ +--cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ +--target 'delta_volume_BC' \ +--save_path "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures/" \ +--tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC.sh new file mode 100755 index 00000000..ea06eec8 --- /dev/null +++ 
b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC.sh @@ -0,0 +1,5 @@ +python linear_regression_workflow_greedy_removal.py \ +--cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ +--target 'duration_BC' \ +--save_path "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures/" \ +--tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug copy.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug copy.sh new file mode 100755 index 00000000..a208aab2 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug copy.sh @@ -0,0 +1,7 @@ +python linear_regression_workflow_greedy_removal.py \ +--cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ +--target 'duration_BC' \ +--alpha_range "0.2,0.4,0.6,0.8,1.0,1.2,1.4,1.6,1.8,2.0,2.2,2.4,2.6,2.8,3.0,3.2" \ +--save_path "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures_debug/" \ +--tolerance 0.08 \ +--max_iterations 4 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug.sh new file mode 100755 index 00000000..57768ef6 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug.sh @@ -0,0 +1,7 @@ +python linear_regression_workflow_greedy_removal.py \ +--cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ +--target 'duration_BC' \ +--alpha_range "1.2,1.4" \ +--save_path 
"/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures_debug/" \ +--tolerance 0.08 \ +--max_iterations 4 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/ending_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/ending_volume.sh new file mode 100755 index 00000000..6a7c55b7 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/ending_volume.sh @@ -0,0 +1,5 @@ +python linear_regression_workflow_greedy_removal.py \ +--cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ +--target 'volume_at_C' \ +--save_path "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures/" \ +--tolerance 0.08 \ No newline at end of file From c0ec8e1fd9dcc099fcfe1867cedf006ef6ebe684 Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: Tue, 1 Oct 2024 16:04:52 -0700 Subject: [PATCH 32/68] move some fns to utils --- .../analyses/linear_regression/utils.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 nuc_morph_analysis/analyses/linear_regression/utils.py diff --git a/nuc_morph_analysis/analyses/linear_regression/utils.py b/nuc_morph_analysis/analyses/linear_regression/utils.py new file mode 100644 index 00000000..423cf66b --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/utils.py @@ -0,0 +1,15 @@ +def list_of_strings(arg): + return arg.split(",") + + +def list_of_floats(arg): + return list(map(float, arg.split(","))) + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "False", "f", "n", "0"): + return False From 4619e6d5a45f8968a2b5162b125f40c514fa8dfe Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 2 Oct 2024 11:08:42 -0700 Subject: [PATCH 33/68] update workflow to save 
fit_linear_regression --- .../chantelle_linear_regression_workflow.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index 7b030e69..126d5d5b 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -28,7 +28,8 @@ target=target, alpha=np.arange(0, 15, 0.1, dtype=float), tol=TARGET_SETTINGS[target]['tolerance'], - save_path="./figures/feats/" + save_path="./figures/feats/", + save=True ) print(f"Finished {target}") #%% @@ -39,6 +40,9 @@ target=target, alpha=np.arange(0, 15, 0.1, dtype=float), tol=TARGET_SETTINGS[target]['tolerance'], - save_path="./figures/feats_plus_lineage/" + save_path="./figures/feats_plus_lineage/", + save=True ) print(f"Finished {target} with lineage") + +# %% From 6868f2a83b0aa10c27409441a6b9925f09d008c5 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 4 Oct 2024 17:32:33 -0700 Subject: [PATCH 34/68] update features into catagories --- .../linear_regression/select_features.py | 54 +++++++++---------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index c7e8073c..7fefd31c 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -4,30 +4,39 @@ from nuc_morph_analysis.lib.visualization.notebook_tools import save_and_show_plot FEATURE_GROUPS = { - 'features': [ - 'volume_at_B', #intrinic at start of growth + 'start_intrinsic': [ #intrinic at start of growth + 'volume_at_B', 'height_at_B', 'SA_vol_ratio_at_B', 'SA_at_B', 'xy_aspect_at_B', + 'density_at_B', + 
'sisters_volume_at_B', + ], + + 'lifetime_intrinsic': [ # intrinsic lifetime + 'duration_BC', + 'volume_at_C', + 'delta_volume_BC', + 'late_growth_rate_by_endpoints', + 'tscale_linearityfit_volume', + 'sisters_duration_BC', + 'sisters_delta_volume_BC', + ], + + 'start_extrinsic': [ # extrinsic at start of growth 'time_at_B', 'colony_time_at_B', - 'density_at_B', - - 'neighbor_avg_lrm_volume_90um_at_B', # extrinsic at start of growth + 'neighbor_avg_lrm_volume_90um_at_B', 'neighbor_avg_lrm_height_90um_at_B', 'neighbor_avg_lrm_density_90um_at_B', 'neighbor_avg_lrm_xy_aspect_90um_at_B', 'neighbor_avg_lrm_mesh_sa_90um_at_B', - 'early_transient_gr_90um', - - 'duration_BC', # intrinsic lifetime - 'volume_at_C', - 'delta_volume_BC', - 'late_growth_rate_by_endpoints', - 'tscale_linearityfit_volume', - - 'normalized_sum_has_mitotic_neighbor', # extrinsic lifetime + 'early_transient_gr_90um', + ], + + 'lifetime_extrinsic': [ # extrinsic lifetime + 'normalized_sum_has_mitotic_neighbor', 'normalized_sum_has_dying_neighbor', 'mean_neighbor_avg_lrm_volume_90um', 'mean_neighbor_avg_lrm_height_90um', @@ -36,28 +45,20 @@ 'mean_neighbor_avg_lrm_mesh_sa_90um', 'mean_neighbor_avg_dxdt_48_volume_90um', ], - - 'lineage_feats': [ - 'sisters_volume_at_B', - 'sisters_duration_BC', - 'sisters_delta_volume_BC' - ], + } TARGET_CONTAINTING_FEATS = { 'duration_BC': [ 'duration_BC', - 'late_growth_rate_by_endpoints', - ], - 'volume_at_C': [ - 'volume_at_C', - 'delta_volume_BC', 'late_growth_rate_by_endpoints', + 'mean_neighbor_avg_dxdt_48_volume_90um', ], 'delta_volume_BC': [ 'volume_at_C', 'delta_volume_BC', 'late_growth_rate_by_endpoints', + 'mean_neighbor_avg_dxdt_48_volume_90um', ] } @@ -65,9 +66,6 @@ 'duration_BC': { 'tolerance': 0.08, }, - 'volume_at_C': { - 'tolerance': 0.05, - }, 'delta_volume_BC': { 'tolerance': 0.08, } From fda3a6855b931a1f7e15521801b05f0d186bb3a9 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 4 Oct 2024 17:32:52 -0700 Subject: [PATCH 35/68] update 
workflow to make r squared matrix --- .../chantelle_linear_regression_workflow.py | 58 ++++++++++++++----- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index 126d5d5b..e8ec692d 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -20,29 +20,61 @@ feature_list = get_feature_list(['features', 'lineage_feats'], None) plot_feature_correlations(df_track_level_features, feature_list, "linear_regression/figures") -#%% -for target in ['volume_at_C', 'delta_volume_BC', 'duration_BC']: - fit_linear_regression( + +# %% +config = { + 'duration_BC': [ + {'name': 'all_features', 'features': ['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic']}, + {'name': 'start_intrinsic', 'features': ['start_intrinsic']}, + {'name': 'lifetime_intrinsic', 'features': ['lifetime_intrinsic']}, + {'name': 'start_extrinsic', 'features': ['start_extrinsic']}, + {'name': 'lifetime_extrinsic', 'features': ['lifetime_extrinsic']} + ], + 'delta_volume_BC': [ + {'name': 'all_features', 'features': ['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic']}, + {'name': 'start_intrinsic', 'features': ['start_intrinsic']}, + {'name': 'lifetime_intrinsic', 'features': ['lifetime_intrinsic']}, + {'name': 'start_extrinsic', 'features': ['start_extrinsic']}, + {'name': 'lifetime_extrinsic', 'features': ['lifetime_extrinsic']} + ] +} +df = pd.DataFrame(columns=['target', 'r_squared', 'feature_group', 'alpha', 'feats_used']) + +def run_regression(target, features, name,): + _, all_test_sc, _ = fit_linear_regression( df_track_level_features, - cols=get_feature_list(['features'], target), + cols=get_feature_list(features, target), 
target=target, - alpha=np.arange(0, 15, 0.1, dtype=float), + alpha=[0], tol=TARGET_SETTINGS[target]['tolerance'], - save_path="./figures/feats/", - save=True + save_path=f"./figures/r_squared_matrix/{name}/", + save=True, + multiple_predictions=False ) - print(f"Finished {target}") + print(f"Target {target}, Alpha: 0. Feature group: {name}") + + r_squared = round(all_test_sc["Test r$^2$"].mean(), 3) + + return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'alpha': 0, 'feats_used': get_feature_list(features, target)} + +for target, configs in config.items(): + for config in configs: + result = run_regression(target, **config) + df = df.append(result, ignore_index=True) + df.to_csv(f"./figures/r_squared_matrix/r_squared_results.csv") #%% -for target in ['volume_at_C', 'delta_volume_BC', 'duration_BC']: +print(df.iloc[:, :3]) + +# %% +for target in ['duration_BC', 'delta_volume_BC',]: fit_linear_regression( df_track_level_features, - cols=get_feature_list(['features', 'lineage_feats'], target), + cols=get_feature_list(['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], target), target=target, alpha=np.arange(0, 15, 0.1, dtype=float), tol=TARGET_SETTINGS[target]['tolerance'], - save_path="./figures/feats_plus_lineage/", + save_path="./figures/feats_plus_lineage1/", save=True ) print(f"Finished {target} with lineage") - -# %% +#%% \ No newline at end of file From e55872a5862a6fb50e9aadbf0a19532842c0865a Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 4 Oct 2024 17:33:12 -0700 Subject: [PATCH 36/68] update regression so that you can start at 0 --- .../linear_regression/linear_regression_workflow.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index 9ee6e6e5..23ee7d80 100644 --- 
a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -68,7 +68,7 @@ def main( def fit_linear_regression( - data, cols, target, alpha, tol, save_path, save, permute_cols=[] + data, cols, target, alpha, tol, save_path, save, permute_cols=[], multiple_predictions=True ): """ data - track level features @@ -94,9 +94,10 @@ def fit_linear_regression( "alpha": [], } - # remove 0 alpha due to convergence errors - alpha = [i for i in alpha if i != 0] - alpha = [round(i, 1) for i in alpha] + if multiple_predictions: + # remove 0 alpha due to convergence errors + alpha = [i for i in alpha if i != 0] + alpha = [round(i, 1) for i in alpha] # find best alpha for Lasso model for alpha_ind, this_alpha in tqdm(enumerate(alpha), total=len(alpha)): From 4bf140e068fa871c4fbacb0ea6c128f7beb0bc97 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 7 Oct 2024 15:19:47 -0700 Subject: [PATCH 37/68] in progress --- .../chantelle_linear_regression_workflow.py | 52 ++++++++++++++++--- .../linear_regression/select_features.py | 8 ++- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index e8ec692d..1da308e0 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -8,6 +8,9 @@ plot_feature_correlations, TARGET_SETTINGS) +import seaborn as sns +import matplotlib.pyplot as plt +from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric pd.options.mode.chained_assignment = None # default='warn' warnings.simplefilter(action="ignore", category=FutureWarning) @@ -38,7 +41,7 @@ {'name': 'lifetime_extrinsic', 'features': 
['lifetime_extrinsic']} ] } -df = pd.DataFrame(columns=['target', 'r_squared', 'feature_group', 'alpha', 'feats_used']) +df = pd.DataFrame(columns=['target', 'r_squared', 'stdev', 'feature_group', 'alpha', 'feats_used']) def run_regression(target, features, name,): _, all_test_sc, _ = fit_linear_regression( @@ -46,16 +49,17 @@ def run_regression(target, features, name,): cols=get_feature_list(features, target), target=target, alpha=[0], - tol=TARGET_SETTINGS[target]['tolerance'], + tol=0.4, save_path=f"./figures/r_squared_matrix/{name}/", - save=True, + save=False, multiple_predictions=False ) print(f"Target {target}, Alpha: 0. Feature group: {name}") r_squared = round(all_test_sc["Test r$^2$"].mean(), 3) + std = round(all_test_sc["Test r$^2$"].std(), 3) - return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'alpha': 0, 'feats_used': get_feature_list(features, target)} + return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'stdev': std, 'alpha': 0, 'feats_used': get_feature_list(features, target)} for target, configs in config.items(): for config in configs: @@ -63,9 +67,9 @@ def run_regression(target, features, name,): df = df.append(result, ignore_index=True) df.to_csv(f"./figures/r_squared_matrix/r_squared_results.csv") #%% -print(df.iloc[:, :3]) +print(df.iloc[:, :5]) -# %% +#%% for target in ['duration_BC', 'delta_volume_BC',]: fit_linear_regression( df_track_level_features, @@ -73,8 +77,40 @@ def run_regression(target, features, name,): target=target, alpha=np.arange(0, 15, 0.1, dtype=float), tol=TARGET_SETTINGS[target]['tolerance'], - save_path="./figures/feats_plus_lineage1/", + save_path="./figures/test_tolerance/", save=True ) print(f"Finished {target} with lineage") -#%% \ No newline at end of file + +#%% +# Split the 'feature_group' column into two +df[['start_lifetime', 'intrinsic_extrinsic']] = df['feature_group'].str.split('_', expand=True) +def replace_values(val): + if val in ['all', 'features']: + return 
'all_features' + else: + return val +df[['start_lifetime', 'intrinsic_extrinsic']] = df[['start_lifetime', 'intrinsic_extrinsic']].applymap(replace_values) + +# Then create the pivot table and heatmap as before +for target, df_target in df.groupby('target'): + pivot_df = df_target.pivot(index='start_lifetime', columns='intrinsic_extrinsic', values='r_squared') + + fig, ax = plt.subplots(figsize=(10, 8)) # Create a figure and a set of subplots + + sns.heatmap(pivot_df, annot=True, cmap='coolwarm', ax=ax, vmin=0, vmax=0.5) + + ax.set_xticklabels(['','Extrinsic','Intrinsic']) + ax.xaxis.tick_top() + ax.set_yticklabels(['', 'Average over growth', 'Start of growth'], rotation=0) + + ax.set_xlabel('') + ax.set_ylabel('') + ax.tick_params(axis='both', which='both', length=0) + title = ax.set_title(f'Target: {get_plot_labels_for_metric(target)[1]}', loc='left') + title.set_position([-0.3,1]) + plt.show() + + + +# %% diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index 7fefd31c..f49fa473 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -10,7 +10,6 @@ 'SA_vol_ratio_at_B', 'SA_at_B', 'xy_aspect_at_B', - 'density_at_B', 'sisters_volume_at_B', ], @@ -27,6 +26,7 @@ 'start_extrinsic': [ # extrinsic at start of growth 'time_at_B', 'colony_time_at_B', + # 'density_at_B', 'neighbor_avg_lrm_volume_90um_at_B', 'neighbor_avg_lrm_height_90um_at_B', 'neighbor_avg_lrm_density_90um_at_B', @@ -52,22 +52,20 @@ 'duration_BC': [ 'duration_BC', 'late_growth_rate_by_endpoints', - 'mean_neighbor_avg_dxdt_48_volume_90um', ], 'delta_volume_BC': [ 'volume_at_C', 'delta_volume_BC', 'late_growth_rate_by_endpoints', - 'mean_neighbor_avg_dxdt_48_volume_90um', ] } TARGET_SETTINGS = { 'duration_BC': { - 'tolerance': 0.08, + 'tolerance': 0.04, }, 'delta_volume_BC': { - 'tolerance': 0.08, + 'tolerance': 0.04, 
} } From fb2ada0487259ed400ef13b74dd62cc09768d91e Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 9 Oct 2024 10:09:01 -0700 Subject: [PATCH 38/68] update to new density --- .../lib/preprocessing/global_dataset_filtering.py | 11 ++++++++--- nuc_morph_analysis/lib/visualization/label_tables.py | 3 ++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 9ca54633..a52c9a8b 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -201,7 +201,8 @@ def process_all_tracks(df, dataset, remove_growth_outliers, num_workers): df = add_change_over_time(df) df = add_neighborhood_avg_features.run_script(df, num_workers=num_workers) df = add_neighborhood_avg_features_lrm.run_script(df, num_workers=num_workers, - feature_list=["volume", "height", "density", "xy_aspect", "mesh_sa"], + feature_list=["volume", "height", "density", "xy_aspect", "mesh_sa", "2d_area_nuc_cell_ratio", + '2d_intensity_max_edge', '2d_intensity_mean_edge','2d_intensity_min_edge'], exclude_outliers=False) if dataset == "all_baseline": @@ -266,7 +267,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_lineage_features(df_full, feature_list=['volume_at_B', 'duration_BC', 'volume_at_C', 'delta_volume_BC']) df_full = add_features.add_feature_at(df_full, "frame_transition", 'height', 'height_percentile', pix_size) - df_full = add_features.add_feature_at(df_full, "frame_transition", 'density', 'density', pix_size) + # df_full = add_features.add_feature_at(df_full, "frame_transition", 'density', 'density', pix_size) for feature in ['xy_aspect', 'SA_vol_ratio', 'neighbor_avg_lrm_volume_90um', 'neighbor_avg_lrm_height_90um', 'neighbor_avg_lrm_density_90um','neighbor_avg_lrm_xy_aspect_90um','neighbor_avg_lrm_mesh_sa_90um']: 
df_full = add_features.add_feature_at(df_full, "frame_transition", feature, feature) @@ -282,7 +283,11 @@ def process_full_tracks(df_all, thresh, pix_size, interval): 'neighbor_avg_lrm_height_90um', 'neighbor_avg_lrm_density_90um', 'neighbor_avg_lrm_xy_aspect_90um', - 'neighbor_avg_lrm_mesh_sa_90um'] + 'neighbor_avg_lrm_mesh_sa_90um', + 'neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um', + 'neighbor_avg_lrm_2d_intensity_max_edge_90um', + 'neighbor_avg_lrm_2d_intensity_mean_edge_90um', + 'neighbor_avg_lrm_2d_intensity_min_edge_90um'] multiplier_list = [get_plot_labels_for_metric(x)[0] for x in ft_list] df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) for feat in ft_list: diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index f7868790..15db731a 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -263,12 +263,13 @@ def get_scale_factor_table(dataset="all_baseline"): 'neighbor_avg_lrm_xy_aspect_90um_at_B': "Starting avg. XY aspect ratio in 90 \u00B5m radius", 'neighbor_avg_lrm_mesh_sa_90um_at_B': "Starting avg. surface area in 90 \u00B5m radius", - "mean_neighbor_avg_dxdt_48_volume_90um": "Mean avg. transient growth rate in 90 \u00B5m radius", + "mean_neighbor_avg_dxdt_48_volume_90um": "Avg. mean transient growth rate in 90 \u00B5m radius", 'mean_neighbor_avg_lrm_volume_90um': " Avg. mean volume in 90 \u00B5m radius", 'mean_neighbor_avg_lrm_height_90um': "Avg. mean height in 90 \u00B5m radius", 'mean_neighbor_avg_lrm_density_90um': "Avg. mean density in 90 \u00B5m radius", 'mean_neighbor_avg_lrm_xy_aspect_90um': "Avg. mean XY aspect ratio in 90 \u00B5m radius", 'mean_neighbor_avg_lrm_mesh_sa_90um': "Avg. mean surface area in 90 \u00B5m radius", + 'mean_neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um': "Avg. 
mean density in 90 \u00B5m radius", # mitotic and apoptotic neighbor columns "number_of_frame_of_breakdown_neighbors": "# of neighboring cells undergoing breakdown", From fa2fdb39b6ffea2eaa70e9b2077639e2391cf34a Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 9 Oct 2024 10:09:26 -0700 Subject: [PATCH 39/68] Update workflow to have maximum r squared for feature groups --- .../linear_regression/analysis_plots.py | 132 ++++++++++++++++++ .../chantelle_linear_regression_workflow.py | 111 +++------------ .../linear_regression/select_features.py | 42 +----- 3 files changed, 158 insertions(+), 127 deletions(-) create mode 100644 nuc_morph_analysis/analyses/linear_regression/analysis_plots.py diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py new file mode 100644 index 00000000..6780ad47 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -0,0 +1,132 @@ +import seaborn as sns +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric +from nuc_morph_analysis.lib.visualization.notebook_tools import save_and_show_plot +from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import fit_linear_regression +from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list) + + +def plot_feature_correlations(df_track_level_features, feature_list, figdir): + """ + Plot heatmap of feature correlations. 
+ + Parameters + ---------- + df_track_level_features : pd.DataFrame + DataFrame containing track level features + feature_list : list + List of features to include in the heatmap + Output from get_feature_list + figdir : str + Directory to save the figure + + Returns + ------- + Figure + """ + data = df_track_level_features[feature_list] + + plt.rc('font', size=22) + plt.figure(figsize=(28, 25)) + sns.heatmap(data.corr(), annot=True, fmt=".1f", cmap='BrBG', vmin=-1, vmax=1, cbar_kws={"shrink": 0.5, "pad": 0.02}) + + column_names = [get_plot_labels_for_metric(col)[1] for col in data.columns] + plt.xticks([x + 0.5 for x in range(len(column_names))], column_names) + plt.yticks([y + 0.5 for y in range(len(column_names))], column_names) + plt.tight_layout() + + save_and_show_plot(f'{figdir}/feature_correlation_heatmap') + + +def run_regression(df_track_level_features, target, features, name, alpha): + _, all_test_sc, _ = fit_linear_regression( + df_track_level_features, + cols=get_feature_list(features, target), + target=target, + alpha=alpha, + tol=0.04, + save_path="./figures/test_density/", + save=False, + multiple_predictions=False + ) + print(f"Target {target}, Alpha: {alpha}. 
Feature group: {name}") + r_squared = round(all_test_sc["Test r$^2$"].mean(), 3) + std = round(all_test_sc["Test r$^2$"].std(), 3) + return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'stdev': std, 'alpha': 0, 'feats_used': get_feature_list(features, target)} + +def run_regression_workflow(targets, feature_configs, df_track_level_features, figdir, alpha): + df = pd.DataFrame(columns=['target', 'r_squared', 'stdev', 'feature_group', 'alpha', 'feats_used']) + + for target in targets: + for name, features in feature_configs.items(): + result = run_regression(df_track_level_features, target, features, name, [alpha]) + df = df.append(result, ignore_index=True) + df.to_csv(f"{figdir}/r_squared_results.csv") + + df['num_feats_used'] = df['feats_used'].apply(lambda x: len(x)) + + return df + + +def plot_heatmap(df, figdir): + """ + Plot heatmap of r_squared values for different feature groups. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing r_squared values for different feature groups + figdir : str + Directory to save the figure + + Returns + ------- + Figure + """ + # Split the 'feature_group' column into two + df[['start_lifetime', 'intrinsic_extrinsic']] = df['feature_group'].str.split('_', expand=True) + def replace_values(val): + if val in ['all', 'features']: + return 'all_features' + else: + return val + df[['start_lifetime', 'intrinsic_extrinsic']] = df[['start_lifetime', 'intrinsic_extrinsic']].applymap(replace_values) + for index, row in df.iterrows(): + if row['start_lifetime'] == 'intrinsic': + df.at[index, 'intrinsic_extrinsic'] = 'intrinsic' + df.at[index, 'start_lifetime'] = 'both' + elif row['start_lifetime'] == 'extrinsic': + df.at[index, 'intrinsic_extrinsic'] = 'extrinsic' + df.at[index, 'start_lifetime'] = 'both' + + for target, df_target in df.groupby('target'): + pivot_df = df_target.pivot(index='start_lifetime', columns='intrinsic_extrinsic', values='r_squared') + pivot_df_std = 
df_target.pivot(index='start_lifetime', columns='intrinsic_extrinsic', values='stdev') + + fig, ax = plt.subplots(figsize=(10, 8)) + + sns.heatmap(pivot_df, annot=False, cmap='coolwarm', ax=ax, vmin=0, vmax=0.5) + + for text_x in range(pivot_df.shape[0]): + for text_y in range(pivot_df.shape[1]): + value = pivot_df.iloc[text_x, text_y] + std_dev = pivot_df_std.iloc[text_x, text_y] + if not np.isnan(value): + ax.text(text_y+0.5, text_x+0.5, f'{value:.2f} ± {std_dev:.2f}', + horizontalalignment='center', + verticalalignment='center') + + ax.set_xticklabels(['','Extrinsic','Intrinsic']) + ax.xaxis.tick_top() + ax.set_yticklabels(['', 'Both', 'Lifetime', 'Start of growth'], rotation=0) + + ax.set_xlabel('') + ax.set_ylabel('') + ax.tick_params(axis='both', which='both', length=0) + title = ax.set_title(f'Target: {get_plot_labels_for_metric(target)[1]}', loc='left') + title.set_position([-0.1,1]) + save_and_show_plot(f'{figdir}/{target}_prediction_r_squared_matrix_alpha_{df.alpha[0]}') + + diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index 1da308e0..b508d9c2 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -4,13 +4,12 @@ import pandas as pd from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import fit_linear_regression -from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, - plot_feature_correlations, +from nuc_morph_analysis.analyses.linear_regression.analysis_plots import (run_regression_workflow, plot_feature_correlations, + plot_heatmap) +from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, 
TARGET_SETTINGS) -import seaborn as sns -import matplotlib.pyplot as plt -from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric + pd.options.mode.chained_assignment = None # default='warn' warnings.simplefilter(action="ignore", category=FutureWarning) @@ -19,98 +18,34 @@ df_full = filter_data.all_timepoints_full_tracks(df_all) df_track_level_features = filter_data.track_level_features(df_full) -#%% -feature_list = get_feature_list(['features', 'lineage_feats'], None) -plot_feature_correlations(df_track_level_features, feature_list, "linear_regression/figures") - - -# %% -config = { - 'duration_BC': [ - {'name': 'all_features', 'features': ['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic']}, - {'name': 'start_intrinsic', 'features': ['start_intrinsic']}, - {'name': 'lifetime_intrinsic', 'features': ['lifetime_intrinsic']}, - {'name': 'start_extrinsic', 'features': ['start_extrinsic']}, - {'name': 'lifetime_extrinsic', 'features': ['lifetime_extrinsic']} - ], - 'delta_volume_BC': [ - {'name': 'all_features', 'features': ['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic']}, - {'name': 'start_intrinsic', 'features': ['start_intrinsic']}, - {'name': 'lifetime_intrinsic', 'features': ['lifetime_intrinsic']}, - {'name': 'start_extrinsic', 'features': ['start_extrinsic']}, - {'name': 'lifetime_extrinsic', 'features': ['lifetime_extrinsic']} - ] +#%% +FIGDIR='./figures/' +TARGETS = ['duration_BC', 'delta_volume_BC'] +CONFIG = { + 'all_features': ['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], + 'start_intrinsic': ['start_intrinsic'], + 'lifetime_intrinsic': ['lifetime_intrinsic'], + 'start_extrinsic': ['start_extrinsic'], + 'lifetime_extrinsic': ['lifetime_extrinsic'], + 'intrinsic': ['start_intrinsic', 'lifetime_intrinsic'], + 'extrinsic': ['start_extrinsic', 'lifetime_extrinsic'], } -df = pd.DataFrame(columns=['target', 'r_squared', 
'stdev', 'feature_group', 'alpha', 'feats_used']) - -def run_regression(target, features, name,): - _, all_test_sc, _ = fit_linear_regression( - df_track_level_features, - cols=get_feature_list(features, target), - target=target, - alpha=[0], - tol=0.4, - save_path=f"./figures/r_squared_matrix/{name}/", - save=False, - multiple_predictions=False - ) - print(f"Target {target}, Alpha: 0. Feature group: {name}") - - r_squared = round(all_test_sc["Test r$^2$"].mean(), 3) - std = round(all_test_sc["Test r$^2$"].std(), 3) - - return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'stdev': std, 'alpha': 0, 'feats_used': get_feature_list(features, target)} - -for target, configs in config.items(): - for config in configs: - result = run_regression(target, **config) - df = df.append(result, ignore_index=True) - df.to_csv(f"./figures/r_squared_matrix/r_squared_results.csv") #%% -print(df.iloc[:, :5]) - +df = run_regression_workflow(TARGETS, CONFIG, df_track_level_features, FIGDIR, alpha=0) +plot_heatmap(df, FIGDIR) #%% -for target in ['duration_BC', 'delta_volume_BC',]: +for target in TARGETS: + df = run_regression_workflow([target], CONFIG, df_track_level_features, FIGDIR, alpha=TARGET_SETTINGS[target]['max_alpha']) + plot_heatmap(df, FIGDIR) +#%% +for target in ['duration_BC', 'delta_volume_BC']: fit_linear_regression( df_track_level_features, cols=get_feature_list(['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], target), target=target, alpha=np.arange(0, 15, 0.1, dtype=float), tol=TARGET_SETTINGS[target]['tolerance'], - save_path="./figures/test_tolerance/", + save_path=FIGDIR, save=True ) - print(f"Finished {target} with lineage") - -#%% -# Split the 'feature_group' column into two -df[['start_lifetime', 'intrinsic_extrinsic']] = df['feature_group'].str.split('_', expand=True) -def replace_values(val): - if val in ['all', 'features']: - return 'all_features' - else: - return val -df[['start_lifetime', 
'intrinsic_extrinsic']] = df[['start_lifetime', 'intrinsic_extrinsic']].applymap(replace_values) - -# Then create the pivot table and heatmap as before -for target, df_target in df.groupby('target'): - pivot_df = df_target.pivot(index='start_lifetime', columns='intrinsic_extrinsic', values='r_squared') - - fig, ax = plt.subplots(figsize=(10, 8)) # Create a figure and a set of subplots - - sns.heatmap(pivot_df, annot=True, cmap='coolwarm', ax=ax, vmin=0, vmax=0.5) - - ax.set_xticklabels(['','Extrinsic','Intrinsic']) - ax.xaxis.tick_top() - ax.set_yticklabels(['', 'Average over growth', 'Start of growth'], rotation=0) - - ax.set_xlabel('') - ax.set_ylabel('') - ax.tick_params(axis='both', which='both', length=0) - title = ax.set_title(f'Target: {get_plot_labels_for_metric(target)[1]}', loc='left') - title.set_position([-0.3,1]) - plt.show() - - - # %% diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index f49fa473..4e5b0a37 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -1,8 +1,3 @@ -import seaborn as sns -import matplotlib.pyplot as plt -from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric -from nuc_morph_analysis.lib.visualization.notebook_tools import save_and_show_plot - FEATURE_GROUPS = { 'start_intrinsic': [ #intrinic at start of growth 'volume_at_B', @@ -26,10 +21,8 @@ 'start_extrinsic': [ # extrinsic at start of growth 'time_at_B', 'colony_time_at_B', - # 'density_at_B', 'neighbor_avg_lrm_volume_90um_at_B', 'neighbor_avg_lrm_height_90um_at_B', - 'neighbor_avg_lrm_density_90um_at_B', 'neighbor_avg_lrm_xy_aspect_90um_at_B', 'neighbor_avg_lrm_mesh_sa_90um_at_B', 'early_transient_gr_90um', @@ -40,12 +33,11 @@ 'normalized_sum_has_dying_neighbor', 'mean_neighbor_avg_lrm_volume_90um', 'mean_neighbor_avg_lrm_height_90um', - 
'mean_neighbor_avg_lrm_density_90um', 'mean_neighbor_avg_lrm_xy_aspect_90um', 'mean_neighbor_avg_lrm_mesh_sa_90um', 'mean_neighbor_avg_dxdt_48_volume_90um', + 'mean_neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um', ], - } TARGET_CONTAINTING_FEATS = { @@ -63,9 +55,11 @@ TARGET_SETTINGS = { 'duration_BC': { 'tolerance': 0.04, + 'max_alpha': 0.7, }, 'delta_volume_BC': { 'tolerance': 0.04, + 'max_alpha': 12.3, } } @@ -95,33 +89,3 @@ def get_feature_list(feature_group_list, target): return features -def plot_feature_correlations(df_track_level_features, feature_list, figdir): - """ - Plot heatmap of feature correlations. - - Parameters - ---------- - df_track_level_features : pd.DataFrame - DataFrame containing track level features - feature_list : list - List of features to include in the heatmap - Output from get_feature_list - figdir : str - Directory to save the figure - - Returns - ------- - Figure - """ - data = df_track_level_features[feature_list] - - plt.rc('font', size=22) - plt.figure(figsize=(27, 24)) - sns.heatmap(data.corr(), annot=True, fmt=".1f", cmap='BrBG', vmin=-1, vmax=1, cbar_kws={"shrink": 0.5, "pad": 0.02}) - - column_names = [get_plot_labels_for_metric(col)[1] for col in data.columns] - plt.xticks([x + 0.5 for x in range(len(column_names))], column_names) - plt.yticks([y + 0.5 for y in range(len(column_names))], column_names) - plt.tight_layout() - - save_and_show_plot(f'{figdir}/feature_correlation_heatmap') \ No newline at end of file From c247f19059be288ac3712f7e2639cd760af88738 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 9 Oct 2024 10:38:05 -0700 Subject: [PATCH 40/68] update to use config all feats --- .../chantelle_linear_regression_workflow.py | 5 ++++- .../analyses/linear_regression/select_features.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py 
b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index b508d9c2..27c5a141 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -30,6 +30,9 @@ 'intrinsic': ['start_intrinsic', 'lifetime_intrinsic'], 'extrinsic': ['start_extrinsic', 'lifetime_extrinsic'], } +#%% +plot_feature_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) + #%% df = run_regression_workflow(TARGETS, CONFIG, df_track_level_features, FIGDIR, alpha=0) plot_heatmap(df, FIGDIR) @@ -41,7 +44,7 @@ for target in ['duration_BC', 'delta_volume_BC']: fit_linear_regression( df_track_level_features, - cols=get_feature_list(['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], target), + cols=get_feature_list(CONFIG['all_features'], target), target=target, alpha=np.arange(0, 15, 0.1, dtype=float), tol=TARGET_SETTINGS[target]['tolerance'], diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index 4e5b0a37..7a49f3dc 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -49,6 +49,9 @@ 'volume_at_C', 'delta_volume_BC', 'late_growth_rate_by_endpoints', + ], + None: [ + '', ] } From d2881350e451a53131953f04f4043b5df269501f Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 9 Oct 2024 14:40:32 -0700 Subject: [PATCH 41/68] confirm N is the same for subset cols --- .../linear_regression/analysis_plots.py | 10 ++-- .../chantelle_linear_regression_workflow.py | 55 +++++++++++++++---- .../linear_regression_workflow.py | 1 + ...near_regression_workflow_greedy_removal.py | 26 +++++---- .../linear_regression/select_features.py | 4 +- .../lib/preprocessing/add_features.py | 11 ++-- 
6 files changed, 74 insertions(+), 33 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index 6780ad47..0498dae5 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -40,14 +40,14 @@ def plot_feature_correlations(df_track_level_features, feature_list, figdir): save_and_show_plot(f'{figdir}/feature_correlation_heatmap') -def run_regression(df_track_level_features, target, features, name, alpha): +def run_regression(df_track_level_features, target, features, name, alpha, figdir): _, all_test_sc, _ = fit_linear_regression( df_track_level_features, cols=get_feature_list(features, target), target=target, alpha=alpha, tol=0.04, - save_path="./figures/test_density/", + save_path=figdir, save=False, multiple_predictions=False ) @@ -61,9 +61,9 @@ def run_regression_workflow(targets, feature_configs, df_track_level_features, f for target in targets: for name, features in feature_configs.items(): - result = run_regression(df_track_level_features, target, features, name, [alpha]) + result = run_regression(df_track_level_features, target, features, name, [alpha], figdir) df = df.append(result, ignore_index=True) - df.to_csv(f"{figdir}/r_squared_results.csv") + df.to_csv(f"{figdir}r_squared_results.csv") df['num_feats_used'] = df['feats_used'].apply(lambda x: len(x)) @@ -127,6 +127,6 @@ def replace_values(val): ax.tick_params(axis='both', which='both', length=0) title = ax.set_title(f'Target: {get_plot_labels_for_metric(target)[1]}', loc='left') title.set_position([-0.1,1]) - save_and_show_plot(f'{figdir}/{target}_prediction_r_squared_matrix_alpha_{df.alpha[0]}') + save_and_show_plot(f'{figdir}{target}_prediction_r_squared_matrix_alpha_{df.alpha[0]}') diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py 
b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index 27c5a141..254dd931 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -8,8 +8,7 @@ plot_heatmap) from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, TARGET_SETTINGS) - - +from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow_greedy_removal import main pd.options.mode.chained_assignment = None # default='warn' warnings.simplefilter(action="ignore", category=FutureWarning) @@ -19,7 +18,7 @@ df_track_level_features = filter_data.track_level_features(df_full) #%% -FIGDIR='./figures/' +FIGDIR='linear_regression/figures/' TARGETS = ['duration_BC', 'delta_volume_BC'] CONFIG = { 'all_features': ['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], @@ -30,25 +29,57 @@ 'intrinsic': ['start_intrinsic', 'lifetime_intrinsic'], 'extrinsic': ['start_extrinsic', 'lifetime_extrinsic'], } + #%% plot_feature_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) -#%% -df = run_regression_workflow(TARGETS, CONFIG, df_track_level_features, FIGDIR, alpha=0) +#%% preprocess dataframe to ensure same N for all analysis +dropna_cols = get_feature_list(CONFIG['all_features'], None) +data = df_track_level_features.dropna(subset=dropna_cols) +print(f"Number of tracks: {len(data)}") + +#%% Create maxtrix of r squared values +df = run_regression_workflow(TARGETS, CONFIG, data, FIGDIR, alpha=0) plot_heatmap(df, FIGDIR) -#%% -for target in TARGETS: - df = run_regression_workflow([target], CONFIG, df_track_level_features, FIGDIR, alpha=TARGET_SETTINGS[target]['max_alpha']) - plot_heatmap(df, FIGDIR) -#%% + +#%% Create movie of increasing alpha for target in ['duration_BC', 'delta_volume_BC']: fit_linear_regression( - 
df_track_level_features, + data, cols=get_feature_list(CONFIG['all_features'], target), target=target, - alpha=np.arange(0, 15, 0.1, dtype=float), + alpha=np.arange(0, 15, 0.2, dtype=float), tol=TARGET_SETTINGS[target]['tolerance'], save_path=FIGDIR, save=True ) + +#%% +for target in TARGETS: + df = run_regression_workflow([target], CONFIG, data, FIGDIR, alpha=TARGET_SETTINGS[target]['max_alpha']) + plot_heatmap(df, FIGDIR) +# %% +# for target in TARGETS: +target = 'delta_volume_BC' +main(cols=get_feature_list(CONFIG['all_features'], target), + target=target, + alpha_range=np.arange(0.5, 15, 0.2, dtype=float), + tolerance=TARGET_SETTINGS[target]['tolerance'], + save_path=FIGDIR, + max_iterations=100, + preloaded_dataframe=data) + + +# %% +cols = get_feature_list(CONFIG['all_features'], None) +df_na = df_track_level_features[df_track_level_features[cols].isna().any(axis=1)] +df_test = df_track_level_features.dropna(subset=cols) +# %% +nan_cols = df_na.columns[df_na.isna().any()].tolist() +# %% +overlap = set(nan_cols).intersection(set(cols)) +# %% +for feature in overlap: + nan_count = df_track_level_features[feature].isna().sum() + print(f"{feature}: {nan_count}") # %% diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index 23ee7d80..7aac6339 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -104,6 +104,7 @@ def fit_linear_regression( # drop any nan rows dropna_cols = cols + [target] data = data.dropna(subset=dropna_cols) + print(f"number of tracks: {len(data)}") # permute columns if necessary if len(permute_cols) > 0: diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py index 
aeaa6aa7..b6afafc8 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py @@ -35,6 +35,7 @@ def main( save_path, max_iterations, cached_dataframe=None, + preloaded_dataframe=None, ): save_path = Path(save_path) @@ -42,20 +43,19 @@ def main( save_path.mkdir(parents=True, exist_ok=True) if len(cols) < 1: - cols = get_feature_list(["features"], target) + cols = get_feature_list(['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], target) label_list = [get_plot_labels_for_metric(col)[1] for col in cols] map_dict = {i: j for i, j in zip(cols, label_list)} - - if not cached_dataframe: + + if preloaded_dataframe is not None: + df_track_level_features = preloaded_dataframe + elif cached_dataframe is not None: + df_track_level_features = pd.read_csv(cached_dataframe) + else: df_all = global_dataset_filtering.load_dataset_with_features() df_full = filter_data.all_timepoints_full_tracks(df_all) df_track_level_features = filter_data.track_level_features(df_full) - df_track_level_features.to_csv( - "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" - ) - else: - df_track_level_features = pd.read_csv(cached_dataframe) permute_cols = [] count = 0 @@ -224,7 +224,7 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path, map_di g.savefig(this_path, dpi=300) # save movie of pngs - writer = imageio.get_writer(save_path / "coefficients_over_time.mp4", fps=2) + writer = imageio.get_writer(save_path / f"{target}_greedy_coefficients_over_time.mp4", fps=2) for im in files: writer.append_data(imageio.imread(im)) os.remove(im) @@ -242,7 +242,12 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path, map_di "should match the result of linear_regression_analysis.get_data (see source code for " "details).", ) - + parser.add_argument( + 
"--preloaded_dataframe", + type=pd.DataFrame, + metavar="path", + help="Supply preloaded dataframe to skip data preprocessing.", + ) parser.add_argument( "--cols", type=list_of_strings, @@ -288,4 +293,5 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path, map_di save_path=args.save_path, max_iterations=args.max_iterations, cached_dataframe=args.cached_dataframe, + preloaded_dataframe=args.preloaded_dataframe, ) diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index 7a49f3dc..f935bfd2 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -57,11 +57,11 @@ TARGET_SETTINGS = { 'duration_BC': { - 'tolerance': 0.04, + 'tolerance': 0.05, 'max_alpha': 0.7, }, 'delta_volume_BC': { - 'tolerance': 0.04, + 'tolerance': 0.05, 'max_alpha': 12.3, } } diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 71b2a4b4..b61b3fd7 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -198,7 +198,7 @@ def get_early_transient_gr_of_whole_colony(df, scale, time_shift=25): return df -def get_early_transient_gr_of_neighborhood(df, scale, time_shift=25): +def get_early_transient_gr_of_neighborhood(df, scale, time_shift=24, window_length=6): """ Get the transient growth rate of the colony 2 hours into the growth trajectory. 
@@ -214,6 +214,8 @@ def get_early_transient_gr_of_neighborhood(df, scale, time_shift=25): The dataframe time_shift : int The time shift in frames to calculate the transient growth rate in frames + window_length : int + The length of the time window in frames Returns ------- @@ -222,10 +224,11 @@ def get_early_transient_gr_of_neighborhood(df, scale, time_shift=25): """ for tid, dft in df.groupby("track_id"): t_calculate = dft.index_sequence.min() + time_shift - transient_gr_whole_colony = df.loc[df.index_sequence == t_calculate, "neighbor_avg_dxdt_48_volume_90um"].values[0] + time_window_mask = df.index_sequence.between(t_calculate, t_calculate + window_length) + transient_gr_whole_colony = df.loc[time_window_mask, "neighbor_avg_dxdt_48_volume_90um"].mean() df.loc[df.track_id == tid, "early_transient_gr_90um"] = transient_gr_whole_colony * scale - - return df + + return df def add_std_feature_over_trajectory(df, feature_list, multiplier_list): """ From 61f8f211e25e0935d698de46654f061f4bc340f9 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 14 Oct 2024 11:31:41 -0700 Subject: [PATCH 42/68] update workflow --- .../chantelle_linear_regression_workflow.py | 36 +++++-------------- .../linear_regression_workflow.py | 5 +-- .../preprocessing/global_dataset_filtering.py | 15 +++----- 3 files changed, 15 insertions(+), 41 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index 254dd931..ae15d6d1 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -54,32 +54,12 @@ save=True ) -#%% +# %% Greedy removal for target in TARGETS: - df = run_regression_workflow([target], CONFIG, data, FIGDIR, alpha=TARGET_SETTINGS[target]['max_alpha']) - plot_heatmap(df, FIGDIR) -# %% -# for target 
in TARGETS: -target = 'delta_volume_BC' -main(cols=get_feature_list(CONFIG['all_features'], target), - target=target, - alpha_range=np.arange(0.5, 15, 0.2, dtype=float), - tolerance=TARGET_SETTINGS[target]['tolerance'], - save_path=FIGDIR, - max_iterations=100, - preloaded_dataframe=data) - - -# %% -cols = get_feature_list(CONFIG['all_features'], None) -df_na = df_track_level_features[df_track_level_features[cols].isna().any(axis=1)] -df_test = df_track_level_features.dropna(subset=cols) -# %% -nan_cols = df_na.columns[df_na.isna().any()].tolist() -# %% -overlap = set(nan_cols).intersection(set(cols)) -# %% -for feature in overlap: - nan_count = df_track_level_features[feature].isna().sum() - print(f"{feature}: {nan_count}") -# %% + main(cols=get_feature_list(CONFIG['all_features'], target), + target=target, + alpha_range=np.arange(0.5, 15, 0.2, dtype=float), + tolerance=TARGET_SETTINGS[target]['tolerance'], + save_path=FIGDIR, + max_iterations=100, + preloaded_dataframe=data) \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index 7aac6339..a3e1c113 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -142,8 +142,9 @@ def fit_linear_regression( rounded_permutation_score = round(score, 2) if alpha_ind == 0: max_val = rounded_permutation_score - if abs(rounded_permutation_score - max_val) > tol or (pvalue > 0.05): - break + if multiple_predictions: + if abs(rounded_permutation_score - max_val) > tol or (pvalue > 0.05): + break # if relatively equal to linear regression value, then continue # save permutation score and p_value to dictionary diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 
a52c9a8b..eff68fff 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -201,8 +201,7 @@ def process_all_tracks(df, dataset, remove_growth_outliers, num_workers): df = add_change_over_time(df) df = add_neighborhood_avg_features.run_script(df, num_workers=num_workers) df = add_neighborhood_avg_features_lrm.run_script(df, num_workers=num_workers, - feature_list=["volume", "height", "density", "xy_aspect", "mesh_sa", "2d_area_nuc_cell_ratio", - '2d_intensity_max_edge', '2d_intensity_mean_edge','2d_intensity_min_edge'], + feature_list=["volume", "height", "xy_aspect", "mesh_sa", "2d_area_nuc_cell_ratio"], exclude_outliers=False) if dataset == "all_baseline": @@ -269,25 +268,19 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_feature_at(df_full, "frame_transition", 'height', 'height_percentile', pix_size) # df_full = add_features.add_feature_at(df_full, "frame_transition", 'density', 'density', pix_size) for feature in ['xy_aspect', 'SA_vol_ratio', 'neighbor_avg_lrm_volume_90um', 'neighbor_avg_lrm_height_90um', - 'neighbor_avg_lrm_density_90um','neighbor_avg_lrm_xy_aspect_90um','neighbor_avg_lrm_mesh_sa_90um']: + 'neighbor_avg_lrm_xy_aspect_90um','neighbor_avg_lrm_mesh_sa_90um']: df_full = add_features.add_feature_at(df_full, "frame_transition", feature, feature) - df_full = add_features.get_early_transient_gr_of_whole_colony(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_whole_colony')[0]) df_full = add_features.get_early_transient_gr_of_neighborhood(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_90um')[0]) df_full = add_features.sum_mitotic_events_along_full_track(df_full) df_full = add_features.normalize_sum_events(df_full, ['sum_has_mitotic_neighbor', 'sum_has_dying_neighbor']) - ft_list = ['neighbor_avg_dxdt_48_volume_whole_colony', - 'neighbor_avg_dxdt_48_volume_90um', + ft_list 
= ['neighbor_avg_dxdt_48_volume_90um', 'neighbor_avg_lrm_volume_90um', 'neighbor_avg_lrm_height_90um', - 'neighbor_avg_lrm_density_90um', 'neighbor_avg_lrm_xy_aspect_90um', 'neighbor_avg_lrm_mesh_sa_90um', - 'neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um', - 'neighbor_avg_lrm_2d_intensity_max_edge_90um', - 'neighbor_avg_lrm_2d_intensity_mean_edge_90um', - 'neighbor_avg_lrm_2d_intensity_min_edge_90um'] + 'neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um',] multiplier_list = [get_plot_labels_for_metric(x)[0] for x in ft_list] df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) for feat in ft_list: From 016722b5e7fbd56aa16dc207edec28afa6d8b6ca Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 14 Oct 2024 14:35:00 -0700 Subject: [PATCH 43/68] Update main function to avoid error --- .../analyses/linear_regression/linear_regression_workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py index a3e1c113..825e49b3 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py @@ -327,5 +327,5 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): tolerance=args.tolerance, save_path=args.save_path, cached_dataframe=args.cached_dataframe, - save=args.save, + save_movie=args.save, ) From f82e19a4b045ad462cbd5c079097d34a8d11886f Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 21 Oct 2024 14:35:33 -0700 Subject: [PATCH 44/68] change to linear color map for heatmapt --- .../analyses/linear_regression/analysis_plots.py | 12 +++++++++--- .../chantelle_linear_regression_workflow.py | 5 ++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py 
b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index 0498dae5..ddbe664c 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -70,7 +70,7 @@ def run_regression_workflow(targets, feature_configs, df_track_level_features, f return df -def plot_heatmap(df, figdir): +def plot_heatmap(df, figdir, cmap='coolwarm'): """ Plot heatmap of r_squared values for different feature groups. @@ -80,6 +80,8 @@ def plot_heatmap(df, figdir): DataFrame containing r_squared values for different feature groups figdir : str Directory to save the figure + cmap: str + linear colormap Returns ------- @@ -107,16 +109,20 @@ def replace_values(val): fig, ax = plt.subplots(figsize=(10, 8)) - sns.heatmap(pivot_df, annot=False, cmap='coolwarm', ax=ax, vmin=0, vmax=0.5) + sns.heatmap(pivot_df, annot=False, cmap=cmap, ax=ax, vmin=0, vmax=0.5) + first_element = True for text_x in range(pivot_df.shape[0]): for text_y in range(pivot_df.shape[1]): value = pivot_df.iloc[text_x, text_y] std_dev = pivot_df_std.iloc[text_x, text_y] if not np.isnan(value): + color = 'white' if first_element else 'black' ax.text(text_y+0.5, text_x+0.5, f'{value:.2f} ± {std_dev:.2f}', horizontalalignment='center', - verticalalignment='center') + verticalalignment='center', + color=color) + first_element = False ax.set_xticklabels(['','Extrinsic','Intrinsic']) ax.xaxis.tick_top() diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index ae15d6d1..5e74d422 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -40,7 +40,10 @@ #%% Create maxtrix of r squared values df = run_regression_workflow(TARGETS, CONFIG, data, FIGDIR, alpha=0) 
-plot_heatmap(df, FIGDIR) +#%% +for cmap in ['YlOrRd','OrRd', 'coolwarm']: + print(cmap) + plot_heatmap(df, FIGDIR, cmap) #%% Create movie of increasing alpha for target in ['duration_BC', 'delta_volume_BC']: From 13f5250bda40bcf420f43cf76b3cb179169246b1 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 21 Oct 2024 15:30:29 -0700 Subject: [PATCH 45/68] add clustering to correlation heatmap --- .../linear_regression/analysis_plots.py | 41 +++++++++++++++++++ .../chantelle_linear_regression_workflow.py | 4 +- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index ddbe664c..901cf4a1 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -39,6 +39,47 @@ def plot_feature_correlations(df_track_level_features, feature_list, figdir): save_and_show_plot(f'{figdir}/feature_correlation_heatmap') +def plot_feature_cluster_correlations(df_track_level_features, feature_list, figdir): + """ + Plot clustermap of feature correlations. 
+ + Parameters + ---------- + df_track_level_features : pd.DataFrame + DataFrame containing track level features + feature_list : list + List of features to include in the clustermap + Output from get_feature_list + figdir : str + Directory to save the figure + + Returns + ------- + Figure + """ + data = df_track_level_features[feature_list] + + cluster_grid = sns.clustermap(data.corr(), vmin=-1, vmax=1, cmap='vlag', + cbar_pos=(0.45, 0.5, 0.18, 0.02), + cbar_kws={"orientation": "horizontal"}) + + # Get the reordered labels using the dendrogram information + reordered_column_index = cluster_grid.dendrogram_col.reordered_ind + + reordered_labels = [get_plot_labels_for_metric(data.columns[i])[1] for i in reordered_column_index] + + # Ensure the number of labels matches the number of ticks + cluster_grid.ax_heatmap.set_xticks(range(len(reordered_labels))) + cluster_grid.ax_heatmap.set_xticklabels(reordered_labels, rotation=90) + cluster_grid.ax_heatmap.set_yticks(range(len(reordered_labels))) + cluster_grid.ax_heatmap.set_yticklabels(reordered_labels, rotation=0) + + # Adjust the padding between the labels and the heatmap + cluster_grid.ax_heatmap.tick_params(axis='x', labelsize=8, width=0.7) + cluster_grid.ax_heatmap.tick_params(axis='y', labelsize=8, width=0.7) + + save_and_show_plot(f'{figdir}/feature_correlation_clustermap', figure=cluster_grid.fig, dpi=300) + def run_regression(df_track_level_features, target, features, name, alpha, figdir): _, all_test_sc, _ = fit_linear_regression( diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index 5e74d422..9ddd6397 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -5,7 +5,7 @@ from nuc_morph_analysis.lib.preprocessing import 
global_dataset_filtering, filter_data from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import fit_linear_regression from nuc_morph_analysis.analyses.linear_regression.analysis_plots import (run_regression_workflow, plot_feature_correlations, - plot_heatmap) + plot_feature_cluster_correlations, plot_heatmap) from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, TARGET_SETTINGS) from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow_greedy_removal import main @@ -32,6 +32,8 @@ #%% plot_feature_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) +#%% +plot_feature_cluster_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) #%% preprocess dataframe to ensure same N for all analysis dropna_cols = get_feature_list(CONFIG['all_features'], None) From a8be049fcedc6dfbe4b0c1732db3fc6ce37ec15a Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 21 Oct 2024 15:44:27 -0700 Subject: [PATCH 46/68] add annotations to cluster plot --- .../analyses/linear_regression/analysis_plots.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index 901cf4a1..6b939a63 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -60,8 +60,10 @@ def plot_feature_cluster_correlations(df_track_level_features, feature_list, fig data = df_track_level_features[feature_list] cluster_grid = sns.clustermap(data.corr(), vmin=-1, vmax=1, cmap='vlag', - cbar_pos=(0.45, 0.5, 0.18, 0.02), - cbar_kws={"orientation": "horizontal"}) + cbar_pos=(0.7, 0.13, 0.18, 0.02), + cbar_kws={"orientation": "horizontal"}, + annot=True, fmt=".1f", annot_kws={"size": 12}, + figsize=(20, 20)) # Get 
the reordered labels using the dendrogram information reordered_column_index = cluster_grid.dendrogram_col.reordered_ind @@ -69,14 +71,14 @@ def plot_feature_cluster_correlations(df_track_level_features, feature_list, fig reordered_labels = [get_plot_labels_for_metric(data.columns[i])[1] for i in reordered_column_index] # Ensure the number of labels matches the number of ticks - cluster_grid.ax_heatmap.set_xticks(range(len(reordered_labels))) + cluster_grid.ax_heatmap.set_xticks([x + 0.5 for x in range(len(reordered_labels))]) cluster_grid.ax_heatmap.set_xticklabels(reordered_labels, rotation=90) - cluster_grid.ax_heatmap.set_yticks(range(len(reordered_labels))) + cluster_grid.ax_heatmap.set_yticks([y + 0.5 for y in range(len(reordered_labels))]) cluster_grid.ax_heatmap.set_yticklabels(reordered_labels, rotation=0) # Adjust the padding between the labels and the heatmap - cluster_grid.ax_heatmap.tick_params(axis='x', labelsize=8, width=0.7) - cluster_grid.ax_heatmap.tick_params(axis='y', labelsize=8, width=0.7) + cluster_grid.ax_heatmap.tick_params(axis='x', labelsize=12, width=0.7) + cluster_grid.ax_heatmap.tick_params(axis='y', labelsize=12, width=0.7) save_and_show_plot(f'{figdir}/feature_correlation_clustermap', figure=cluster_grid.fig, dpi=300) From 4abb45b02b751920126e387fe03c1d7fb0887881 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 21 Oct 2024 17:14:13 -0700 Subject: [PATCH 47/68] update colormaps --- .../analyses/linear_regression/analysis_plots.py | 4 ++-- .../linear_regression/chantelle_linear_regression_workflow.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index 6b939a63..788c47d8 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -59,7 +59,7 @@ def 
plot_feature_cluster_correlations(df_track_level_features, feature_list, fig """ data = df_track_level_features[feature_list] - cluster_grid = sns.clustermap(data.corr(), vmin=-1, vmax=1, cmap='vlag', + cluster_grid = sns.clustermap(data.corr(), vmin=-1, vmax=1, cmap='BrBG', #'vlag' red to blue cbar_pos=(0.7, 0.13, 0.18, 0.02), cbar_kws={"orientation": "horizontal"}, annot=True, fmt=".1f", annot_kws={"size": 12}, @@ -164,7 +164,7 @@ def replace_values(val): ax.text(text_y+0.5, text_x+0.5, f'{value:.2f} ± {std_dev:.2f}', horizontalalignment='center', verticalalignment='center', - color=color) + color=color, fontsize=16) first_element = False ax.set_xticklabels(['','Extrinsic','Intrinsic']) diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index 9ddd6397..9dca45f3 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -43,7 +43,8 @@ #%% Create maxtrix of r squared values df = run_regression_workflow(TARGETS, CONFIG, data, FIGDIR, alpha=0) #%% -for cmap in ['YlOrRd','OrRd', 'coolwarm']: +#'YlOrRd','OrRd' +for cmap in ['YlOrRd']: print(cmap) plot_heatmap(df, FIGDIR, cmap) From bea98f16806a49602a71218e54ee4808776be8d7 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 25 Oct 2024 14:42:59 -0700 Subject: [PATCH 48/68] update workflow --- .../linear_regression/analysis_plots.py | 99 +++++++++++++++++-- .../chantelle_linear_regression_workflow.py | 66 ++++++------- .../linear_regression/select_features.py | 6 +- 3 files changed, 126 insertions(+), 45 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index 788c47d8..8d65b10b 100644 --- 
a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -60,25 +60,35 @@ def plot_feature_cluster_correlations(df_track_level_features, feature_list, fig data = df_track_level_features[feature_list] cluster_grid = sns.clustermap(data.corr(), vmin=-1, vmax=1, cmap='BrBG', #'vlag' red to blue - cbar_pos=(0.7, 0.13, 0.18, 0.02), - cbar_kws={"orientation": "horizontal"}, - annot=True, fmt=".1f", annot_kws={"size": 12}, - figsize=(20, 20)) + cbar_pos=(0.4, 0.9, 0.3, 0.02), + cbar_kws={"orientation": "horizontal"}, + annot=True, fmt=".1f", annot_kws={"size": 12}, + figsize=(18, 18)) + + + # Hide the dendrograms + cluster_grid.ax_row_dendrogram.set_visible(False) + cluster_grid.ax_col_dendrogram.set_visible(False) # Get the reordered labels using the dendrogram information reordered_column_index = cluster_grid.dendrogram_col.reordered_ind - reordered_labels = [get_plot_labels_for_metric(data.columns[i])[1] for i in reordered_column_index] + + # add a number to the end of each label + reordered_labels = [f'{label} ({i+1})' for i, label in enumerate(reordered_labels)] + + # for the bottom label just use numbers + numbered_labels = [f'{i+1}' for i in range(len(reordered_labels))] # Ensure the number of labels matches the number of ticks - cluster_grid.ax_heatmap.set_xticks([x + 0.5 for x in range(len(reordered_labels))]) - cluster_grid.ax_heatmap.set_xticklabels(reordered_labels, rotation=90) + cluster_grid.ax_heatmap.set_xticks([x + 0.5 for x in range(len(numbered_labels))]) + cluster_grid.ax_heatmap.set_xticklabels(numbered_labels, rotation=0) cluster_grid.ax_heatmap.set_yticks([y + 0.5 for y in range(len(reordered_labels))]) cluster_grid.ax_heatmap.set_yticklabels(reordered_labels, rotation=0) # Adjust the padding between the labels and the heatmap cluster_grid.ax_heatmap.tick_params(axis='x', labelsize=12, width=0.7) - cluster_grid.ax_heatmap.tick_params(axis='y', labelsize=12, width=0.7) 
+ cluster_grid.ax_heatmap.tick_params(axis='y', labelsize=12, width=0.7, labelright=False, labelleft=True, left=True, right=False) save_and_show_plot(f'{figdir}/feature_correlation_clustermap', figure=cluster_grid.fig, dpi=300) @@ -179,3 +189,76 @@ def replace_values(val): save_and_show_plot(f'{figdir}{target}_prediction_r_squared_matrix_alpha_{df.alpha[0]}') +def plot_feature_contribution(coef_alpha, test_sc, perms, target, fig_height, figdir): + """ + For a given target, plot feature importance for each feature in the linear model at a specified alpha. + Features that touch 0 are considered not important and are excluded from the plot. + + Paramaters + ------- + coef_alpha: pd.DataFrame + DataFrame containing the coefficient importance for each feature + test_sc: pd.DataFrame + DataFrame containing the test r2 scores + perms: pd.DataFrame + DataFrame containing the permutation test results + target: str + Prediction feature + fig_height: int + Height of the figure based on number of important features + save_path: str + Path to save the plot + + Returns + ------- + Figure + """ + + alpha = coef_alpha["alpha"].unique()[0] + p_value = round(perms["p_value"].item(), 3) + test_r2_mean = round(test_sc["Test r$^2$"].mean(), 2) + test_r2_std = round(test_sc["Test r$^2$"].std() / 2, 2) + + for col, df_col in coef_alpha.groupby("Column"): + lower_bound = df_col["Coefficient Importance"].mean() - df_col["Coefficient Importance"].std() + upper_bound = df_col["Coefficient Importance"].mean() + df_col["Coefficient Importance"].std() + if lower_bound < 0 and upper_bound > 0 or df_col["Coefficient Importance"].mean() == 0: + coef_alpha = coef_alpha[coef_alpha["Column"] != col] + + coef_alpha['Magnitude coefficient importance'] = abs(coef_alpha['Coefficient Importance']) + coef_alpha['Sign'] = coef_alpha['Coefficient Importance'].apply(lambda x: 'Positive coefficient' if x > 0 else 'Negative coefficient') + + coef_alpha['Mean Magnitude'] = coef_alpha.groupby('Column')['Magnitude 
coefficient importance'].transform('mean') + coef_alpha = coef_alpha.sort_values('Mean Magnitude', ascending=False).drop(columns=['Mean Magnitude']) + + plt.figure(figsize=(4,fig_height*.5)) + ax = sns.barplot( + data=coef_alpha, + y="Column", + x="Magnitude coefficient importance", + hue="Sign", + palette={'Positive coefficient': '#156082', 'Negative coefficient': 'grey'}, + errorbar="sd", + width=0.7, + native_scale=True) + + for patch in ax.patches: + patch.set_edgecolor('black') + patch.set_linewidth(1.5) + current_height = patch.get_height() + desired_height = 0.7 + patch.set_height(desired_height) + patch.set_y(patch.get_y() + (current_height - desired_height) * 0.5) + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + + plt.ylabel("") + plt.legend(frameon=False, bbox_to_anchor=(1.05, 1), loc='upper left') + if target == 'delta_volume_BC': + ax.get_legend().remove() + plt.title(f"Target: {get_plot_labels_for_metric(target)[1]}, alpha={alpha}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}") + label_list = [get_plot_labels_for_metric(col)[1] for col in coef_alpha["Column"].unique()] + + plt.yticks(ticks=range(len(label_list)), labels=label_list) + save_and_show_plot(f'{figdir}/coefficients_{target}_alpha_{alpha}', dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py index 9dca45f3..06124280 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py @@ -4,11 +4,13 @@ import pandas as pd from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import fit_linear_regression -from 
nuc_morph_analysis.analyses.linear_regression.analysis_plots import (run_regression_workflow, plot_feature_correlations, - plot_feature_cluster_correlations, plot_heatmap) +from nuc_morph_analysis.analyses.linear_regression.analysis_plots import (run_regression_workflow, + plot_feature_cluster_correlations, + plot_heatmap, + plot_feature_contribution) from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, TARGET_SETTINGS) -from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow_greedy_removal import main + pd.options.mode.chained_assignment = None # default='warn' warnings.simplefilter(action="ignore", category=FutureWarning) @@ -17,7 +19,6 @@ df_full = filter_data.all_timepoints_full_tracks(df_all) df_track_level_features = filter_data.track_level_features(df_full) -#%% FIGDIR='linear_regression/figures/' TARGETS = ['duration_BC', 'delta_volume_BC'] CONFIG = { @@ -29,43 +30,38 @@ 'intrinsic': ['start_intrinsic', 'lifetime_intrinsic'], 'extrinsic': ['start_extrinsic', 'lifetime_extrinsic'], } +EXTENDED_WORKFLOW = False -#%% -plot_feature_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) -#%% -plot_feature_cluster_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) -#%% preprocess dataframe to ensure same N for all analysis +#%% Preprocess dataframe to ensure same N for all analysis (lineage features are being used) dropna_cols = get_feature_list(CONFIG['all_features'], None) data = df_track_level_features.dropna(subset=dropna_cols) print(f"Number of tracks: {len(data)}") #%% Create maxtrix of r squared values df = run_regression_workflow(TARGETS, CONFIG, data, FIGDIR, alpha=0) -#%% -#'YlOrRd','OrRd' -for cmap in ['YlOrRd']: - print(cmap) - plot_heatmap(df, FIGDIR, cmap) +plot_heatmap(df, FIGDIR, 'YlOrRd') + +#%% Plot feature importance +for target in ['duration_BC', 'delta_volume_BC']: + df_alpha, df_test, df_coeff = 
fit_linear_regression(data, + cols=get_feature_list(CONFIG['all_features'], target), + target=target, alpha=[TARGET_SETTINGS[target]['max_alpha']], + tol=TARGET_SETTINGS[target]['tolerance'], save_path=FIGDIR, save=False) + + plot_feature_contribution(df_alpha, df_test, df_coeff, target, TARGET_SETTINGS[target]['fig_height'], FIGDIR) + +#%% Plot feature correlations +plot_feature_cluster_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) -#%% Create movie of increasing alpha -for target in ['duration_BC', 'delta_volume_BC']: - fit_linear_regression( - data, - cols=get_feature_list(CONFIG['all_features'], target), - target=target, - alpha=np.arange(0, 15, 0.2, dtype=float), - tol=TARGET_SETTINGS[target]['tolerance'], - save_path=FIGDIR, - save=True - ) - -# %% Greedy removal -for target in TARGETS: - main(cols=get_feature_list(CONFIG['all_features'], target), - target=target, - alpha_range=np.arange(0.5, 15, 0.2, dtype=float), - tolerance=TARGET_SETTINGS[target]['tolerance'], - save_path=FIGDIR, - max_iterations=100, - preloaded_dataframe=data) \ No newline at end of file +#%% Create movie of increasing alpha until tolerance of 0.05 is reached +if EXTENDED_WORKFLOW: + for target in ['duration_BC', 'delta_volume_BC']: + fit_linear_regression( + data, + cols=get_feature_list(CONFIG['all_features'], target), + target=target, + alpha=np.arange(0, 15, 0.2, dtype=float), + tol=TARGET_SETTINGS[target]['tolerance'], + save_path=FIGDIR, + save=True) \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index f935bfd2..aa6f4a6a 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -58,11 +58,13 @@ TARGET_SETTINGS = { 'duration_BC': { 'tolerance': 0.05, - 'max_alpha': 0.7, + 'max_alpha': 0.8, + 'fig_height': 7, }, 
'delta_volume_BC': { 'tolerance': 0.05, - 'max_alpha': 12.3, + 'max_alpha': 11.6, + 'fig_height': 2, } } From c840327a53b93f233cf91cd81392605918791396 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 25 Oct 2024 15:11:22 -0700 Subject: [PATCH 49/68] organize and document --- .../linear_regression/analysis_plots.py | 63 ++++++++++++++----- ...py => supplemental_lrm_figure_workflow.py} | 8 +-- 2 files changed, 51 insertions(+), 20 deletions(-) rename nuc_morph_analysis/analyses/linear_regression/{chantelle_linear_regression_workflow.py => supplemental_lrm_figure_workflow.py} (94%) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index 8d65b10b..f212fea0 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -92,24 +92,57 @@ def plot_feature_cluster_correlations(df_track_level_features, feature_list, fig save_and_show_plot(f'{figdir}/feature_correlation_clustermap', figure=cluster_grid.fig, dpi=300) - + def run_regression(df_track_level_features, target, features, name, alpha, figdir): - _, all_test_sc, _ = fit_linear_regression( - df_track_level_features, - cols=get_feature_list(features, target), - target=target, - alpha=alpha, - tol=0.04, - save_path=figdir, - save=False, - multiple_predictions=False - ) - print(f"Target {target}, Alpha: {alpha}. Feature group: {name}") - r_squared = round(all_test_sc["Test r$^2$"].mean(), 3) - std = round(all_test_sc["Test r$^2$"].std(), 3) - return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'stdev': std, 'alpha': 0, 'feats_used': get_feature_list(features, target)} + """ + Run linear regression on the given dataset and return the results. + + Parameters: + ---------- + df_track_level_features (pd.DataFrame): DataFrame containing the track level features. + target (str): The target variable for regression. 
+ features (list): List of features to be used for regression. + name (str): Name of the feature group. + alpha (list): List of alpha values for regularization. + figdir (str): Directory path to save the figures. + + Returns: + -------- + dict: A dictionary containing the target, feature group name, mean R-squared value, + standard deviation of R-squared values, alpha value, and the features used. + """ + _, all_test_sc, _ = fit_linear_regression( + df_track_level_features, + cols=get_feature_list(features, target), + target=target, + alpha=alpha, + tol=0.04, + save_path=figdir, + save=False, + multiple_predictions=False + ) + print(f"Target {target}, Alpha: {alpha}. Feature group: {name}") + r_squared = round(all_test_sc["Test r$^2$"].mean(), 3) + std = round(all_test_sc["Test r$^2$"].std(), 3) + return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'stdev': std, 'alpha': 0, 'feats_used': get_feature_list(features, target)} def run_regression_workflow(targets, feature_configs, df_track_level_features, figdir, alpha): + """ + Run the regression workflow for multiple targets and feature configurations. + + Parameters: + ---------- + targets (list): List of target variables for regression. + feature_configs (dict): Dictionary where keys are feature group names and values are lists of features. + df_track_level_features (pd.DataFrame): DataFrame containing the track level features. + figdir (str): Directory path to save the figures and results. + alpha (float): Alpha value for regularization. + + Returns: + -------- + pd.DataFrame: DataFrame containing the results of the regression workflow, including target, + R-squared value, standard deviation, feature group, alpha value, and features used. 
+ """ df = pd.DataFrame(columns=['target', 'r_squared', 'stdev', 'feature_group', 'alpha', 'feats_used']) for target in targets: diff --git a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py similarity index 94% rename from nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py rename to nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py index 06124280..54e85156 100644 --- a/nuc_morph_analysis/analyses/linear_regression/chantelle_linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py @@ -11,9 +11,6 @@ from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, TARGET_SETTINGS) -pd.options.mode.chained_assignment = None # default='warn' -warnings.simplefilter(action="ignore", category=FutureWarning) - #%% df_all = global_dataset_filtering.load_dataset_with_features() df_full = filter_data.all_timepoints_full_tracks(df_all) @@ -63,5 +60,6 @@ target=target, alpha=np.arange(0, 15, 0.2, dtype=float), tol=TARGET_SETTINGS[target]['tolerance'], - save_path=FIGDIR, - save=True) \ No newline at end of file + save_path="figures/", + save=True) +# %% From 1e19750a52e1c5790fe004b0210ebd2c16fdaec3 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 25 Oct 2024 15:16:46 -0700 Subject: [PATCH 50/68] remove unused code --- .../linear_regression_workflow.py | 331 ------------------ ...near_regression_workflow_greedy_removal.py | 297 ---------------- .../linear_regression/scripts/added_volume.sh | 5 - .../scripts/added_volume_sisters.sh | 5 - .../linear_regression/scripts/duration_BC.sh | 5 - .../scripts/duration_BC_sisters.sh | 5 - .../scripts/ending_volume.sh | 5 - .../scripts/ending_volume_sisters.sh | 5 - .../scripts_greedy_removal/added_volume.sh | 5 - .../scripts_greedy_removal/duration_BC.sh | 
5 - .../duration_BC_debug copy.sh | 7 - .../duration_BC_debug.sh | 7 - .../scripts_greedy_removal/ending_volume.sh | 5 - 13 files changed, 687 deletions(-) delete mode 100644 nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py delete mode 100644 nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts/added_volume_sisters.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC_sisters.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume_sisters.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/added_volume.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug copy.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug.sh delete mode 100755 nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/ending_volume.sh diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py deleted file mode 100644 index 825e49b3..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ /dev/null @@ -1,331 +0,0 @@ -import argparse -import os -import warnings -from pathlib import Path -import numpy as np -import pandas as pd -import seaborn as sns -from sklearn import linear_model -from sklearn.model_selection import ( - 
RepeatedKFold, - cross_validate, -) -from tqdm import tqdm -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data -from sklearn.model_selection import permutation_test_score -from nuc_morph_analysis.lib.visualization.plotting_tools import ( - get_plot_labels_for_metric, -) -import imageio -from nuc_morph_analysis.analyses.linear_regression.select_features import ( - get_feature_list, -) -from nuc_morph_analysis.analyses.linear_regression.utils import ( - list_of_strings, - list_of_floats, -) - -pd.options.mode.chained_assignment = None # default='warn' - -warnings.simplefilter(action="ignore", category=FutureWarning) - - -def main( - cols, - target, - alpha_range, - tolerance, - save_path, - cached_dataframe=None, - save_movie=False, -): - - save_path = Path(save_path) - save_path = save_path / Path("linear_regression") - save_path.mkdir(parents=True, exist_ok=True) - - if len(cols) < 1: - cols = get_feature_list(["features", "lineage_feats"], None) - - if not cached_dataframe: - df_all = global_dataset_filtering.load_dataset_with_features() - df_full = filter_data.all_timepoints_full_tracks(df_all) - df_track_level_features = filter_data.track_level_features(df_full) - else: - df_track_level_features = pd.read_csv(cached_dataframe) - - fit_linear_regression( - df_track_level_features, - cols, - target, - alpha_range, - tolerance, - save_path, - save_movie, - ) - - -def fit_linear_regression( - data, cols, target, alpha, tol, save_path, save, permute_cols=[], multiple_predictions=True -): - """ - data - track level features - cols - input features - target - target to predict - alpha - hyperparameter for lasso - tol - tolerance to check drop in r^2 for finding best alpha (ex. 
0.02) - save_path - location to save files - save - whether to save movies and pngs - permute_col - list of features to permute and replace with noise - """ - sns.set_context("talk") - random_state = 2652124 - - # init empty dicts and lists - all_test_sc = [] - all_coef_alpha = [] - all_perms = { - "score": [], - "perm_score_mean": [], - "perm_score_std": [], - "p_value": [], - "alpha": [], - } - - if multiple_predictions: - # remove 0 alpha due to convergence errors - alpha = [i for i in alpha if i != 0] - alpha = [round(i, 1) for i in alpha] - - # find best alpha for Lasso model - for alpha_ind, this_alpha in tqdm(enumerate(alpha), total=len(alpha)): - # drop any nan rows - dropna_cols = cols + [target] - data = data.dropna(subset=dropna_cols) - print(f"number of tracks: {len(data)}") - - # permute columns if necessary - if len(permute_cols) > 0: - for col in permute_cols: - mu, sigma = 0, 1 - noise = np.random.normal(mu, sigma, len(data)) - data[col] = noise - - # make numpy array for inputs and target - all_input = data[cols].reset_index(drop=True).values - all_target = data[target].values - - if this_alpha == 0: - # linear regression if alpha == 0 - clf = linear_model.LinearRegression() - else: - clf = linear_model.Lasso(alpha=this_alpha) - - # normalize input features - model = make_pipeline(StandardScaler(), clf) - - # run permutation test - score, permutation_scores, pvalue = permutation_test_score( - model, - all_input, - all_target, - random_state=random_state, - cv=5, - n_permutations=500, - ) - - # break if permutation score is less than linear regression value (max possible) - # with a tolerance - # or if p_value > 0.05 - rounded_permutation_score = round(score, 2) - if alpha_ind == 0: - max_val = rounded_permutation_score - if multiple_predictions: - if abs(rounded_permutation_score - max_val) > tol or (pvalue > 0.05): - break - - # if relatively equal to linear regression value, then continue - # save permutation score and p_value to dictionary - 
all_perms["score"].append(score) - all_perms["perm_score_mean"].append(permutation_scores.mean()) - all_perms["perm_score_std"].append(permutation_scores.std()) - all_perms["p_value"].append(pvalue) - all_perms["alpha"].append(this_alpha) - - # run cross validate to get model coefficients - cv_model = cross_validate( - model, - all_input, - all_target, - cv=RepeatedKFold(n_splits=5, n_repeats=20, random_state=random_state), - return_estimator=True, - n_jobs=2, - scoring=[ - "r2", - "explained_variance", - "neg_mean_absolute_error", - "max_error", - "neg_mean_squared_error", - "neg_mean_absolute_percentage_error", - ], - return_train_score=True, - ) - - # Save test r^2 and test MSE to dataframe - range_test_scores = [round(i, 2) for i in cv_model["test_r2"]] - range_errors = [round(i, 2) for i in cv_model["test_neg_mean_squared_error"]] - test_sc = pd.DataFrame() - test_sc[r"Test r$^2$"] = range_test_scores - test_sc["Test MSE"] = range_errors - test_sc["alpha"] = this_alpha - all_test_sc.append(test_sc) - - # Save coeffs to dataframe - coefs = pd.DataFrame( - [model[1].coef_ for model in cv_model["estimator"]], columns=cols - ) - - coefs["alpha"] = this_alpha - all_coef_alpha.append(coefs) - - # Get test scores for all alpha - all_test_sc = pd.concat(all_test_sc, axis=0).reset_index(drop=True) - all_test_sc["Test MSE"] = -all_test_sc["Test MSE"] - save_path = save_path / Path(f"{target}") - save_path.mkdir(parents=True, exist_ok=True) - - # Get coeffs for all alpha - all_coef_alpha = pd.concat(all_coef_alpha, axis=0).reset_index(drop=True) - all_coef_alpha = all_coef_alpha.melt( - id_vars=["alpha"], - var_name="Column", - value_name="Coefficient Importance", - ).reset_index(drop=True) - - # Get permutation scores and p values for all alpha - all_perms = pd.DataFrame(all_perms).reset_index(drop=True) - - # Save coefficient plot movie - if save: - all_test_sc.to_csv(save_path / "mse.csv") - all_coef_alpha.to_csv(save_path / "coefficients.csv") - 
all_perms.to_csv(save_path / "perm_scores.csv") - save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path) - - return all_coef_alpha, all_test_sc, all_perms - - -def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): - - xlim = None - files = [] - for alpha in all_coef_alpha["alpha"].unique(): - this_coef_alpha = all_coef_alpha.loc[ - all_coef_alpha["alpha"] == alpha - ].reset_index(drop=True) - this_test_sc = all_test_sc.loc[all_test_sc["alpha"] == alpha].reset_index( - drop=True - ) - this_perms = all_perms.loc[all_perms["alpha"] == alpha].reset_index(drop=True) - p_value = round(this_perms["p_value"].item(), 3) - test_r2_mean = round(this_test_sc["Test r$^2$"].mean(), 2) - test_r2_std = round(this_test_sc["Test r$^2$"].std() / 2, 2) - - g = sns.catplot( - data=this_coef_alpha, - y="Column", - x="Coefficient Importance", - kind="bar", - errorbar="sd", - aspect=2, - height=10, - ) - - g.set(ylabel="") - - g.fig.subplots_adjust(top=0.9) # adjust the Figure in rp - g.fig.suptitle( - f"Prediction of {get_plot_labels_for_metric(target)[1]}\nalpha={alpha}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" - ) - label_list = [ - get_plot_labels_for_metric(col)[1] - for col in all_coef_alpha["Column"].unique() - ] - g.set_yticklabels(label_list) - print(f"Saving coefficients_{target}_alpha_{alpha}.png") - this_path = str(save_path / Path(f"coefficients_{target}_alpha_{alpha}.png")) - files.append(this_path) - - if not xlim: - xlim = g.fig.axes[0].get_xlim() - g.set(xlim=xlim) - g.savefig(this_path, dpi=300) - - # save movie of pngs - writer = imageio.get_writer(save_path / f"{target}_coefficients_over_time.mp4", fps=2) - for im in files: - writer.append_data(imageio.imread(im)) - os.remove(im) - writer.close() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the linear regression workflow") - # Optional command line argument - parser.add_argument( - "--cached_dataframe", - type=str, - 
metavar="path", - help="Supply a path to a dataframe to skip data preprocessing. If included, dataframe " - "should match the result of linear_regression_analysis.get_data (see source code for " - "details).", - ) - - parser.add_argument( - "--cols", - type=list_of_strings, - default=[], - help="Supply a list of column names to use as independent variables in the linear regression analysis.", - ) - parser.add_argument( - "--target", - type=str, - default="duration_BC", - help="Supply a column name for a dependent variable to perform regression on", - ) - parser.add_argument( - "--alpha_range", - type=list_of_floats, - default=np.arange(0, 15, 0.1, dtype=float), - help="Supply a list of alpha values to use in lasso regression", - ) - parser.add_argument( - "--save_path", - type=str, - default="figures", - help="local folder name where plots will be saved", - ) - parser.add_argument( - "--tolerance", - type=float, - default=0.02, - help="Tolerace for change in regression score to determine best alpha", - ) - parser.add_argument( - "--save", - type=bool, - default=False, - help="Save plots", - ) - args = parser.parse_args() - main( - cols=args.cols, - target=args.target, - alpha_range=args.alpha_range, - tolerance=args.tolerance, - save_path=args.save_path, - cached_dataframe=args.cached_dataframe, - save_movie=args.save, - ) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py deleted file mode 100644 index b6afafc8..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow_greedy_removal.py +++ /dev/null @@ -1,297 +0,0 @@ -import argparse -import warnings -from pathlib import Path -import numpy as np -import pandas as pd -from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data -from nuc_morph_analysis.analyses.linear_regression.select_features import ( - 
get_feature_list, -) -from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import ( - fit_linear_regression, -) -from nuc_morph_analysis.analyses.linear_regression.utils import ( - list_of_strings, - list_of_floats, -) -from nuc_morph_analysis.lib.visualization.plotting_tools import ( - get_plot_labels_for_metric, -) -import imageio -import seaborn as sns -import os -import matplotlib.pyplot as plt - -pd.options.mode.chained_assignment = None # default='warn' - -warnings.simplefilter(action="ignore", category=FutureWarning) - - -def main( - cols, - target, - alpha_range, - tolerance, - save_path, - max_iterations, - cached_dataframe=None, - preloaded_dataframe=None, -): - - save_path = Path(save_path) - save_path = save_path / Path("linear_regression_greedy") / Path(f"{target}") - save_path.mkdir(parents=True, exist_ok=True) - - if len(cols) < 1: - cols = get_feature_list(['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], target) - - label_list = [get_plot_labels_for_metric(col)[1] for col in cols] - map_dict = {i: j for i, j in zip(cols, label_list)} - - if preloaded_dataframe is not None: - df_track_level_features = preloaded_dataframe - elif cached_dataframe is not None: - df_track_level_features = pd.read_csv(cached_dataframe) - else: - df_all = global_dataset_filtering.load_dataset_with_features() - df_full = filter_data.all_timepoints_full_tracks(df_all) - df_track_level_features = filter_data.track_level_features(df_full) - - permute_cols = [] - count = 0 - - removal_coefs = [] - removal_test_sc = [] - removal_perms = [] - while (count < len(cols) - 2) and (count < max_iterations): - print( - f"Iteration {count}, Total cols {len(cols)}, Removing features {permute_cols}" - ) - try: - all_coef_alpha, all_test_sc, all_perms = fit_linear_regression( - df_track_level_features, - cols, - target, - alpha_range, - tolerance, - save_path, - False, - permute_cols, - ) - except ValueError: - break - # Save max 
alpha - max_alpha = all_coef_alpha["alpha"].max() - all_coef_alpha = all_coef_alpha.loc[all_coef_alpha["alpha"] == max_alpha] - all_test_sc = all_test_sc.loc[all_test_sc["alpha"] == max_alpha] - all_perms = all_perms.loc[all_perms["alpha"] == max_alpha] - all_coef_alpha["iteration"] = count - all_test_sc["iteration"] = count - all_perms["iteration"] = count - - # Get max coefficient to remove - tmp = all_coef_alpha.copy() - tmp["Coefficient Importance"] = tmp["Coefficient Importance"].abs() - tmp = tmp.loc[ - tmp["Coefficient Importance"] == tmp["Coefficient Importance"].max() - ] - remove_column = tmp["Column"].item() - - # add this to list of columns to permute - permute_cols.append(remove_column) - - # save out info - all_coef_alpha["iteration"] = count - all_test_sc["iteration"] = count - all_perms["iteration"] = count - - all_coef_alpha["feature_removed"] = remove_column - all_test_sc["feature_removed"] = remove_column - all_perms["feature_removed"] = remove_column - - all_coef_alpha = all_coef_alpha.groupby(["Column"]).mean().reset_index() - - label_list = [ - get_plot_labels_for_metric(col)[1] - for col in all_coef_alpha["Column"].unique() - ] - all_coef_alpha["Column"] = label_list - - all_coef_alpha.to_csv(save_path / f"removal_coefficients_{count}.csv") - all_test_sc.to_csv(save_path / f"removal_test_sc_{count}.csv") - all_perms.to_csv(save_path / f"removal_perms_{count}.csv") - - removal_coefs.append(all_coef_alpha) - removal_test_sc.append(all_test_sc) - removal_perms.append(all_perms) - - count += 1 - - removal_coefs = pd.concat(removal_coefs, axis=0).reset_index(drop=True) - removal_test_sc = pd.concat(removal_test_sc, axis=0).reset_index(drop=True) - removal_perms = pd.concat(removal_perms, axis=0).reset_index(drop=True) - - removal_coefs.to_csv(save_path / "removal_coefficients.csv") - removal_test_sc.to_csv(save_path / "removal_test_sc.csv") - removal_perms.to_csv(save_path / "removal_perms.csv") - - save_plots( - removal_coefs, removal_test_sc, 
removal_perms, target, save_path, map_dict - ) - - -def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path, map_dict): - sns.set_style("whitegrid") - xlim = None - files = [] - perm_cols = [] - perm_coeffs = [] - y_order = [] - - xlim = [ - all_coef_alpha["Coefficient Importance"].min(), - all_coef_alpha["Coefficient Importance"].max(), - ] - for iter in all_coef_alpha["iteration"].unique(): - this_coef_alpha = all_coef_alpha.loc[ - all_coef_alpha["iteration"] == iter - ].reset_index(drop=True) - this_test_sc = all_test_sc.loc[all_test_sc["iteration"] == iter].reset_index( - drop=True - ) - this_perms = all_perms.loc[all_perms["iteration"] == iter].reset_index( - drop=True - ) - if iter > 0: - prev_perms = all_perms.loc[all_perms["iteration"] == iter - 1].reset_index( - drop=True - ) - prev_coefs = all_coef_alpha.loc[ - all_coef_alpha["iteration"] == iter - 1 - ].reset_index(drop=True) - feat_removed = map_dict[prev_perms["feature_removed"].item()] - feat_coefficient = prev_coefs.loc[prev_coefs["Column"] == feat_removed][ - "Coefficient Importance" - ].item() - perm_cols.append(feat_removed) - perm_coeffs.append(feat_coefficient) - print(perm_cols, perm_coeffs) - p_value = round(this_perms["p_value"].item(), 3) - test_r2_mean = round(this_test_sc["Test r$^2$"].mean(), 2) - test_r2_std = round(this_test_sc["Test r$^2$"].std() / 2, 2) - - this_coef_alpha["removed"] = False - if len(perm_cols) > 0: - this1 = this_coef_alpha.loc[this_coef_alpha["Column"].isin(perm_cols)] - this2 = this_coef_alpha.loc[~this_coef_alpha["Column"].isin(perm_cols)] - for col in perm_cols: - ind = perm_cols.index(col) - final_coeff = perm_coeffs[ind] - this1.loc[this1["Column"] == col, "Coefficient Importance"] = ( - final_coeff - ) - this1["removed"] = True - this_coef_alpha = pd.concat([this1, this2], axis=0).reset_index(drop=True) - - if len(y_order) == 0: - y_order = this_coef_alpha["Column"].values - - g = sns.catplot( - data=this_coef_alpha, - y="Column", - 
x="Coefficient Importance", - hue="removed", - kind="bar", - order=y_order, - errorbar="sd", - aspect=2, - height=10, - ) - - g.set(ylabel="") - - g.fig.subplots_adjust(top=0.9) # adjust the Figure in rp - g.fig.suptitle( - f"Prediction of {get_plot_labels_for_metric(target)[1]}\niteration={iter}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" - ) - label_list = [col for col in all_coef_alpha["Column"].unique()] - g.set_yticklabels(label_list) - plt.grid() - this_path = str(save_path / Path(f"coefficients_{target}_iteration_{iter}.png")) - files.append(this_path) - - g.set(xlim=xlim) - g.savefig(this_path, dpi=300) - - # save movie of pngs - writer = imageio.get_writer(save_path / f"{target}_greedy_coefficients_over_time.mp4", fps=2) - for im in files: - writer.append_data(imageio.imread(im)) - os.remove(im) - writer.close() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run the linear regression workflow") - # Optional command line argument - parser.add_argument( - "--cached_dataframe", - type=str, - metavar="path", - help="Supply a path to a dataframe to skip data preprocessing. 
If included, dataframe " - "should match the result of linear_regression_analysis.get_data (see source code for " - "details).", - ) - parser.add_argument( - "--preloaded_dataframe", - type=pd.DataFrame, - metavar="path", - help="Supply preloaded dataframe to skip data preprocessing.", - ) - parser.add_argument( - "--cols", - type=list_of_strings, - default=[], - help="Supply a list of column names to use as independent variables in the linear regression analysis.", - ) - parser.add_argument( - "--target", - type=str, - default="duration_BC", - help="Supply a column name for a dependent variable to perform regression on", - ) - parser.add_argument( - "--alpha_range", - type=list_of_floats, - default=np.arange(0.5, 15, 0.2, dtype=float), - help="Supply a list of alpha values to use in lasso regression", - ) - parser.add_argument( - "--save_path", - type=str, - default="figures", - help="local folder name where plots will be saved", - ) - parser.add_argument( - "--tolerance", - type=float, - default=0.02, - help="Tolerace for change in regression score to determine best alpha", - ) - parser.add_argument( - "--max_iterations", - type=int, - default=100, - help="Max iterations for greedy removal", - ) - args = parser.parse_args() - main( - cols=args.cols, - target=args.target, - alpha_range=args.alpha_range, - tolerance=args.tolerance, - save_path=args.save_path, - max_iterations=args.max_iterations, - cached_dataframe=args.cached_dataframe, - preloaded_dataframe=args.preloaded_dataframe, - ) diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh deleted file mode 100755 index d1b40635..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow.py \ ---cols 
'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ ---target 'delta_volume_BC' \ ---save_path "./" \ ---tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume_sisters.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume_sisters.sh deleted file mode 100755 index 3e56e4bb..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume_sisters.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow.py \ ---cols 'volume_at_B','mothers_volume_at_B','sisters_volume_at_B','mothers_duration_BC','sisters_duration_BC','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ ---target 'delta_volume_BC' \ ---save_path "./" \ ---tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh deleted file mode 100755 index bd012bf8..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow.py \ ---cols 
'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_volume','mean_mesh_sa','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_volume','std_mesh_sa','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ ---target 'duration_BC' \ ---save_path "./" \ ---tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC_sisters.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC_sisters.sh deleted file mode 100755 index 42b7c704..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC_sisters.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow.py \ ---cols 'volume_at_B','mothers_volume_at_B','sisters_volume_at_B','mothers_duration_BC','sisters_duration_BC','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_volume','mean_mesh_sa','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_volume','std_mesh_sa','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ ---target 'duration_BC' \ ---save_path "./" \ ---tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh deleted file mode 100755 index 97e76194..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow.py \ ---cols 
'volume_at_B','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ ---target 'volume_at_C' \ ---save_path "./" \ ---tolerance 0.05 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume_sisters.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume_sisters.sh deleted file mode 100755 index 01d44aed..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume_sisters.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow.py \ ---cols 'volume_at_B','mothers_volume_at_B','sisters_volume_at_B','mothers_duration_BC','sisters_duration_BC','height_at_B','time_at_B','xy_aspect_at_B','SA_vol_ratio_at_B','colony_time_at_B','SA_at_B','mean_height','mean_density','mean_xy_aspect','mean_SA_vol_ratio','mean_neighbor_avg_dxdt_48_volume_whole_colony','std_height','std_density','std_xy_aspect','std_SA_vol_ratio','std_neighbor_avg_dxdt_48_volume_whole_colony','early_transient_gr_whole_colony' \ ---target 'volume_at_C' \ ---save_path "./" \ ---tolerance 0.05 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/added_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/added_volume.sh deleted file mode 100755 index 9015c21b..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/added_volume.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow_greedy_removal.py \ ---cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ ---target 'delta_volume_BC' \ ---save_path 
"/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures/" \ ---tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC.sh deleted file mode 100755 index ea06eec8..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow_greedy_removal.py \ ---cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ ---target 'duration_BC' \ ---save_path "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures/" \ ---tolerance 0.08 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug copy.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug copy.sh deleted file mode 100755 index a208aab2..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug copy.sh +++ /dev/null @@ -1,7 +0,0 @@ -python linear_regression_workflow_greedy_removal.py \ ---cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ ---target 'duration_BC' \ ---alpha_range "0.2,0.4,0.6,0.8,1.0,1.2,1.4,1.6,1.8,2.0,2.2,2.4,2.6,2.8,3.0,3.2" \ ---save_path "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures_debug/" \ ---tolerance 0.08 \ ---max_iterations 4 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug.sh deleted file mode 100755 index 57768ef6..00000000 --- 
a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/duration_BC_debug.sh +++ /dev/null @@ -1,7 +0,0 @@ -python linear_regression_workflow_greedy_removal.py \ ---cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ ---target 'duration_BC' \ ---alpha_range "1.2,1.4" \ ---save_path "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures_debug/" \ ---tolerance 0.08 \ ---max_iterations 4 \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/ending_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/ending_volume.sh deleted file mode 100755 index 6a7c55b7..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts_greedy_removal/ending_volume.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow_greedy_removal.py \ ---cached_dataframe "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/track_level.csv" \ ---target 'volume_at_C' \ ---save_path "/allen/aics/modeling/ritvik/projects/trash/nucmorph/nuc-morph-analysis/nuc_morph_analysis/figures/" \ ---tolerance 0.08 \ No newline at end of file From 056ec6cdf0642b7898aef57295cf1a3cc5f0565f Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 25 Oct 2024 15:17:01 -0700 Subject: [PATCH 51/68] add figure to run all workflows --- .../linear_regression/analysis_plots.py | 2 +- .../linear_regression/linear_regression.py | 331 ++++++++++++++++++ .../supplemental_lrm_figure_workflow.py | 2 +- run_all_manuscript_workflows.py | 3 + 4 files changed, 336 insertions(+), 2 deletions(-) create mode 100644 nuc_morph_analysis/analyses/linear_regression/linear_regression.py diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index f212fea0..69c3ed08 100644 --- 
a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -4,7 +4,7 @@ import pandas as pd from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric from nuc_morph_analysis.lib.visualization.notebook_tools import save_and_show_plot -from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import fit_linear_regression +from nuc_morph_analysis.analyses.linear_regression.linear_regression import fit_linear_regression from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py new file mode 100644 index 00000000..825e49b3 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py @@ -0,0 +1,331 @@ +import argparse +import os +import warnings +from pathlib import Path +import numpy as np +import pandas as pd +import seaborn as sns +from sklearn import linear_model +from sklearn.model_selection import ( + RepeatedKFold, + cross_validate, +) +from tqdm import tqdm +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data +from sklearn.model_selection import permutation_test_score +from nuc_morph_analysis.lib.visualization.plotting_tools import ( + get_plot_labels_for_metric, +) +import imageio +from nuc_morph_analysis.analyses.linear_regression.select_features import ( + get_feature_list, +) +from nuc_morph_analysis.analyses.linear_regression.utils import ( + list_of_strings, + list_of_floats, +) + +pd.options.mode.chained_assignment = None # default='warn' + +warnings.simplefilter(action="ignore", category=FutureWarning) + + +def main( + cols, + target, + alpha_range, + tolerance, + save_path, + 
cached_dataframe=None, + save_movie=False, +): + + save_path = Path(save_path) + save_path = save_path / Path("linear_regression") + save_path.mkdir(parents=True, exist_ok=True) + + if len(cols) < 1: + cols = get_feature_list(["features", "lineage_feats"], None) + + if not cached_dataframe: + df_all = global_dataset_filtering.load_dataset_with_features() + df_full = filter_data.all_timepoints_full_tracks(df_all) + df_track_level_features = filter_data.track_level_features(df_full) + else: + df_track_level_features = pd.read_csv(cached_dataframe) + + fit_linear_regression( + df_track_level_features, + cols, + target, + alpha_range, + tolerance, + save_path, + save_movie, + ) + + +def fit_linear_regression( + data, cols, target, alpha, tol, save_path, save, permute_cols=[], multiple_predictions=True +): + """ + data - track level features + cols - input features + target - target to predict + alpha - hyperparameter for lasso + tol - tolerance to check drop in r^2 for finding best alpha (ex. 
0.02) + save_path - location to save files + save - whether to save movies and pngs + permute_col - list of features to permute and replace with noise + """ + sns.set_context("talk") + random_state = 2652124 + + # init empty dicts and lists + all_test_sc = [] + all_coef_alpha = [] + all_perms = { + "score": [], + "perm_score_mean": [], + "perm_score_std": [], + "p_value": [], + "alpha": [], + } + + if multiple_predictions: + # remove 0 alpha due to convergence errors + alpha = [i for i in alpha if i != 0] + alpha = [round(i, 1) for i in alpha] + + # find best alpha for Lasso model + for alpha_ind, this_alpha in tqdm(enumerate(alpha), total=len(alpha)): + # drop any nan rows + dropna_cols = cols + [target] + data = data.dropna(subset=dropna_cols) + print(f"number of tracks: {len(data)}") + + # permute columns if necessary + if len(permute_cols) > 0: + for col in permute_cols: + mu, sigma = 0, 1 + noise = np.random.normal(mu, sigma, len(data)) + data[col] = noise + + # make numpy array for inputs and target + all_input = data[cols].reset_index(drop=True).values + all_target = data[target].values + + if this_alpha == 0: + # linear regression if alpha == 0 + clf = linear_model.LinearRegression() + else: + clf = linear_model.Lasso(alpha=this_alpha) + + # normalize input features + model = make_pipeline(StandardScaler(), clf) + + # run permutation test + score, permutation_scores, pvalue = permutation_test_score( + model, + all_input, + all_target, + random_state=random_state, + cv=5, + n_permutations=500, + ) + + # break if permutation score is less than linear regression value (max possible) + # with a tolerance + # or if p_value > 0.05 + rounded_permutation_score = round(score, 2) + if alpha_ind == 0: + max_val = rounded_permutation_score + if multiple_predictions: + if abs(rounded_permutation_score - max_val) > tol or (pvalue > 0.05): + break + + # if relatively equal to linear regression value, then continue + # save permutation score and p_value to dictionary + 
all_perms["score"].append(score) + all_perms["perm_score_mean"].append(permutation_scores.mean()) + all_perms["perm_score_std"].append(permutation_scores.std()) + all_perms["p_value"].append(pvalue) + all_perms["alpha"].append(this_alpha) + + # run cross validate to get model coefficients + cv_model = cross_validate( + model, + all_input, + all_target, + cv=RepeatedKFold(n_splits=5, n_repeats=20, random_state=random_state), + return_estimator=True, + n_jobs=2, + scoring=[ + "r2", + "explained_variance", + "neg_mean_absolute_error", + "max_error", + "neg_mean_squared_error", + "neg_mean_absolute_percentage_error", + ], + return_train_score=True, + ) + + # Save test r^2 and test MSE to dataframe + range_test_scores = [round(i, 2) for i in cv_model["test_r2"]] + range_errors = [round(i, 2) for i in cv_model["test_neg_mean_squared_error"]] + test_sc = pd.DataFrame() + test_sc[r"Test r$^2$"] = range_test_scores + test_sc["Test MSE"] = range_errors + test_sc["alpha"] = this_alpha + all_test_sc.append(test_sc) + + # Save coeffs to dataframe + coefs = pd.DataFrame( + [model[1].coef_ for model in cv_model["estimator"]], columns=cols + ) + + coefs["alpha"] = this_alpha + all_coef_alpha.append(coefs) + + # Get test scores for all alpha + all_test_sc = pd.concat(all_test_sc, axis=0).reset_index(drop=True) + all_test_sc["Test MSE"] = -all_test_sc["Test MSE"] + save_path = save_path / Path(f"{target}") + save_path.mkdir(parents=True, exist_ok=True) + + # Get coeffs for all alpha + all_coef_alpha = pd.concat(all_coef_alpha, axis=0).reset_index(drop=True) + all_coef_alpha = all_coef_alpha.melt( + id_vars=["alpha"], + var_name="Column", + value_name="Coefficient Importance", + ).reset_index(drop=True) + + # Get permutation scores and p values for all alpha + all_perms = pd.DataFrame(all_perms).reset_index(drop=True) + + # Save coefficient plot movie + if save: + all_test_sc.to_csv(save_path / "mse.csv") + all_coef_alpha.to_csv(save_path / "coefficients.csv") + 
all_perms.to_csv(save_path / "perm_scores.csv") + save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path) + + return all_coef_alpha, all_test_sc, all_perms + + +def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): + + xlim = None + files = [] + for alpha in all_coef_alpha["alpha"].unique(): + this_coef_alpha = all_coef_alpha.loc[ + all_coef_alpha["alpha"] == alpha + ].reset_index(drop=True) + this_test_sc = all_test_sc.loc[all_test_sc["alpha"] == alpha].reset_index( + drop=True + ) + this_perms = all_perms.loc[all_perms["alpha"] == alpha].reset_index(drop=True) + p_value = round(this_perms["p_value"].item(), 3) + test_r2_mean = round(this_test_sc["Test r$^2$"].mean(), 2) + test_r2_std = round(this_test_sc["Test r$^2$"].std() / 2, 2) + + g = sns.catplot( + data=this_coef_alpha, + y="Column", + x="Coefficient Importance", + kind="bar", + errorbar="sd", + aspect=2, + height=10, + ) + + g.set(ylabel="") + + g.fig.subplots_adjust(top=0.9) # adjust the Figure in rp + g.fig.suptitle( + f"Prediction of {get_plot_labels_for_metric(target)[1]}\nalpha={alpha}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" + ) + label_list = [ + get_plot_labels_for_metric(col)[1] + for col in all_coef_alpha["Column"].unique() + ] + g.set_yticklabels(label_list) + print(f"Saving coefficients_{target}_alpha_{alpha}.png") + this_path = str(save_path / Path(f"coefficients_{target}_alpha_{alpha}.png")) + files.append(this_path) + + if not xlim: + xlim = g.fig.axes[0].get_xlim() + g.set(xlim=xlim) + g.savefig(this_path, dpi=300) + + # save movie of pngs + writer = imageio.get_writer(save_path / f"{target}_coefficients_over_time.mp4", fps=2) + for im in files: + writer.append_data(imageio.imread(im)) + os.remove(im) + writer.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the linear regression workflow") + # Optional command line argument + parser.add_argument( + "--cached_dataframe", + type=str, + 
metavar="path", + help="Supply a path to a dataframe to skip data preprocessing. If included, dataframe " + "should match the result of linear_regression_analysis.get_data (see source code for " + "details).", + ) + + parser.add_argument( + "--cols", + type=list_of_strings, + default=[], + help="Supply a list of column names to use as independent variables in the linear regression analysis.", + ) + parser.add_argument( + "--target", + type=str, + default="duration_BC", + help="Supply a column name for a dependent variable to perform regression on", + ) + parser.add_argument( + "--alpha_range", + type=list_of_floats, + default=np.arange(0, 15, 0.1, dtype=float), + help="Supply a list of alpha values to use in lasso regression", + ) + parser.add_argument( + "--save_path", + type=str, + default="figures", + help="local folder name where plots will be saved", + ) + parser.add_argument( + "--tolerance", + type=float, + default=0.02, + help="Tolerace for change in regression score to determine best alpha", + ) + parser.add_argument( + "--save", + type=bool, + default=False, + help="Save plots", + ) + args = parser.parse_args() + main( + cols=args.cols, + target=args.target, + alpha_range=args.alpha_range, + tolerance=args.tolerance, + save_path=args.save_path, + cached_dataframe=args.cached_dataframe, + save_movie=args.save, + ) diff --git a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py index 54e85156..f043b596 100644 --- a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data -from nuc_morph_analysis.analyses.linear_regression.linear_regression_workflow import fit_linear_regression +from 
nuc_morph_analysis.analyses.linear_regression.linear_regression import fit_linear_regression from nuc_morph_analysis.analyses.linear_regression.analysis_plots import (run_regression_workflow, plot_feature_cluster_correlations, plot_heatmap, diff --git a/run_all_manuscript_workflows.py b/run_all_manuscript_workflows.py index 8694c539..7abf1180 100644 --- a/run_all_manuscript_workflows.py +++ b/run_all_manuscript_workflows.py @@ -58,6 +58,9 @@ def figureS1_segmentation_model_validation(): def supplemental_figure_cell_health(): import nuc_morph_analysis.analyses.cell_health.cell_health_workflow + + def supplemental_figure_linear_regression_model(): + import nuc_morph_analysis.analyses.linear_regression.supplemental_lrm_figure_workflow ALL_WORKFLOWS = get_jobs(Workflows) From ebc10975545cd56b65268cf080bb3de19fc499b3 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 25 Oct 2024 15:33:21 -0700 Subject: [PATCH 52/68] remove unused code --- .../linear_regression/analysis_plots.py | 32 ------------------- .../lib/preprocessing/add_features.py | 28 ---------------- .../preprocessing/global_dataset_filtering.py | 1 - .../lib/visualization/label_tables.py | 16 ---------- 4 files changed, 77 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index 69c3ed08..cccbfbe4 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -7,38 +7,6 @@ from nuc_morph_analysis.analyses.linear_regression.linear_regression import fit_linear_regression from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list) - -def plot_feature_correlations(df_track_level_features, feature_list, figdir): - """ - Plot heatmap of feature correlations. 
- - Parameters - ---------- - df_track_level_features : pd.DataFrame - DataFrame containing track level features - feature_list : list - List of features to include in the heatmap - Output from get_feature_list - figdir : str - Directory to save the figure - - Returns - ------- - Figure - """ - data = df_track_level_features[feature_list] - - plt.rc('font', size=22) - plt.figure(figsize=(28, 25)) - sns.heatmap(data.corr(), annot=True, fmt=".1f", cmap='BrBG', vmin=-1, vmax=1, cbar_kws={"shrink": 0.5, "pad": 0.02}) - - column_names = [get_plot_labels_for_metric(col)[1] for col in data.columns] - plt.xticks([x + 0.5 for x in range(len(column_names))], column_names) - plt.yticks([y + 0.5 for y in range(len(column_names))], column_names) - plt.tight_layout() - - save_and_show_plot(f'{figdir}/feature_correlation_heatmap') - def plot_feature_cluster_correlations(df_track_level_features, feature_list, figdir): """ Plot clustermap of feature correlations. diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index b61b3fd7..4633a954 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -230,34 +230,6 @@ def get_early_transient_gr_of_neighborhood(df, scale, time_shift=24, window_leng return df -def add_std_feature_over_trajectory(df, feature_list, multiplier_list): - """ - Add the standard deviation of a given feature over the growth trajectory - from transition to frame breakdown. 
- - Parameters - ---------- - df : DataFrame - The dataframe - feature_list : list - List of column names - multiplier_list : list - List of scale to multiply the std by - - Returns - ------- - df : DataFrame - The dataframe with the added standard deviation feature columns - """ - for feature, multiplier in zip(feature_list, multiplier_list): - for tid, dft in df.groupby("track_id"): - start = dft.frame_transition.values[0] - stop = dft.Fb.values[0] - df_std = dft[(dft['index_sequence'] >= start) & (dft['index_sequence'] <= stop)] - std = df_std[feature].std() * multiplier - df.loc[df.track_id == tid, f"std_{feature}"] = std - return df - def add_volume_at(df, pixel_size, frame_column): """ diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index eff68fff..0074eedf 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -266,7 +266,6 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_lineage_features(df_full, feature_list=['volume_at_B', 'duration_BC', 'volume_at_C', 'delta_volume_BC']) df_full = add_features.add_feature_at(df_full, "frame_transition", 'height', 'height_percentile', pix_size) - # df_full = add_features.add_feature_at(df_full, "frame_transition", 'density', 'density', pix_size) for feature in ['xy_aspect', 'SA_vol_ratio', 'neighbor_avg_lrm_volume_90um', 'neighbor_avg_lrm_height_90um', 'neighbor_avg_lrm_xy_aspect_90um','neighbor_avg_lrm_mesh_sa_90um']: df_full = add_features.add_feature_at(df_full, "frame_transition", feature, feature) diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index 15db731a..4037c7b3 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -241,28 +241,12 @@ 
def get_scale_factor_table(dataset="all_baseline"): "density_at_B": "Starting density", "xy_aspect_at_B": "Starting XY aspect ratio", "SA_vol_ratio_at_B": "Starting SA/Volume ratio", - "early_transient_gr_whole_colony": "~Starting avg. transient growth rate of whole colony", "early_transient_gr_90um": "~Starting avg. transient growth rate in 90 \u00B5m radius", - "mean_volume": "Mean volume", - "mean_height": "Mean height", - "mean_density": "Mean density", - "mean_mesh_sa": "Mean surface area", - "mean_xy_aspect": "Mean XY aspect ratio", - "mean_SA_vol_ratio": "Mean SA/Volume ratio", - "mean_neighbor_avg_dxdt_48_volume_whole_colony": "Mean avg. transient growth rate of whole colony", - "std_volume": "Stdev. volume", - "std_height": "Stdev. height", - "std_density": "Stdev. density", - "std_mesh_sa": "Stdev. surface area", - "std_xy_aspect": "Stdev. XY aspect ratio", - "std_SA_vol_ratio": "Stdev. SA/Volume ratio", - "std_neighbor_avg_dxdt_48_volume_whole_colony": "Stdev. transient growth rate of whole colony", 'neighbor_avg_lrm_volume_90um_at_B': "Starting avg. volume in 90 \u00B5m radius", 'neighbor_avg_lrm_height_90um_at_B': "Starting avg. height in 90 \u00B5m radius", 'neighbor_avg_lrm_density_90um_at_B': "Starting avg. density in 90 \u00B5m radius", 'neighbor_avg_lrm_xy_aspect_90um_at_B': "Starting avg. XY aspect ratio in 90 \u00B5m radius", 'neighbor_avg_lrm_mesh_sa_90um_at_B': "Starting avg. surface area in 90 \u00B5m radius", - "mean_neighbor_avg_dxdt_48_volume_90um": "Avg. mean transient growth rate in 90 \u00B5m radius", 'mean_neighbor_avg_lrm_volume_90um': " Avg. mean volume in 90 \u00B5m radius", 'mean_neighbor_avg_lrm_height_90um': "Avg. 
mean height in 90 \u00B5m radius", From 9c48be7b6e5e08f2a881bf36949a699a5a396256 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Fri, 25 Oct 2024 15:38:10 -0700 Subject: [PATCH 53/68] update documentation --- .../linear_regression/supplemental_lrm_figure_workflow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py index f043b596..6cf8d000 100644 --- a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py @@ -30,7 +30,7 @@ EXTENDED_WORKFLOW = False -#%% Preprocess dataframe to ensure same N for all analysis (lineage features are being used) +#%% Preprocess dataframe to ensure same N for all analysis (full tracks with lineage features) dropna_cols = get_feature_list(CONFIG['all_features'], None) data = df_track_level_features.dropna(subset=dropna_cols) print(f"Number of tracks: {len(data)}") @@ -39,7 +39,7 @@ df = run_regression_workflow(TARGETS, CONFIG, data, FIGDIR, alpha=0) plot_heatmap(df, FIGDIR, 'YlOrRd') -#%% Plot feature importance +#%% Plot feature importance for target in ['duration_BC', 'delta_volume_BC']: df_alpha, df_test, df_coeff = fit_linear_regression(data, cols=get_feature_list(CONFIG['all_features'], target), @@ -48,7 +48,7 @@ plot_feature_contribution(df_alpha, df_test, df_coeff, target, TARGET_SETTINGS[target]['fig_height'], FIGDIR) -#%% Plot feature correlations +#%% Plot feature correlations using all full tracks plot_feature_cluster_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) #%% Create movie of increasing alpha until tolerance of 0.05 is reached From 0c6fb0f6a181fc7ced5488802a6329a3eef47885 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 28 Oct 2024 11:14:28 -0700 Subject: 
[PATCH 54/68] PR reveiw changes to add_features --- .../lib/preprocessing/add_features.py | 42 ++++--------------- .../preprocessing/global_dataset_filtering.py | 2 +- 2 files changed, 8 insertions(+), 36 deletions(-) diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 4633a954..2e498c45 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -169,44 +169,16 @@ def add_mean_feature_over_trajectory(df, feature_list, multiplier_list): return df -def get_early_transient_gr_of_whole_colony(df, scale, time_shift=25): - """ - Get the transient growth rate of the colony 2 hours into the growth trajectory. - - This time shift of two hours into the growth trajectory is necessary because the metric - is calculated as the average of a 4 hour rolling window. The middle of a four hour window - does not occur until two hours into the timelapse. To calculate this feature equivalently - for each trajectory, two hours was used for all tracks to get a metric for the transient - growth rate of the colony early in the growth trajectory. - - Parameters - ---------- - df : DataFrame - The dataframe - time_shift : int - The time shift in frames to calculate the transient growth rate in frames - - Returns - ------- - df : DataFrame - The dataframe with the added transient growth rate feature columns - """ - for tid, dft in df.groupby("track_id"): - t_calculate = dft.index_sequence.min() + time_shift - transient_gr_whole_colony = df.loc[df.index_sequence == t_calculate, "neighbor_avg_dxdt_48_volume_whole_colony"].values[0] - df.loc[df.track_id == tid, "early_transient_gr_whole_colony"] = transient_gr_whole_colony * scale - - return df - def get_early_transient_gr_of_neighborhood(df, scale, time_shift=24, window_length=6): """ - Get the transient growth rate of the colony 2 hours into the growth trajectory. 
+ Get the transient growth rate of the local neighborhood 2 hours into the growth trajectory. This time shift of two hours into the growth trajectory is necessary because the metric is calculated as the average of a 4 hour rolling window. The middle of a four hour window does not occur until two hours into the timelapse. To calculate this feature equivalently for each trajectory, two hours was used for all tracks to get a metric for the transient - growth rate of the neighborhood early in the growth trajectory. + growth rate of the neighborhood early in the growth trajectory. The early transient growth + rate is averaged over 30 minutes as defined by the window_length. Parameters ---------- @@ -215,7 +187,7 @@ def get_early_transient_gr_of_neighborhood(df, scale, time_shift=24, window_leng time_shift : int The time shift in frames to calculate the transient growth rate in frames window_length : int - The length of the time window in frames + The length of the time window in frames to average over Returns ------- @@ -224,8 +196,8 @@ def get_early_transient_gr_of_neighborhood(df, scale, time_shift=24, window_leng """ for tid, dft in df.groupby("track_id"): t_calculate = dft.index_sequence.min() + time_shift - time_window_mask = df.index_sequence.between(t_calculate, t_calculate + window_length) - transient_gr_whole_colony = df.loc[time_window_mask, "neighbor_avg_dxdt_48_volume_90um"].mean() + time_window_mask = dft.index_sequence.between(t_calculate, t_calculate + window_length) + transient_gr_whole_colony = dft.loc[time_window_mask, "neighbor_avg_dxdt_48_volume_90um"].mean() df.loc[df.track_id == tid, "early_transient_gr_90um"] = transient_gr_whole_colony * scale return df @@ -614,7 +586,7 @@ def sum_mitotic_events_along_full_track(df0, feature_list=[]): return sum_events_along_full_track(df0, feature_list) -def normalize_sum_events(df_full, event_cols): +def normalize_sum_events(df_full, event_cols=['sum_has_mitotic_neighbor', 'sum_has_dying_neighbor']): """ 
Normalize sum of mitotic and death events by growth duration diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 0074eedf..44c4aabf 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -272,7 +272,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.get_early_transient_gr_of_neighborhood(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_90um')[0]) df_full = add_features.sum_mitotic_events_along_full_track(df_full) - df_full = add_features.normalize_sum_events(df_full, ['sum_has_mitotic_neighbor', 'sum_has_dying_neighbor']) + df_full = add_features.normalize_sum_events(df_full) ft_list = ['neighbor_avg_dxdt_48_volume_90um', 'neighbor_avg_lrm_volume_90um', From 2bff19fe98981519d8f67b9c0904e3d22dfa5b74 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 28 Oct 2024 11:42:11 -0700 Subject: [PATCH 55/68] PR review changes to global dataset filtering --- .../supplemental_lrm_figure_workflow.py | 2 - .../lib/preprocessing/add_features.py | 58 +++++++++++++++++++ .../preprocessing/global_dataset_filtering.py | 20 +------ 3 files changed, 61 insertions(+), 19 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py index 6cf8d000..f1f12a2a 100644 --- a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py @@ -1,7 +1,5 @@ #%% -import warnings import numpy as np -import pandas as pd from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data from nuc_morph_analysis.analyses.linear_regression.linear_regression import fit_linear_regression from 
nuc_morph_analysis.analyses.linear_regression.analysis_plots import (run_regression_workflow, diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 2e498c45..9d211e6b 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -1,4 +1,5 @@ from nuc_morph_analysis.analyses.lineage.get_features import lineage_trees +from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric import numpy as np FRAME_COL = {"Ff": "A", "frame_transition": "B", "Fb": "C"} @@ -625,3 +626,60 @@ def add_perimeter_ratio(df): """ df['2d_perimeter_nuc_cell_ratio'] = df['2d_perimeter_nucleus'] / df['2d_perimeter_pseudo_cell'] return df + +def add_features_at_transition(df, + feature_list=['xy_aspect', + 'SA_vol_ratio', + 'neighbor_avg_lrm_volume_90um', + 'neighbor_avg_lrm_height_90um', + 'neighbor_avg_lrm_xy_aspect_90um', + 'neighbor_avg_lrm_mesh_sa_90um', + 'neighbor_avg_dxdt_48_volume_90um', + 'neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um'] + ): + """ + Add feature measurements at transition that are used in the linear regression analysis. + Features should be pre-calculated and not need to be scaled. + + Parameters + ---------- + df_full : DataFrame + The dataframe containing full trajectories + feature_list : list + List of column names + + Returns + ------- + df_full : DataFrame + The dataframe with the added feature columns + """ + + for feature in feature_list: + df = add_feature_at(df, "frame_transition", feature, feature) + return df + +def add_mean_features(df, + feature_list=['neighbor_avg_dxdt_48_volume_90um', + 'neighbor_avg_lrm_volume_90um', + 'neighbor_avg_lrm_height_90um', + 'neighbor_avg_lrm_xy_aspect_90um', + 'neighbor_avg_lrm_mesh_sa_90um', + 'neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um'] + ): + """ + Add mean feature measurements over the growth trajectory that are used in the linear regression analysis. 
+ + Parameters + ---------- + df : DataFrame + The dataframe containing full trajectories + feature_list : list + List of column names + + Returns + ------- + df : DataFrame + The dataframe with the added mean feature columns + """ + multiplier_list = [get_plot_labels_for_metric(x)[0] for x in feature_list] + df = add_mean_feature_over_trajectory(df, feature_list, multiplier_list) diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 44c4aabf..30eb05cf 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -202,7 +202,7 @@ def process_all_tracks(df, dataset, remove_growth_outliers, num_workers): df = add_neighborhood_avg_features.run_script(df, num_workers=num_workers) df = add_neighborhood_avg_features_lrm.run_script(df, num_workers=num_workers, feature_list=["volume", "height", "xy_aspect", "mesh_sa", "2d_area_nuc_cell_ratio"], - exclude_outliers=False) + exclude_outliers=True) if dataset == "all_baseline": df = add_colony_time_all_datasets(df) @@ -264,26 +264,12 @@ def process_full_tracks(df_all, thresh, pix_size, interval): # For LRM df_full = add_features.add_lineage_features(df_full, feature_list=['volume_at_B', 'duration_BC', 'volume_at_C', 'delta_volume_BC']) - df_full = add_features.add_feature_at(df_full, "frame_transition", 'height', 'height_percentile', pix_size) - for feature in ['xy_aspect', 'SA_vol_ratio', 'neighbor_avg_lrm_volume_90um', 'neighbor_avg_lrm_height_90um', - 'neighbor_avg_lrm_xy_aspect_90um','neighbor_avg_lrm_mesh_sa_90um']: - df_full = add_features.add_feature_at(df_full, "frame_transition", feature, feature) - + df_full = add_features.add_features_at_transition(df_full) df_full = add_features.get_early_transient_gr_of_neighborhood(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_90um')[0]) df_full = 
add_features.sum_mitotic_events_along_full_track(df_full) df_full = add_features.normalize_sum_events(df_full) - - ft_list = ['neighbor_avg_dxdt_48_volume_90um', - 'neighbor_avg_lrm_volume_90um', - 'neighbor_avg_lrm_height_90um', - 'neighbor_avg_lrm_xy_aspect_90um', - 'neighbor_avg_lrm_mesh_sa_90um', - 'neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um',] - multiplier_list = [get_plot_labels_for_metric(x)[0] for x in ft_list] - df_full = add_features.add_mean_feature_over_trajectory(df_full, ft_list, multiplier_list) - for feat in ft_list: - df_full = add_features.add_feature_at(df_full, "frame_transition", feat, feat) + df_full = add_features.add_mean_features(df_full) # Add flag for use after merging back to main manifest df_full = add_features.add_full_track_flag(df_full) From 68da6df3e2fca93c25db091ec8419d55f625d054 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 28 Oct 2024 11:56:23 -0700 Subject: [PATCH 56/68] return df --- nuc_morph_analysis/lib/preprocessing/add_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 9d211e6b..26bc2084 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -653,7 +653,6 @@ def add_features_at_transition(df, df_full : DataFrame The dataframe with the added feature columns """ - for feature in feature_list: df = add_feature_at(df, "frame_transition", feature, feature) return df @@ -683,3 +682,4 @@ def add_mean_features(df, """ multiplier_list = [get_plot_labels_for_metric(x)[0] for x in feature_list] df = add_mean_feature_over_trajectory(df, feature_list, multiplier_list) + return df From aa1f1ccad07419275869597704925fb5a2f9405f Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 28 Oct 2024 12:45:50 -0700 Subject: [PATCH 57/68] update maximum alpha for ft importance figure --- 
.../analyses/linear_regression/select_features.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index aa6f4a6a..1d3104b4 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -58,12 +58,12 @@ TARGET_SETTINGS = { 'duration_BC': { 'tolerance': 0.05, - 'max_alpha': 0.8, - 'fig_height': 7, + 'max_alpha': 1.4, + 'fig_height': 6, }, 'delta_volume_BC': { 'tolerance': 0.05, - 'max_alpha': 11.6, + 'max_alpha': 10.2, 'fig_height': 2, } } From ed242788b9c39fd6b620c26a0589422e3684a6c5 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 28 Oct 2024 12:46:39 -0700 Subject: [PATCH 58/68] update label tables to sent. case and neighborhood fts --- .../lib/visualization/label_tables.py | 158 +++++++++--------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index 4037c7b3..42664b05 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -141,83 +141,83 @@ def get_scale_factor_table(dataset="all_baseline"): LABEL_TABLE = { # Times and durations ("sync_time_Ff", "index_sequence"): "Time", - "normalized_time": "Normalized Interphase Time", - "colony_time": "Aligned Colony Time", + "normalized_time": "Normalized interphase time", + "colony_time": "Aligned colony time", ("Ff", "frame_formation"): "Formation", - ("frame_inflection", "frame_transition"): "Transtion", + ("frame_inflection", "frame_transition"): "Transition", ("Fb", "frame_breakdown"): "Breakdown", - "time_at_A": "Time at Formation", + "time_at_A": "Time at formation", "time_at_B": "Starting movie time", - "time_at_C": "Movie Time at Breakdown", - "colony_time_at_A": 
"Aligned Colony Time at Formation", - "colony_time_at_B": "Starting Aligned Colony Time", - "colony_time_at_C": "Ending Aligned Colony Time", - "duration_AB": "Rapid Expansion Duration", - ("duration_BC", "duration_BC_hr"): "Growth Duration", + "time_at_C": "Movie time at breakdown", + "colony_time_at_A": "Aligned colony time at formation", + "colony_time_at_B": "Starting aligned colony time", + "colony_time_at_C": "Ending aligned colony time", + "duration_AB": "Rapid expansion duration", + ("duration_BC", "duration_BC_hr"): "Growth duration", # Volume "volume": "Volume", - "volume_at_A": "Volume at Formation", + "volume_at_A": "Volume at formation", "volume_at_B": "Starting volume", - "volume_at_C": "Ending Volume", - "Volume_C": "Volume at Breakdown", - "volume_fold_change_BC": "Volume Fold-Change", - "volume_fold_change_fromB": "Volume Fold-Change relative to Starting Volume", - "delta_volume_BC": "Added Volume", - "difference_volume_at_B": "Difference in Starting Volume", - "difference_half_vol_at_C_and_B": "1/2 Mother Ending Volume - Daughter Starting Volume", - "avg_sister_volume_fold_change_BC": "Sisters Average Volume Fold-Change", - "avg_sister_volume_at_B": "Sisters Average Starting Volume", + "volume_at_C": "Ending volume", + "Volume_C": "Volume at breakdown", + "volume_fold_change_BC": "Volume fold-change", + "volume_fold_change_fromB": "Volume fold-change relative to starting volume", + "delta_volume_BC": "Added volume", + "difference_volume_at_B": "Difference in starting volume", + "difference_half_vol_at_C_and_B": "1/2 mother ending volume - daughter starting volume", + "avg_sister_volume_fold_change_BC": "Sisters average volume fold-change", + "avg_sister_volume_at_B": "Sisters average starting volume", "volume_sub": "Change in volume", # Growth rates - "exp_growth_coeff_BC": "Exponetial Growth Coeff. 
B to C", - "linear_growth_rate_BC": "Late Growth Rate", - "tscale_linearityfit_volume": "Fitted Time Scaling Factor (\u03B1)", - "RMSE_linearityfit_volume": "Root Mean Squared Error", - "late_growth_rate_by_endpoints": "Growth Rate", + "exp_growth_coeff_BC": "Exponential growth coeff. B to C", + "linear_growth_rate_BC": "Late growth rate", + "tscale_linearityfit_volume": "Fitted time scaling factor (\u03B1)", + "RMSE_linearityfit_volume": "Root mean squared error", + "late_growth_rate_by_endpoints": "Growth rate", "dxdt_t2-dxdt_t1": "Late average transient growth rate - early average transient growth rate", # Height "height": "Height", - "avg_height": "Average Height", - "height_fold_change_BC": "Growth Height Fold-Change", + "avg_height": "Average height", + "height_fold_change_BC": "Growth height fold-change", "height_at_B": "Starting height", - "height_at_C": "Ending Height", + "height_at_C": "Ending height", # Surface Area - "mesh_sa": "Surface Area", - "SA_at_A": "Surface Area at A", - "SA_at_B": "Starting Surface Area", - "SA_at_C": "Ending Surface Area", - "delta_SA_BC": "\u0394Surface Area B to C", - "SA_fold_change_BC": "Surface Area Fold-Change B to C", - "SA_fold_change_fromB": "Surface Area Fold-Change", - "SA_vol_ratio": "SA/Volume", - "tscale_linearityfit_SA": "Fitted Surface Area Time Scaling (\u03B1)", - "RMSE_linearityfit_SA": "Root Mean Squared Error", + "mesh_sa": "Surface area", + "SA_at_A": "Surface area at A", + "SA_at_B": "Starting surface area", + "SA_at_C": "Ending surface area", + "delta_SA_BC": "\u0394Surface area B to C", + "SA_fold_change_BC": "Surface area fold-change B to C", + "SA_fold_change_fromB": "Surface area fold-change", + "SA_vol_ratio": "SA/volume", + "tscale_linearityfit_SA": "Fitted surface area time scaling (\u03B1)", + "RMSE_linearityfit_SA": "Root mean squared error", # Dimensions beyond height "length": "XY short axis width", "width": "XY long axis length", # Aspect Ratio - "xy_aspect": "XY Aspect Ratio", - 
"xz_aspect": "XZ Aspect Ratio", - "zy_aspect": "YZ Aspect Ratio", - "xz_aspect_fold_change_BC": "XZ Aspect Ratio Fold-Change B to C", - "avg_xz_aspect_ratio": "Average XZ Aspect Ratio", - "xz_aspect_at_B": "Starting XZ aspect ratio", + "xy_aspect": "Xy aspect ratio", + "xz_aspect": "Xz aspect ratio", + "zy_aspect": "Yz aspect ratio", + "xz_aspect_fold_change_BC": "Xz aspect ratio fold-change B to C", + "avg_xz_aspect_ratio": "Average xz aspect ratio", + "xz_aspect_at_B": "Starting xz aspect ratio", # Colony Position "distance": "Distance", - "distance_from_centroid": "Distance From Centroid", - "normalized_distance_from_centroid": "Normalized Distance From Centroid", - "max_distance_from_centroid": "Max Distance From Centroid", - "colony_depth": "Colony Depth", - "normalized_colony_depth": "Normalized Colony Depth", - "max_colony_depth": "Max Colony Depth", - "avg_colony_depth": "Average Colony Depth", + "distance_from_centroid": "Distance from centroid", + "normalized_distance_from_centroid": "Normalized distance from centroid", + "max_distance_from_centroid": "Max distance from centroid", + "colony_depth": "Colony depth", + "normalized_colony_depth": "Normalized colony depth", + "max_colony_depth": "Max colony depth", + "avg_colony_depth": "Average colony depth", # Density - "colony_non_circularity": "Colony Non-circularity", - "colony_non_circularity_scaled": "Scaled Colony Non-circularity", - "avg_early_density": "Early Density", - "avg_late_density": "Late Density", + "colony_non_circularity": "Colony non-circularity", + "colony_non_circularity_scaled": "Scaled colony non-circularity", + "avg_early_density": "Early density", + "avg_late_density": "Late density", "density": "Density", - "avg_density": "Average Density", + "avg_density": "Average density", # Lineage "parent_id": "Parent ID", "family_id": "Family ID", @@ -225,13 +225,13 @@ def get_scale_factor_table(dataset="all_baseline"): "sisters_duration_BC": "Sisters growth duration", 
"sisters_delta_volume_BC": "Sisters added volume", # Flags - "is_outlier": "Outlier Flag", - "is_tp_outlier": "Single Timepoint Outlier", - "is_outlier_track": "Outlier Track Flag", - "is_growth_outlier": "Growth Feature Outlier Flag", - "fov_edge": "FOV-edge Flag", - "termination": "Track Termination", - "is_full_track": "Full Interphase Track Flag", + "is_outlier": "Outlier flag", + "is_tp_outlier": "Single timepoint outlier", + "is_outlier_track": "Outlier track flag", + "is_growth_outlier": "Growth feature outlier flag", + "fov_edge": "FOV-edge flag", + "termination": "Track termination", + "is_full_track": "Full interphase track flag", # colony segmentations "colony_area": "area of colony (brightfield)", "nucleus_colony_area_ratio": "ratio of nuclear area to colony area", @@ -240,27 +240,27 @@ def get_scale_factor_table(dataset="all_baseline"): "height_at_B": "Starting height", "density_at_B": "Starting density", "xy_aspect_at_B": "Starting XY aspect ratio", - "SA_vol_ratio_at_B": "Starting SA/Volume ratio", - "early_transient_gr_90um": "~Starting avg. transient growth rate in 90 \u00B5m radius", - 'neighbor_avg_lrm_volume_90um_at_B': "Starting avg. volume in 90 \u00B5m radius", - 'neighbor_avg_lrm_height_90um_at_B': "Starting avg. height in 90 \u00B5m radius", - 'neighbor_avg_lrm_density_90um_at_B': "Starting avg. density in 90 \u00B5m radius", - 'neighbor_avg_lrm_xy_aspect_90um_at_B': "Starting avg. XY aspect ratio in 90 \u00B5m radius", - 'neighbor_avg_lrm_mesh_sa_90um_at_B': "Starting avg. surface area in 90 \u00B5m radius", - "mean_neighbor_avg_dxdt_48_volume_90um": "Avg. mean transient growth rate in 90 \u00B5m radius", - 'mean_neighbor_avg_lrm_volume_90um': " Avg. mean volume in 90 \u00B5m radius", - 'mean_neighbor_avg_lrm_height_90um': "Avg. mean height in 90 \u00B5m radius", - 'mean_neighbor_avg_lrm_density_90um': "Avg. mean density in 90 \u00B5m radius", - 'mean_neighbor_avg_lrm_xy_aspect_90um': "Avg. 
mean XY aspect ratio in 90 \u00B5m radius", - 'mean_neighbor_avg_lrm_mesh_sa_90um': "Avg. mean surface area in 90 \u00B5m radius", - 'mean_neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um': "Avg. mean density in 90 \u00B5m radius", + "SA_vol_ratio_at_B": "Starting surface area/volume ratio", + "early_transient_gr_90um": "Neighborhood avg. ~starting transient growth rate", + 'neighbor_avg_lrm_volume_90um_at_B': "Neighborhood avg. starting volume", + 'neighbor_avg_lrm_height_90um_at_B': "Neighborhood avg. starting height", + 'neighbor_avg_lrm_density_90um_at_B': "Neighborhood avg. starting density", + 'neighbor_avg_lrm_xy_aspect_90um_at_B': "Neighborhood avg. starting XY aspect ratio", + 'neighbor_avg_lrm_mesh_sa_90um_at_B': "Neighborhood avg. starting surface area", + "mean_neighbor_avg_dxdt_48_volume_90um": "Neighborhood avg. mean transient growth rate", + 'mean_neighbor_avg_lrm_volume_90um': "Neighborhood avg. mean volume", + 'mean_neighbor_avg_lrm_height_90um': "Neighborhood avg. mean height", + 'mean_neighbor_avg_lrm_density_90um': "Neighborhood avg. mean density", + 'mean_neighbor_avg_lrm_xy_aspect_90um': "Neighborhood avg. mean XY aspect ratio", + 'mean_neighbor_avg_lrm_mesh_sa_90um': "Neighborhood avg. mean surface area", + 'mean_neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um': "Neighborhood avg. mean density", # mitotic and apoptotic neighbor columns "number_of_frame_of_breakdown_neighbors": "# of neighboring cells undergoing breakdown", "number_of_frame_of_formation_neighbors": "# of neighboring cells undergoing formation", "number_of_frame_of_death_neighbors": "# of neighboring cells undergoing death", - "normalized_sum_has_mitotic_neighbor": "Norm. sum of mitotic neighbors", - "normalized_sum_has_dying_neighbor": "Norm. 
sum of dying neighbors", # 2D area features "2d_area_nuc_cell_ratio": "Nucleus area/(Pseudo)cell area", "2d_area_nucleus": "Nuclear area", From c8ce61c52adaa00486392be6e760349996cf9a94 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 28 Oct 2024 13:12:32 -0700 Subject: [PATCH 59/68] save w/ layout tight so pdf plots don't get cut off --- .../analyses/linear_regression/analysis_plots.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index cccbfbe4..addc13f5 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -58,7 +58,7 @@ def plot_feature_cluster_correlations(df_track_level_features, feature_list, fig cluster_grid.ax_heatmap.tick_params(axis='x', labelsize=12, width=0.7) cluster_grid.ax_heatmap.tick_params(axis='y', labelsize=12, width=0.7, labelright=False, labelleft=True, left=True, right=False) - save_and_show_plot(f'{figdir}/feature_correlation_clustermap', figure=cluster_grid.fig, dpi=300) + save_and_show_plot(f'{figdir}/feature_correlation_clustermap', figure=cluster_grid.fig, bbox_inches='tight', dpi=300) def run_regression(df_track_level_features, target, features, name, alpha, figdir): @@ -187,7 +187,7 @@ def replace_values(val): ax.tick_params(axis='both', which='both', length=0) title = ax.set_title(f'Target: {get_plot_labels_for_metric(target)[1]}', loc='left') title.set_position([-0.1,1]) - save_and_show_plot(f'{figdir}{target}_prediction_r_squared_matrix_alpha_{df.alpha[0]}') + save_and_show_plot(f'{figdir}{target}_prediction_r_squared_matrix_alpha_{df.alpha[0]}', bbox_inches='tight') def plot_feature_contribution(coef_alpha, 
test_sc, perms, target, fig_height, figdir): @@ -227,7 +227,7 @@ def plot_feature_contribution(coef_alpha, test_sc, perms, target, fig_height, fi coef_alpha = coef_alpha[coef_alpha["Column"] != col] coef_alpha['Magnitude coefficient importance'] = abs(coef_alpha['Coefficient Importance']) - coef_alpha['Sign'] = coef_alpha['Coefficient Importance'].apply(lambda x: 'Positive coefficient' if x > 0 else 'Negative coefficient') + coef_alpha['Sign'] = coef_alpha['Coefficient Importance'].apply(lambda x: 'Positive' if x > 0 else 'Negative') coef_alpha['Mean Magnitude'] = coef_alpha.groupby('Column')['Magnitude coefficient importance'].transform('mean') coef_alpha = coef_alpha.sort_values('Mean Magnitude', ascending=False).drop(columns=['Mean Magnitude']) @@ -238,7 +238,7 @@ def plot_feature_contribution(coef_alpha, test_sc, perms, target, fig_height, fi y="Column", x="Magnitude coefficient importance", hue="Sign", - palette={'Positive coefficient': '#156082', 'Negative coefficient': 'grey'}, + palette={'Positive': '#156082', 'Negative': 'grey'}, errorbar="sd", width=0.7, native_scale=True) From ac26b747aeeadc9ad2b8dafd2f017ad7b4a63887 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Mon, 28 Oct 2024 17:13:00 -0700 Subject: [PATCH 60/68] remove hard coded max alpha, default pre-compute --- .../linear_regression/analysis_plots.py | 10 ++-- .../linear_regression/select_features.py | 13 ----- .../supplemental_lrm_figure_workflow.py | 50 +++++++++++-------- 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index addc13f5..8ecf9d19 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -61,7 +61,7 @@ def plot_feature_cluster_correlations(df_track_level_features, feature_list, fig 
save_and_show_plot(f'{figdir}/feature_correlation_clustermap', figure=cluster_grid.fig, bbox_inches='tight', dpi=300) -def run_regression(df_track_level_features, target, features, name, alpha, figdir): +def run_regression(df_track_level_features, target, features, name, alpha, figdir="figures/"): """ Run linear regression on the given dataset and return the results. @@ -94,7 +94,7 @@ def run_regression(df_track_level_features, target, features, name, alpha, figdi std = round(all_test_sc["Test r$^2$"].std(), 3) return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'stdev': std, 'alpha': 0, 'feats_used': get_feature_list(features, target)} -def run_regression_workflow(targets, feature_configs, df_track_level_features, figdir, alpha): +def run_regression_workflow(targets, feature_configs, df_track_level_features, alpha, figdir="figures/"): """ Run the regression workflow for multiple targets and feature configurations. @@ -190,7 +190,7 @@ def replace_values(val): save_and_show_plot(f'{figdir}{target}_prediction_r_squared_matrix_alpha_{df.alpha[0]}', bbox_inches='tight') -def plot_feature_contribution(coef_alpha, test_sc, perms, target, fig_height, figdir): +def plot_feature_contribution(coef_alpha, test_sc, perms, target, alpha, fig_height, figdir): """ For a given target, plot feature importance for each feature in the linear model at a specified alpha. Features that touch 0 are considered not important and are excluded from the plot. 
@@ -205,6 +205,8 @@ def plot_feature_contribution(coef_alpha, test_sc, perms, target, fig_height, fi DataFrame containing the permutation test results target: str Prediction feature + alpha: int + Regularization parameter fig_height: int Height of the figure based on number of important features save_path: str @@ -214,8 +216,6 @@ def plot_feature_contribution(coef_alpha, test_sc, perms, target, fig_height, fi ------- Figure """ - - alpha = coef_alpha["alpha"].unique()[0] p_value = round(perms["p_value"].item(), 3) test_r2_mean = round(test_sc["Test r$^2$"].mean(), 2) test_r2_std = round(test_sc["Test r$^2$"].std() / 2, 2) diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index 1d3104b4..49b50682 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -55,19 +55,6 @@ ] } -TARGET_SETTINGS = { - 'duration_BC': { - 'tolerance': 0.05, - 'max_alpha': 1.4, - 'fig_height': 6, - }, - 'delta_volume_BC': { - 'tolerance': 0.05, - 'max_alpha': 10.2, - 'fig_height': 2, - } -} - def get_feature_list(feature_group_list, target): """ Get feature list to include in linear model. 
diff --git a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py index f1f12a2a..83bb6787 100644 --- a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py @@ -14,8 +14,7 @@ df_full = filter_data.all_timepoints_full_tracks(df_all) df_track_level_features = filter_data.track_level_features(df_full) -FIGDIR='linear_regression/figures/' -TARGETS = ['duration_BC', 'delta_volume_BC'] +#%% CONFIG = { 'all_features': ['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], 'start_intrinsic': ['start_intrinsic'], @@ -25,8 +24,8 @@ 'intrinsic': ['start_intrinsic', 'lifetime_intrinsic'], 'extrinsic': ['start_extrinsic', 'lifetime_extrinsic'], } -EXTENDED_WORKFLOW = False - +FIGDIR='linear_regression/figures/' +TARGETS = ['duration_BC', 'delta_volume_BC'] #%% Preprocess dataframe to ensure same N for all analysis (full tracks with lineage features) dropna_cols = get_feature_list(CONFIG['all_features'], None) @@ -34,30 +33,39 @@ print(f"Number of tracks: {len(data)}") #%% Create maxtrix of r squared values -df = run_regression_workflow(TARGETS, CONFIG, data, FIGDIR, alpha=0) +df = run_regression_workflow(TARGETS, CONFIG, data, alpha=0) plot_heatmap(df, FIGDIR, 'YlOrRd') -#%% Plot feature importance -for target in ['duration_BC', 'delta_volume_BC']: - df_alpha, df_test, df_coeff = fit_linear_regression(data, - cols=get_feature_list(CONFIG['all_features'], target), - target=target, alpha=[TARGET_SETTINGS[target]['max_alpha']], - tol=TARGET_SETTINGS[target]['tolerance'], save_path=FIGDIR, save=False) - - plot_feature_contribution(df_alpha, df_test, df_coeff, target, TARGET_SETTINGS[target]['fig_height'], FIGDIR) - -#%% Plot feature correlations using all full tracks -plot_feature_cluster_correlations(df_track_level_features, 
get_feature_list(CONFIG['all_features'], None), FIGDIR) +#%% +RECOMPUTE_ALPHA = True +TOLERANCE = 0.05 +computed_alpha = {'duration_BC': 1.4, + 'delta_volume_BC': 10.2} -#%% Create movie of increasing alpha until tolerance of 0.05 is reached -if EXTENDED_WORKFLOW: +#%% Recompute maximum alpha when tolerance is reached +if RECOMPUTE_ALPHA: for target in ['duration_BC', 'delta_volume_BC']: - fit_linear_regression( + all_coef_alpha, all_test_sc, all_perms = fit_linear_regression( data, cols=get_feature_list(CONFIG['all_features'], target), target=target, alpha=np.arange(0, 15, 0.2, dtype=float), - tol=TARGET_SETTINGS[target]['tolerance'], + tol=TOLERANCE, save_path="figures/", save=True) -# %% + + max_alpha = all_coef_alpha.alpha.max() + computed_alpha[target] = max_alpha + print(f"{target}: {max_alpha}") + +#%% Plot feature importance for max alpha when tolerance is reached +for target, fig_height in zip(['duration_BC', 'delta_volume_BC'], [6, 2]): + df_alpha, df_test, df_coeff = fit_linear_regression(data, + cols=get_feature_list(CONFIG['all_features'], target), + target=target, alpha=[computed_alpha[target]], + tol=TOLERANCE, save_path=FIGDIR, save=False) + plot_feature_contribution(df_alpha, df_test, df_coeff, target, computed_alpha[target], fig_height, FIGDIR) + +#%% Plot feature correlations using all full tracks +plot_feature_cluster_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) +#%% From d816f2d86f8215cebe902f172ec1618d4e5db28c Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 29 Oct 2024 09:08:37 -0700 Subject: [PATCH 61/68] update imports --- .../linear_regression/supplemental_lrm_figure_workflow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py index 83bb6787..80286847 100644 --- 
a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py @@ -2,12 +2,11 @@ import numpy as np from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data from nuc_morph_analysis.analyses.linear_regression.linear_regression import fit_linear_regression +from nuc_morph_analysis.analyses.linear_regression.select_features import get_feature_list from nuc_morph_analysis.analyses.linear_regression.analysis_plots import (run_regression_workflow, plot_feature_cluster_correlations, plot_heatmap, plot_feature_contribution) -from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list, - TARGET_SETTINGS) #%% df_all = global_dataset_filtering.load_dataset_with_features() From fc4c310c98f947d8023b3d8c4f176b95e9effc53 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Tue, 29 Oct 2024 13:31:26 -0700 Subject: [PATCH 62/68] one off change to cell health workflow to print a reported metric! 
--- nuc_morph_analysis/analyses/cell_health/cell_health_plots.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nuc_morph_analysis/analyses/cell_health/cell_health_plots.py b/nuc_morph_analysis/analyses/cell_health/cell_health_plots.py index 6740a399..8743a375 100644 --- a/nuc_morph_analysis/analyses/cell_health/cell_health_plots.py +++ b/nuc_morph_analysis/analyses/cell_health/cell_health_plots.py @@ -87,6 +87,8 @@ def plot_event_histogram(df, event_type, figdir): ax[2].set_xlabel('Time (hr)') plt.tight_layout() + print(f"{colony}: max {percent_event.max():.2f}%") + save_and_show_plot(f'{figdir}/{event_label}_histogram_{colony}') From 785e4955f88bf1a9d6164525fd993d3974ca7341 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Wed, 30 Oct 2024 09:31:29 -0700 Subject: [PATCH 63/68] default to load all features if none specified --- .../analyses/linear_regression/linear_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py index 825e49b3..379e1842 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py @@ -47,7 +47,7 @@ def main( save_path.mkdir(parents=True, exist_ok=True) if len(cols) < 1: - cols = get_feature_list(["features", "lineage_feats"], None) + cols = get_feature_list(['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], None) if not cached_dataframe: df_all = global_dataset_filtering.load_dataset_with_features() From 2982d3c2e7f1686240519818873ea6edc45eddaa Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Thu, 31 Oct 2024 15:39:53 -0700 Subject: [PATCH 64/68] PR comments - remove rounding --- .../analyses/linear_regression/analysis_plots.py | 6 +++--- .../analyses/linear_regression/linear_regression.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) 
diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py index 8ecf9d19..708fbfc3 100644 --- a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -90,8 +90,8 @@ def run_regression(df_track_level_features, target, features, name, alpha, figdi multiple_predictions=False ) print(f"Target {target}, Alpha: {alpha}. Feature group: {name}") - r_squared = round(all_test_sc["Test r$^2$"].mean(), 3) - std = round(all_test_sc["Test r$^2$"].std(), 3) + r_squared = all_test_sc["Test r$^2$"].mean() + std = all_test_sc["Test r$^2$"].std() return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'stdev': std, 'alpha': 0, 'feats_used': get_feature_list(features, target)} def run_regression_workflow(targets, feature_configs, df_track_level_features, alpha, figdir="figures/"): @@ -224,7 +224,7 @@ def plot_feature_contribution(coef_alpha, test_sc, perms, target, alpha, fig_hei lower_bound = df_col["Coefficient Importance"].mean() - df_col["Coefficient Importance"].std() upper_bound = df_col["Coefficient Importance"].mean() + df_col["Coefficient Importance"].std() if lower_bound < 0 and upper_bound > 0 or df_col["Coefficient Importance"].mean() == 0: - coef_alpha = coef_alpha[coef_alpha["Column"] != col] + coef_alpha = coef_alpha[coef_alpha["Column"] != col] # if coeff importance is 0, dont plot feature coef_alpha['Magnitude coefficient importance'] = abs(coef_alpha['Coefficient Importance']) coef_alpha['Sign'] = coef_alpha['Coefficient Importance'].apply(lambda x: 'Positive' if x > 0 else 'Negative') diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py index 379e1842..ea0e5363 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression.py +++ 
b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py @@ -79,6 +79,7 @@ def fit_linear_regression( save_path - location to save files save - whether to save movies and pngs permute_col - list of features to permute and replace with noise + multiple_predictions - Boolean, whether to stop when r^2 is reduced by tolerance """ sns.set_context("talk") random_state = 2652124 From d237b4884bbdac6a21a951f0e65f4146814533e5 Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Thu, 31 Oct 2024 16:03:50 -0700 Subject: [PATCH 65/68] update feature name --- .../analyses/linear_regression/select_features.py | 2 +- nuc_morph_analysis/lib/preprocessing/add_features.py | 4 ++-- nuc_morph_analysis/lib/visualization/label_tables.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py index 49b50682..0d714f3e 100644 --- a/nuc_morph_analysis/analyses/linear_regression/select_features.py +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -25,7 +25,7 @@ 'neighbor_avg_lrm_height_90um_at_B', 'neighbor_avg_lrm_xy_aspect_90um_at_B', 'neighbor_avg_lrm_mesh_sa_90um_at_B', - 'early_transient_gr_90um', + 'early_neighbor_avg_dxdt_48_volume_90um', ], 'lifetime_extrinsic': [ # extrinsic lifetime diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 26bc2084..440f1ab8 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -198,8 +198,8 @@ def get_early_transient_gr_of_neighborhood(df, scale, time_shift=24, window_leng for tid, dft in df.groupby("track_id"): t_calculate = dft.index_sequence.min() + time_shift time_window_mask = dft.index_sequence.between(t_calculate, t_calculate + window_length) - transient_gr_whole_colony = dft.loc[time_window_mask, 
"neighbor_avg_dxdt_48_volume_90um"].mean() - df.loc[df.track_id == tid, "early_transient_gr_90um"] = transient_gr_whole_colony * scale + transient_gr_90um = dft.loc[time_window_mask, "neighbor_avg_dxdt_48_volume_90um"].mean() + df.loc[df.track_id == tid, "early_neighbor_avg_dxdt_48_volume_90um"] = transient_gr_90um * scale return df diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index 42664b05..108b9560 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -241,7 +241,7 @@ def get_scale_factor_table(dataset="all_baseline"): "density_at_B": "Starting density", "xy_aspect_at_B": "Starting XY aspect ratio", "SA_vol_ratio_at_B": "Starting surface area/volume ratio", - "early_transient_gr_90um": "Neighborhood avg. ~starting transient growth rate", + "early_neighbor_avg_dxdt_48_volume_90um": "Neighborhood avg. ~starting transient growth rate", 'neighbor_avg_lrm_volume_90um_at_B': "Neighborhood avg. starting volume", 'neighbor_avg_lrm_height_90um_at_B': "Neighborhood avg. starting height", 'neighbor_avg_lrm_density_90um_at_B': "Neighborhood avg. 
starting density", From 6558f3cb6a87e1f8c4af3d8cabcac7242bdde9ed Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Thu, 31 Oct 2024 16:04:14 -0700 Subject: [PATCH 66/68] PR comments - documentation, remove dropna step --- .../linear_regression/linear_regression.py | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py index ea0e5363..49be203f 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py @@ -41,7 +41,30 @@ def main( cached_dataframe=None, save_movie=False, ): - + """ + Main function to perform linear regression analysis. + + Parameters + ---------- + cols : list of str + List of column names to be used as features for the regression. + target : str + The target column name for the regression. + alpha_range : tuple of float + The range of alpha values (regularization strength) to be tested. + tolerance : float + The tolerance for the optimization. + save_path : str + The path where the results and plots will be saved. + cached_dataframe : pd.DataFrame, optional + A cached DataFrame to use for the analysis. If None, a new DataFrame will be created. + save_movie : bool, optional + If True, a movie of the regression process will be saved. Default is False. + + Returns + ------- + None + """ save_path = Path(save_path) save_path = save_path / Path("linear_regression") save_path.mkdir(parents=True, exist_ok=True) @@ -72,7 +95,7 @@ def fit_linear_regression( ): """ data - track level features - cols - input features + cols - input features, must not contain rows with nans target - target to predict alpha - hyperparameter for lasso tol - tolerance to check drop in r^2 for finding best alpha (ex. 
0.02) @@ -102,11 +125,6 @@ def fit_linear_regression( # find best alpha for Lasso model for alpha_ind, this_alpha in tqdm(enumerate(alpha), total=len(alpha)): - # drop any nan rows - dropna_cols = cols + [target] - data = data.dropna(subset=dropna_cols) - print(f"number of tracks: {len(data)}") - # permute columns if necessary if len(permute_cols) > 0: for col in permute_cols: @@ -140,11 +158,10 @@ def fit_linear_regression( # break if permutation score is less than linear regression value (max possible) # with a tolerance # or if p_value > 0.05 - rounded_permutation_score = round(score, 2) if alpha_ind == 0: - max_val = rounded_permutation_score + max_val = score if multiple_predictions: - if abs(rounded_permutation_score - max_val) > tol or (pvalue > 0.05): + if abs(score - max_val) > tol or (pvalue > 0.05): break # if relatively equal to linear regression value, then continue @@ -175,8 +192,8 @@ def fit_linear_regression( ) # Save test r^2 and test MSE to dataframe - range_test_scores = [round(i, 2) for i in cv_model["test_r2"]] - range_errors = [round(i, 2) for i in cv_model["test_neg_mean_squared_error"]] + range_test_scores = [i for i in cv_model["test_r2"]] + range_errors = [i for i in cv_model["test_neg_mean_squared_error"]] test_sc = pd.DataFrame() test_sc[r"Test r$^2$"] = range_test_scores test_sc["Test MSE"] = range_errors From 5440777d3a37b0508fbaa88517316a890d41645f Mon Sep 17 00:00:00 2001 From: Chantelle Leveille Date: Thu, 31 Oct 2024 17:13:30 -0700 Subject: [PATCH 67/68] PR comment - remove rounding alpha --- .../analyses/linear_regression/linear_regression.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py index 49be203f..56a8d235 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py @@ 
-97,7 +97,7 @@ def fit_linear_regression( data - track level features cols - input features, must not contain rows with nans target - target to predict - alpha - hyperparameter for lasso + alpha - hyperparameter for lasso tol - tolerance to check drop in r^2 for finding best alpha (ex. 0.02) save_path - location to save files save - whether to save movies and pngs @@ -121,7 +121,7 @@ def fit_linear_regression( if multiple_predictions: # remove 0 alpha due to convergence errors alpha = [i for i in alpha if i != 0] - alpha = [round(i, 1) for i in alpha] + # find best alpha for Lasso model for alpha_ind, this_alpha in tqdm(enumerate(alpha), total=len(alpha)): @@ -247,6 +247,7 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): drop=True ) this_perms = all_perms.loc[all_perms["alpha"] == alpha].reset_index(drop=True) + p_value = round(this_perms["p_value"].item(), 3) test_r2_mean = round(this_test_sc["Test r$^2$"].mean(), 2) test_r2_std = round(this_test_sc["Test r$^2$"].std() / 2, 2) @@ -265,15 +266,15 @@ def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): g.fig.subplots_adjust(top=0.9) # adjust the Figure in rp g.fig.suptitle( - f"Prediction of {get_plot_labels_for_metric(target)[1]}\nalpha={alpha}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" + f"Prediction of {get_plot_labels_for_metric(target)[1]}\nalpha={alpha:.2f}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" ) label_list = [ get_plot_labels_for_metric(col)[1] for col in all_coef_alpha["Column"].unique() ] g.set_yticklabels(label_list) - print(f"Saving coefficients_{target}_alpha_{alpha}.png") - this_path = str(save_path / Path(f"coefficients_{target}_alpha_{alpha}.png")) + print(f"Saving coefficients_{target}_alpha_{alpha:.2f}.png") + this_path = str(save_path / Path(f"coefficients_{target}_alpha_{alpha:.2f}.png")) files.append(this_path) if not xlim: From 964d45b4ce1f7c94279b2f704bc01d055903239d Mon Sep 17 00:00:00 2001 From: Chantelle 
Leveille Date: Fri, 8 Nov 2024 10:29:55 -0800 Subject: [PATCH 68/68] update correlation plot --- .../linear_regression/supplemental_lrm_figure_workflow.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py index 80286847..c1806c82 100644 --- a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py @@ -65,6 +65,7 @@ tol=TOLERANCE, save_path=FIGDIR, save=False) plot_feature_contribution(df_alpha, df_test, df_coeff, target, computed_alpha[target], fig_height, FIGDIR) -#%% Plot feature correlations using all full tracks -plot_feature_cluster_correlations(df_track_level_features, get_feature_list(CONFIG['all_features'], None), FIGDIR) -#%% +#%% Plot feature correlations using lineage full tracks +plot_feature_cluster_correlations(data, get_feature_list(CONFIG['all_features'], None), FIGDIR) + +# %%