diff --git a/nuc_morph_analysis/analyses/cell_health/cell_health_plots.py b/nuc_morph_analysis/analyses/cell_health/cell_health_plots.py index 6740a399..8743a375 100644 --- a/nuc_morph_analysis/analyses/cell_health/cell_health_plots.py +++ b/nuc_morph_analysis/analyses/cell_health/cell_health_plots.py @@ -87,6 +87,8 @@ def plot_event_histogram(df, event_type, figdir): ax[2].set_xlabel('Time (hr)') plt.tight_layout() + print(f"{colony}: max {percent_event.max():.2f}%") + save_and_show_plot(f'{figdir}/{event_label}_histogram_{colony}') diff --git a/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py new file mode 100644 index 00000000..708fbfc3 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/analysis_plots.py @@ -0,0 +1,265 @@ +import seaborn as sns +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric +from nuc_morph_analysis.lib.visualization.notebook_tools import save_and_show_plot +from nuc_morph_analysis.analyses.linear_regression.linear_regression import fit_linear_regression +from nuc_morph_analysis.analyses.linear_regression.select_features import (get_feature_list) + +def plot_feature_cluster_correlations(df_track_level_features, feature_list, figdir): + """ + Plot clustermap of feature correlations. + + Parameters + ---------- + df_track_level_features : pd.DataFrame + DataFrame containing track level features + feature_list : list + List of features to include in the clustermap + Output from get_feature_list + figdir : str + Directory to save the figure + + Returns + ------- + Figure + """ + data = df_track_level_features[feature_list] + + cluster_grid = sns.clustermap(data.corr(), vmin=-1, vmax=1, cmap='BrBG', #'vlag' red to blue + cbar_pos=(0.4, 0.9, 0.3, 0.02), + cbar_kws={"orientation": "horizontal"}, + annot=True, fmt=".1f", annot_kws={"size": 12}, + figsize=(18, 18)) + + + # Hide the dendrograms + cluster_grid.ax_row_dendrogram.set_visible(False) + cluster_grid.ax_col_dendrogram.set_visible(False) + + # Get the reordered labels using the dendrogram information + reordered_column_index = cluster_grid.dendrogram_col.reordered_ind + reordered_labels = [get_plot_labels_for_metric(data.columns[i])[1] for i in reordered_column_index] + + # add a number to the end of each label + reordered_labels = [f'{label} ({i+1})' for i, label in enumerate(reordered_labels)] + + # for the bottom label just use numbers + numbered_labels = [f'{i+1}' for i in range(len(reordered_labels))] + + # Ensure the number of labels matches the number of ticks + cluster_grid.ax_heatmap.set_xticks([x + 0.5 for x in range(len(numbered_labels))]) + cluster_grid.ax_heatmap.set_xticklabels(numbered_labels, rotation=0) + cluster_grid.ax_heatmap.set_yticks([y + 0.5 for y in range(len(reordered_labels))]) + cluster_grid.ax_heatmap.set_yticklabels(reordered_labels, rotation=0) + + # Adjust the padding between the labels and the heatmap + cluster_grid.ax_heatmap.tick_params(axis='x', labelsize=12, width=0.7) + cluster_grid.ax_heatmap.tick_params(axis='y', labelsize=12, width=0.7, labelright=False, labelleft=True, left=True, right=False) + + save_and_show_plot(f'{figdir}/feature_correlation_clustermap', figure=cluster_grid.fig, bbox_inches='tight', dpi=300) + + +def run_regression(df_track_level_features, target, features, name, alpha, figdir="figures/"): + """ + Run linear regression on the given dataset and return the results. + + Parameters: + ---------- + df_track_level_features (pd.DataFrame): DataFrame containing the track level features. + target (str): The target variable for regression. + features (list): List of features to be used for regression. + name (str): Name of the feature group. + alpha (list): List of alpha values for regularization. + figdir (str): Directory path to save the figures. + + Returns: + -------- + dict: A dictionary containing the target, feature group name, mean R-squared value, + standard deviation of R-squared values, alpha value, and the features used. + """ + _, all_test_sc, _ = fit_linear_regression( + df_track_level_features, + cols=get_feature_list(features, target), + target=target, + alpha=alpha, + tol=0.04, + save_path=figdir, + save=False, + multiple_predictions=False + ) + print(f"Target {target}, Alpha: {alpha}. Feature group: {name}") + r_squared = all_test_sc["Test r$^2$"].mean() + std = all_test_sc["Test r$^2$"].std() + return {'target': target, 'feature_group': name, 'r_squared': r_squared, 'stdev': std, 'alpha': 0, 'feats_used': get_feature_list(features, target)} + +def run_regression_workflow(targets, feature_configs, df_track_level_features, alpha, figdir="figures/"): + """ + Run the regression workflow for multiple targets and feature configurations. + + Parameters: + ---------- + targets (list): List of target variables for regression. + feature_configs (dict): Dictionary where keys are feature group names and values are lists of features. + df_track_level_features (pd.DataFrame): DataFrame containing the track level features. + figdir (str): Directory path to save the figures and results. + alpha (float): Alpha value for regularization. + + Returns: + -------- + pd.DataFrame: DataFrame containing the results of the regression workflow, including target, + R-squared value, standard deviation, feature group, alpha value, and features used. + """ + df = pd.DataFrame(columns=['target', 'r_squared', 'stdev', 'feature_group', 'alpha', 'feats_used']) + + for target in targets: + for name, features in feature_configs.items(): + result = run_regression(df_track_level_features, target, features, name, [alpha], figdir) + df = df.append(result, ignore_index=True) + df.to_csv(f"{figdir}r_squared_results.csv") + + df['num_feats_used'] = df['feats_used'].apply(lambda x: len(x)) + + return df + + +def plot_heatmap(df, figdir, cmap='coolwarm'): + """ + Plot heatmap of r_squared values for different feature groups. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing r_squared values for different feature groups + figdir : str + Directory to save the figure + cmap: str + linear colormap + + Returns + ------- + Figure + """ + # Split the 'feature_group' column into two + df[['start_lifetime', 'intrinsic_extrinsic']] = df['feature_group'].str.split('_', expand=True) + def replace_values(val): + if val in ['all', 'features']: + return 'all_features' + else: + return val + df[['start_lifetime', 'intrinsic_extrinsic']] = df[['start_lifetime', 'intrinsic_extrinsic']].applymap(replace_values) + for index, row in df.iterrows(): + if row['start_lifetime'] == 'intrinsic': + df.at[index, 'intrinsic_extrinsic'] = 'intrinsic' + df.at[index, 'start_lifetime'] = 'both' + elif row['start_lifetime'] == 'extrinsic': + df.at[index, 'intrinsic_extrinsic'] = 'extrinsic' + df.at[index, 'start_lifetime'] = 'both' + + for target, df_target in df.groupby('target'): + pivot_df = df_target.pivot(index='start_lifetime', columns='intrinsic_extrinsic', values='r_squared') + pivot_df_std = df_target.pivot(index='start_lifetime', columns='intrinsic_extrinsic', values='stdev') + + fig, ax = plt.subplots(figsize=(10, 8)) + + sns.heatmap(pivot_df, annot=False, cmap=cmap, ax=ax, vmin=0, vmax=0.5) + + first_element = True + for text_x in range(pivot_df.shape[0]): + for text_y in range(pivot_df.shape[1]): + value = pivot_df.iloc[text_x, text_y] + std_dev = pivot_df_std.iloc[text_x, text_y] + if not np.isnan(value): + color = 'white' if first_element else 'black' + ax.text(text_y+0.5, text_x+0.5, f'{value:.2f} ± {std_dev:.2f}', + horizontalalignment='center', + verticalalignment='center', + color=color, fontsize=16) + first_element = False + + ax.set_xticklabels(['','Extrinsic','Intrinsic']) + ax.xaxis.tick_top() + ax.set_yticklabels(['', 'Both', 'Lifetime', 'Start of growth'], rotation=0) + + ax.set_xlabel('') + ax.set_ylabel('') + ax.tick_params(axis='both', which='both', length=0) + title = ax.set_title(f'Target: {get_plot_labels_for_metric(target)[1]}', loc='left') + title.set_position([-0.1,1]) + save_and_show_plot(f'{figdir}{target}_prediction_r_squared_matrix_alpha_{df.alpha[0]}', bbox_inches='tight') + + +def plot_feature_contribution(coef_alpha, test_sc, perms, target, alpha, fig_height, figdir): + """ + For a given target, plot feature importance for each feature in the linear model at a specified alpha. + Features that touch 0 are considered not important and are excluded from the plot. + + Paramaters + ------- + coef_alpha: pd.DataFrame + DataFrame containing the coefficient importance for each feature + test_sc: pd.DataFrame + DataFrame containing the test r2 scores + perms: pd.DataFrame + DataFrame containing the permutation test results + target: str + Prediction feature + alpha: int + Regularization parameter + fig_height: int + Height of the figure based on number of important features + save_path: str + Path to save the plot + + Returns + ------- + Figure + """ + p_value = round(perms["p_value"].item(), 3) + test_r2_mean = round(test_sc["Test r$^2$"].mean(), 2) + test_r2_std = round(test_sc["Test r$^2$"].std() / 2, 2) + + for col, df_col in coef_alpha.groupby("Column"): + lower_bound = df_col["Coefficient Importance"].mean() - df_col["Coefficient Importance"].std() + upper_bound = df_col["Coefficient Importance"].mean() + df_col["Coefficient Importance"].std() + if lower_bound < 0 and upper_bound > 0 or df_col["Coefficient Importance"].mean() == 0: + coef_alpha = coef_alpha[coef_alpha["Column"] != col] # if coeff importance is 0, dont plot feature + + coef_alpha['Magnitude coefficient importance'] = abs(coef_alpha['Coefficient Importance']) + coef_alpha['Sign'] = coef_alpha['Coefficient Importance'].apply(lambda x: 'Positive' if x > 0 else 'Negative') + + coef_alpha['Mean Magnitude'] = coef_alpha.groupby('Column')['Magnitude coefficient importance'].transform('mean') + coef_alpha = coef_alpha.sort_values('Mean Magnitude', ascending=False).drop(columns=['Mean Magnitude']) + + plt.figure(figsize=(4,fig_height*.5)) + ax = sns.barplot( + data=coef_alpha, + y="Column", + x="Magnitude coefficient importance", + hue="Sign", + palette={'Positive': '#156082', 'Negative': 'grey'}, + errorbar="sd", + width=0.7, + native_scale=True) + + for patch in ax.patches: + patch.set_edgecolor('black') + patch.set_linewidth(1.5) + current_height = patch.get_height() + desired_height = 0.7 + patch.set_height(desired_height) + patch.set_y(patch.get_y() + (current_height - desired_height) * 0.5) + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + + plt.ylabel("") + plt.legend(frameon=False, bbox_to_anchor=(1.05, 1), loc='upper left') + if target == 'delta_volume_BC': + ax.get_legend().remove() + plt.title(f"Target: {get_plot_labels_for_metric(target)[1]}, alpha={alpha}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}") + label_list = [get_plot_labels_for_metric(col)[1] for col in coef_alpha["Column"].unique()] + + plt.yticks(ticks=range(len(label_list)), labels=label_list) + save_and_show_plot(f'{figdir}/coefficients_{target}_alpha_{alpha}', dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py similarity index 50% rename from nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py rename to nuc_morph_analysis/analyses/linear_regression/linear_regression.py index bbadf2f6..56a8d235 100644 --- a/nuc_morph_analysis/analyses/linear_regression/linear_regression_workflow.py +++ b/nuc_morph_analysis/analyses/linear_regression/linear_regression.py @@ -1,9 +1,7 @@ import argparse -import ast import os import warnings from pathlib import Path -import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns @@ -12,23 +10,68 @@ RepeatedKFold, cross_validate, ) +from tqdm import tqdm from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler -from tqdm import tqdm from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data from sklearn.model_selection import permutation_test_score -from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric +from nuc_morph_analysis.lib.visualization.plotting_tools import ( + get_plot_labels_for_metric, +) +import imageio +from nuc_morph_analysis.analyses.linear_regression.select_features import ( + get_feature_list, +) +from nuc_morph_analysis.analyses.linear_regression.utils import ( + list_of_strings, + list_of_floats, +) pd.options.mode.chained_assignment = None # default='warn' warnings.simplefilter(action="ignore", category=FutureWarning) -def main(cols, target, alpha_range, tolerance, save_path, cached_dataframe=None): +def main( + cols, + target, + alpha_range, + tolerance, + save_path, + cached_dataframe=None, + save_movie=False, +): + """ + Main function to perform linear regression analysis. + + Parameters + ---------- + cols : list of str + List of column names to be used as features for the regression. + target : str + The target column name for the regression. + alpha_range : tuple of float + The range of alpha values (regularization strength) to be tested. + tolerance : float + The tolerance for the optimization. + save_path : str + The path where the results and plots will be saved. + cached_dataframe : pd.DataFrame, optional + A cached DataFrame to use for the analysis. If None, a new DataFrame will be created. + save_movie : bool, optional + If True, a movie of the regression process will be saved. Default is False. + + Returns + ------- + None + """ save_path = Path(save_path) save_path = save_path / Path("linear_regression") save_path.mkdir(parents=True, exist_ok=True) + if len(cols) < 1: + cols = get_feature_list(['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], None) + if not cached_dataframe: df_all = global_dataset_filtering.load_dataset_with_features() df_full = filter_data.all_timepoints_full_tracks(df_all) @@ -36,18 +79,30 @@ def main(cols, target, alpha_range, tolerance, save_path, cached_dataframe=None) else: df_track_level_features = pd.read_csv(cached_dataframe) - fit_linear_regression(df_track_level_features, cols, target, alpha_range, tolerance, save_path) + fit_linear_regression( + df_track_level_features, + cols, + target, + alpha_range, + tolerance, + save_path, + save_movie, + ) + def fit_linear_regression( - data, cols, target, alpha, tol, save_path + data, cols, target, alpha, tol, save_path, save, permute_cols=[], multiple_predictions=True ): """ data - track level features - cols - input features + cols - input features, must not contain rows with nans target - target to predict - alpha - hyperparameter for lasso + alpha - hyperparameter for lassogit tol - tolerance to check drop in r^2 for finding best alpha (ex. 0.02) save_path - location to save files + save - whether to save movies and pngs + permute_col - list of features to permute and replace with noise + multiple_predictions - Boolean, whether to stop when r^2 is reduced by tolerance """ sns.set_context("talk") random_state = 2652124 @@ -55,15 +110,27 @@ def fit_linear_regression( # init empty dicts and lists all_test_sc = [] all_coef_alpha = [] - all_perms = {'score': [], 'perm_score_mean': [], 'perm_score_std': [], 'p_value': [], 'alpha': []} + all_perms = { + "score": [], + "perm_score_mean": [], + "perm_score_std": [], + "p_value": [], + "alpha": [], + } + + if multiple_predictions: + # remove 0 alpha due to convergence errors + alpha = [i for i in alpha if i != 0] - # find best alpha for Lasso model - for alpha_ind, this_alpha in enumerate(alpha): - print("fitting alpha", this_alpha) - # drop any nan rows - dropna_cols = cols + [target] - data = data.dropna(subset=dropna_cols) + # find best alpha for Lasso model + for alpha_ind, this_alpha in tqdm(enumerate(alpha), total=len(alpha)): + # permute columns if necessary + if len(permute_cols) > 0: + for col in permute_cols: + mu, sigma = 0, 1 + noise = np.random.normal(mu, sigma, len(data)) + data[col] = noise # make numpy array for inputs and target all_input = data[cols].reset_index(drop=True).values @@ -80,25 +147,30 @@ def fit_linear_regression( # run permutation test score, permutation_scores, pvalue = permutation_test_score( - model, all_input, all_target, random_state=random_state, cv=5, n_permutations=500, + model, + all_input, + all_target, + random_state=random_state, + cv=5, + n_permutations=500, ) # break if permutation score is less than linear regression value (max possible) # with a tolerance # or if p_value > 0.05 - rounded_permutation_score = round(score, 2) if alpha_ind == 0: - max_val = rounded_permutation_score - if abs(rounded_permutation_score - max_val) > tol or (pvalue > 0.05): - break + max_val = score + if multiple_predictions: + if abs(score - max_val) > tol or (pvalue > 0.05): + break # if relatively equal to linear regression value, then continue # save permutation score and p_value to dictionary - all_perms['score'].append(score) - all_perms['perm_score_mean'].append(permutation_scores.mean()) - all_perms['perm_score_std'].append(permutation_scores.std()) - all_perms['p_value'].append(pvalue) - all_perms['alpha'].append(this_alpha) + all_perms["score"].append(score) + all_perms["perm_score_mean"].append(permutation_scores.mean()) + all_perms["perm_score_std"].append(permutation_scores.std()) + all_perms["p_value"].append(pvalue) + all_perms["alpha"].append(this_alpha) # run cross validate to get model coefficients cv_model = cross_validate( @@ -120,10 +192,8 @@ def fit_linear_regression( ) # Save test r^2 and test MSE to dataframe - range_test_scores = [round(i, 2) for i in cv_model["test_r2"]] - range_errors = [ - round(i, 2) for i in cv_model["test_neg_mean_squared_error"] - ] + range_test_scores = [i for i in cv_model["test_r2"]] + range_errors = [i for i in cv_model["test_neg_mean_squared_error"]] test_sc = pd.DataFrame() test_sc[r"Test r$^2$"] = range_test_scores test_sc["Test MSE"] = range_errors @@ -141,7 +211,8 @@ def fit_linear_regression( # Get test scores for all alpha all_test_sc = pd.concat(all_test_sc, axis=0).reset_index(drop=True) all_test_sc["Test MSE"] = -all_test_sc["Test MSE"] - all_test_sc.to_csv(save_path / "mse.csv") + save_path = save_path / Path(f"{target}") + save_path.mkdir(parents=True, exist_ok=True) # Get coeffs for all alpha all_coef_alpha = pd.concat(all_coef_alpha, axis=0).reset_index(drop=True) @@ -150,61 +221,73 @@ def fit_linear_regression( var_name="Column", value_name="Coefficient Importance", ).reset_index(drop=True) - all_coef_alpha.to_csv(save_path / "coefficients.csv") # Get permutation scores and p values for all alpha all_perms = pd.DataFrame(all_perms).reset_index(drop=True) - all_perms.to_csv(save_path / "perm_scores.csv") - # Save coefficient plot for max alpha value - save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path) + # Save coefficient plot movie + if save: + all_test_sc.to_csv(save_path / "mse.csv") + all_coef_alpha.to_csv(save_path / "coefficients.csv") + all_perms.to_csv(save_path / "perm_scores.csv") + save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path) return all_coef_alpha, all_test_sc, all_perms -def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): - # subset to max alpha - max_alpha = all_coef_alpha['alpha'].max() - all_coef_alpha = all_coef_alpha.loc[all_coef_alpha['alpha'] == max_alpha].reset_index(drop=True) - all_test_sc = all_test_sc.loc[all_test_sc['alpha'] == max_alpha].reset_index(drop=True) - all_perms = all_perms.loc[all_perms['alpha'] == max_alpha].reset_index(drop=True) - - p_value = round(all_perms['p_value'].item(), 3) - test_r2_mean = round(all_test_sc['Test r$^2$'].mean(), 2) - test_r2_std = round(all_test_sc['Test r$^2$'].std()/2, 2) - - - g = sns.catplot( - data=all_coef_alpha, - x='Column', - y="Coefficient Importance", - kind="bar", - errorbar="sd", - aspect=1.5, - height=4, - ) - g.fig.subplots_adjust(top=0.8) # adjust the Figure in rp - g.fig.suptitle(f'p-value {p_value}, test r^2 {test_r2_mean}+-{test_r2_std}') - label_list = [get_plot_labels_for_metric(col)[1] for col in all_coef_alpha['Column'].unique()] - g.set_xticklabels(label_list, rotation=90) - print(f'Saving coefficients_{target}_alpha_{max_alpha}.png') - g.savefig(save_path / f'coefficients_{target}_alpha_{max_alpha}.png') +def save_plots(all_coef_alpha, all_test_sc, all_perms, target, save_path): -def list_of_strings(arg): - return arg.split(",") + xlim = None + files = [] + for alpha in all_coef_alpha["alpha"].unique(): + this_coef_alpha = all_coef_alpha.loc[ + all_coef_alpha["alpha"] == alpha + ].reset_index(drop=True) + this_test_sc = all_test_sc.loc[all_test_sc["alpha"] == alpha].reset_index( + drop=True + ) + this_perms = all_perms.loc[all_perms["alpha"] == alpha].reset_index(drop=True) + + p_value = round(this_perms["p_value"].item(), 3) + test_r2_mean = round(this_test_sc["Test r$^2$"].mean(), 2) + test_r2_std = round(this_test_sc["Test r$^2$"].std() / 2, 2) + + g = sns.catplot( + data=this_coef_alpha, + y="Column", + x="Coefficient Importance", + kind="bar", + errorbar="sd", + aspect=2, + height=10, + ) + g.set(ylabel="") -def list_of_floats(arg): - return list(map(float, arg.split(","))) + g.fig.subplots_adjust(top=0.9) # adjust the Figure in rp + g.fig.suptitle( + f"Prediction of {get_plot_labels_for_metric(target)[1]}\nalpha={alpha:2f}, test r\u00B2={test_r2_mean}±{test_r2_std}, P={p_value}" + ) + label_list = [ + get_plot_labels_for_metric(col)[1] + for col in all_coef_alpha["Column"].unique() + ] + g.set_yticklabels(label_list) + print(f"Saving coefficients_{target}_alpha_{alpha:2f}.png") + this_path = str(save_path / Path(f"coefficients_{target}_alpha_{alpha:2f}.png")) + files.append(this_path) + if not xlim: + xlim = g.fig.axes[0].get_xlim() + g.set(xlim=xlim) + g.savefig(this_path, dpi=300) -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "False", "f", "n", "0"): - return False + # save movie of pngs + writer = imageio.get_writer(save_path / f"{target}_coefficients_over_time.mp4", fps=2) + for im in files: + writer.append_data(imageio.imread(im)) + os.remove(im) + writer.close() if __name__ == "__main__": @@ -222,7 +305,7 @@ def str2bool(v): parser.add_argument( "--cols", type=list_of_strings, - default=['volume_at_B', 'time_at_B', 'colony_time_at_B', 'SA_at_B'], + default=[], help="Supply a list of column names to use as independent variables in the linear regression analysis.", ) parser.add_argument( @@ -234,7 +317,7 @@ def str2bool(v): parser.add_argument( "--alpha_range", type=list_of_floats, - default=[1.3], + default=np.arange(0, 15, 0.1, dtype=float), help="Supply a list of alpha values to use in lasso regression", ) parser.add_argument( @@ -249,6 +332,12 @@ def str2bool(v): default=0.02, help="Tolerace for change in regression score to determine best alpha", ) + parser.add_argument( + "--save", + type=bool, + default=False, + help="Save plots", + ) args = parser.parse_args() main( cols=args.cols, @@ -256,5 +345,6 @@ def str2bool(v): alpha_range=args.alpha_range, tolerance=args.tolerance, save_path=args.save_path, - cached_dataframe=args.cached_dataframe + cached_dataframe=args.cached_dataframe, + save_movie=args.save, ) diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh deleted file mode 100755 index 2b6da4d1..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/added_volume.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow.py \ ---cols 'volume_at_A','volume_at_B','time_at_A','time_at_B','colony_time_at_A','colony_time_at_B','SA_at_B' \ ---alpha_range 0,0.1,0.5,1,1.3,1.5,2,2.5,5,10,11,12,13 \ ---target 'delta_volume_BC' \ ---save_path "../../figures/" \ \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh deleted file mode 100755 index ac7c53d7..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/duration_BC.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow.py \ ---cols 'volume_at_A','volume_at_B','time_at_A','time_at_B','colony_time_at_A','colony_time_at_B','SA_at_B' \ ---alpha_range 0,0.1,0.5,1,1.3,1.5,2,2.5,5,10 \ ---target 'duration_BC' \ ---save_path "../../figures/" \ \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh b/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh deleted file mode 100755 index bc098992..00000000 --- a/nuc_morph_analysis/analyses/linear_regression/scripts/ending_volume.sh +++ /dev/null @@ -1,5 +0,0 @@ -python linear_regression_workflow.py \ ---cols 'volume_at_A','volume_at_B','time_at_A','time_at_B','colony_time_at_A','colony_time_at_B','SA_at_B' \ ---alpha_range 0,0.1,0.5,1,1.3,1.5,2,2.5,5,10,11,12,13 \ ---target 'volume_at_C' \ ---save_path "../../figures/" \ \ No newline at end of file diff --git a/nuc_morph_analysis/analyses/linear_regression/select_features.py b/nuc_morph_analysis/analyses/linear_regression/select_features.py new file mode 100644 index 00000000..0d714f3e --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/select_features.py @@ -0,0 +1,83 @@ +FEATURE_GROUPS = { + 'start_intrinsic': [ #intrinic at start of growth + 'volume_at_B', + 'height_at_B', + 'SA_vol_ratio_at_B', + 'SA_at_B', + 'xy_aspect_at_B', + 'sisters_volume_at_B', + ], + + 'lifetime_intrinsic': [ # intrinsic lifetime + 'duration_BC', + 'volume_at_C', + 'delta_volume_BC', + 'late_growth_rate_by_endpoints', + 'tscale_linearityfit_volume', + 'sisters_duration_BC', + 'sisters_delta_volume_BC', + ], + + 'start_extrinsic': [ # extrinsic at start of growth + 'time_at_B', + 'colony_time_at_B', + 'neighbor_avg_lrm_volume_90um_at_B', + 'neighbor_avg_lrm_height_90um_at_B', + 'neighbor_avg_lrm_xy_aspect_90um_at_B', + 'neighbor_avg_lrm_mesh_sa_90um_at_B', + 'early_neighbor_avg_dxdt_48_volume_90um', + ], + + 'lifetime_extrinsic': [ # extrinsic lifetime + 'normalized_sum_has_mitotic_neighbor', + 'normalized_sum_has_dying_neighbor', + 'mean_neighbor_avg_lrm_volume_90um', + 'mean_neighbor_avg_lrm_height_90um', + 'mean_neighbor_avg_lrm_xy_aspect_90um', + 'mean_neighbor_avg_lrm_mesh_sa_90um', + 'mean_neighbor_avg_dxdt_48_volume_90um', + 'mean_neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um', + ], +} + +TARGET_CONTAINTING_FEATS = { + 'duration_BC': [ + 'duration_BC', + 'late_growth_rate_by_endpoints', + ], + 'delta_volume_BC': [ + 'volume_at_C', + 'delta_volume_BC', + 'late_growth_rate_by_endpoints', + ], + None: [ + '', + ] +} + +def get_feature_list(feature_group_list, target): + """ + Get feature list to include in linear model. + Gets full features list and excludes ones that contain target information. + + Parameters + ---------- + feature_group_list: list + List of feature groups to include in the feature list + target: str + Target variable to predict + + Returns + ------- + features: list + List of features to include in the linear model + """ + features = [] + for group in feature_group_list: + features = features + FEATURE_GROUPS[group] + + if target is not None: + features = [feature for feature in features if feature not in TARGET_CONTAINTING_FEATS[target]] + + return features + diff --git a/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py new file mode 100644 index 00000000..c1806c82 --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/supplemental_lrm_figure_workflow.py @@ -0,0 +1,71 @@ +#%% +import numpy as np +from nuc_morph_analysis.lib.preprocessing import global_dataset_filtering, filter_data +from nuc_morph_analysis.analyses.linear_regression.linear_regression import fit_linear_regression +from nuc_morph_analysis.analyses.linear_regression.select_features import get_feature_list +from nuc_morph_analysis.analyses.linear_regression.analysis_plots import (run_regression_workflow, + plot_feature_cluster_correlations, + plot_heatmap, + plot_feature_contribution) + +#%% +df_all = global_dataset_filtering.load_dataset_with_features() +df_full = filter_data.all_timepoints_full_tracks(df_all) +df_track_level_features = filter_data.track_level_features(df_full) + +#%% +CONFIG = { + 'all_features': ['start_intrinsic', 'lifetime_intrinsic', 'start_extrinsic', 'lifetime_extrinsic'], + 'start_intrinsic': ['start_intrinsic'], + 'lifetime_intrinsic': ['lifetime_intrinsic'], + 'start_extrinsic': ['start_extrinsic'], + 'lifetime_extrinsic': ['lifetime_extrinsic'], + 'intrinsic': ['start_intrinsic', 'lifetime_intrinsic'], + 'extrinsic': ['start_extrinsic', 'lifetime_extrinsic'], +} +FIGDIR='linear_regression/figures/' +TARGETS = ['duration_BC', 'delta_volume_BC'] + +#%% Preprocess dataframe to ensure same N for all analysis (full tracks with lineage features) +dropna_cols = get_feature_list(CONFIG['all_features'], None) +data = df_track_level_features.dropna(subset=dropna_cols) +print(f"Number of tracks: {len(data)}") + +#%% Create maxtrix of r squared values +df = run_regression_workflow(TARGETS, CONFIG, data, alpha=0) +plot_heatmap(df, FIGDIR, 'YlOrRd') + +#%% +RECOMPUTE_ALPHA = True +TOLERANCE = 0.05 +computed_alpha = {'duration_BC': 1.4, + 'delta_volume_BC': 10.2} + +#%% Recompute maximum alpha when tolerance is reached +if RECOMPUTE_ALPHA: + for target in ['duration_BC', 'delta_volume_BC']: + all_coef_alpha, all_test_sc, all_perms = fit_linear_regression( + data, + cols=get_feature_list(CONFIG['all_features'], target), + target=target, + alpha=np.arange(0, 15, 0.2, dtype=float), + tol=TOLERANCE, + save_path="figures/", + save=True) + + max_alpha = all_coef_alpha.alpha.max() + computed_alpha[target] = max_alpha + print(f"{target}: {max_alpha}") + +#%% Plot feature importance for max alpha when tolerance is reached +for target, fig_height in zip(['duration_BC', 'delta_volume_BC'], [6, 2]): + df_alpha, df_test, df_coeff = fit_linear_regression(data, + cols=get_feature_list(CONFIG['all_features'], target), + target=target, alpha=[computed_alpha[target]], + tol=TOLERANCE, save_path=FIGDIR, save=False) + plot_feature_contribution(df_alpha, df_test, df_coeff, target, computed_alpha[target], fig_height, FIGDIR) + +#%% Plot feature correlations using lineage full tracks +plot_feature_cluster_correlations(data, get_feature_list(CONFIG['all_features'], None), FIGDIR) + +# %% diff --git a/nuc_morph_analysis/analyses/linear_regression/utils.py b/nuc_morph_analysis/analyses/linear_regression/utils.py new file mode 100644 index 00000000..423cf66b --- /dev/null +++ b/nuc_morph_analysis/analyses/linear_regression/utils.py @@ -0,0 +1,15 @@ +def list_of_strings(arg): + return arg.split(",") + + +def list_of_floats(arg): + return list(map(float, arg.split(","))) + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "False", "f", "n", "0"): + return False diff --git a/nuc_morph_analysis/lib/preprocessing/add_features.py b/nuc_morph_analysis/lib/preprocessing/add_features.py index 0700eae0..440f1ab8 100644 --- a/nuc_morph_analysis/lib/preprocessing/add_features.py +++ b/nuc_morph_analysis/lib/preprocessing/add_features.py @@ -1,4 +1,5 @@ from nuc_morph_analysis.analyses.lineage.get_features import lineage_trees +from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric import numpy as np FRAME_COL = {"Ff": "A", "frame_transition": "B", "Fb": "C"} @@ -140,6 +141,68 @@ def add_feature_at(df, frame_column, feature, feature_column, multiplier=1): ) return df +def add_mean_feature_over_trajectory(df, feature_list, multiplier_list): + """ + Add the mean of a given feature over the growth trajectory + from transition to frame breakdown. + + Parameters + ---------- + df : DataFrame + The dataframe + feature_list : list + List of column names + multiplier_list : list + List of scale to multiply the mean by + + Returns + ------- + df : DataFrame + The dataframe with the added mean feature columns + """ + for feature, multiplier in zip(feature_list, multiplier_list): + for tid, dft in df.groupby("track_id"): + start = dft.frame_transition.values[0] + stop = dft.Fb.values[0] + df_mean = dft[(dft['index_sequence'] >= start) & (dft['index_sequence'] <= stop)] + mean = df_mean[feature].mean() * multiplier + df.loc[df.track_id == tid, f"mean_{feature}"] = mean + return df + + +def get_early_transient_gr_of_neighborhood(df, scale, time_shift=24, window_length=6): + """ + Get the transient growth rate of the local neighborhood 2 hours into the growth trajectory. + + This time shift of two hours into the growth trajectory is necessary because the metric + is calculated as the average of a 4 hour rolling window. The middle of a four hour window + does not occur until two hours into the timelapse. To calculate this feature equivalently + for each trajectory, two hours was used for all tracks to get a metric for the transient + growth rate of the neighborhood early in the growth trajectory. The early transient growth + rate is averaged over 30 minutes as defined by the window_length. + + Parameters + ---------- + df : DataFrame + The dataframe + time_shift : int + The time shift in frames to calculate the transient growth rate in frames + window_length : int + The length of the time window in frames to average over + + Returns + ------- + df : DataFrame + The dataframe with the added transient growth rate feature columns + """ + for tid, dft in df.groupby("track_id"): + t_calculate = dft.index_sequence.min() + time_shift + time_window_mask = dft.index_sequence.between(t_calculate, t_calculate + window_length) + transient_gr_90um = dft.loc[time_window_mask, "neighbor_avg_dxdt_48_volume_90um"].mean() + df.loc[df.track_id == tid, "early_neighbor_avg_dxdt_48_volume_90um"] = transient_gr_90um * scale + + return df + def add_volume_at(df, pixel_size, frame_column): """ @@ -390,6 +453,62 @@ def add_non_interphase_size_shape_flag(df): ].any(axis=1) return df +def get_sister(df, pid, current_tid): + """ + Gets the track_id of the sibling + + Parameters + ---------- + df: Dataframe + The dataset dataframe + track_id: int + The track_id of the cell + + Returns + ------- + sister_id: List + List containing the track_id of the sibling cell + """ + df_sisters = df.loc[df.parent_id == pid] + tids = df_sisters.track_id.unique() + sister_id = [tid for tid in tids if tid != current_tid] + return sister_id + +def add_lineage_features(df, feature_list): + """ + If the full track has a full track sister or mother, add the given relative's feature as a single track feature column in the dataframe. + + Paramaters + ---------- + df: DataFrame + The dataframe + feature_list: list + List of column names + + Returns + ------- + df: DataFrame + The dataframe with new columns (ie mothers_vol_at_B, sisters_duration) + """ + + for feature in feature_list: + df[f"mothers_{feature}"] = np.nan + df[f"sisters_{feature}"] = np.nan + + df_lineage = df[df['colony'].isin(['small', 'medium'])] + + for tid, dft in df_lineage.groupby("track_id"): + parent_id = dft.parent_id.values[0] + if parent_id != -1 and parent_id in df_lineage.track_id.unique(): + for feature in feature_list: + df.loc[df.track_id == tid, f"mothers_{feature}"] = df_lineage.loc[df_lineage.track_id == parent_id, feature].values[0] + if parent_id != -1: + sister_id = get_sister(df_lineage, parent_id, tid) + if len(sister_id) > 0: + for feature in feature_list: + df.loc[df.track_id == tid, f"sisters_{feature}"] = df_lineage.loc[df_lineage.track_id == sister_id[0], feature].values[0] + + return df def sum_events_along_full_track(df0, feature_list, index_columns=['track_id','index_sequence','Fb','frame_transition']): """ @@ -468,6 +587,26 @@ def sum_mitotic_events_along_full_track(df0, feature_list=[]): return sum_events_along_full_track(df0, feature_list) +def normalize_sum_events(df_full, event_cols=['sum_has_mitotic_neighbor', 'sum_has_dying_neighbor']): + """ + Normalize sum of mitotic and death events by growth duration + + Parameters + ---------- + df_full: DataFrame + The dataframe of full tracks + event_cols: list + ie. 'sum_has_mitotic_neighbor', 'sum_has_dying_neighbor' + + Returns + ------- + df_full: DataFrame + The dataframe with the normalized sum of events columns + """ + for col in event_cols: + df_full[f"normalized_{col}"] = df_full[col] / df_full['duration_BC'] + return df_full + def add_perimeter_ratio(df): """ compute ratio of the nucleus perimeter to the pseudo cell perimeter (2d) @@ -486,4 +625,61 @@ def add_perimeter_ratio(df): """ df['2d_perimeter_nuc_cell_ratio'] = df['2d_perimeter_nucleus'] / df['2d_perimeter_pseudo_cell'] - return df \ No newline at end of file + return df + +def add_features_at_transition(df, + feature_list=['xy_aspect', + 'SA_vol_ratio', + 'neighbor_avg_lrm_volume_90um', + 'neighbor_avg_lrm_height_90um', + 'neighbor_avg_lrm_xy_aspect_90um', + 'neighbor_avg_lrm_mesh_sa_90um', + 'neighbor_avg_dxdt_48_volume_90um', + 'neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um'] + ): + """ + Add feature measurements at transition that are used in the linear regression analysis. + Features should be pre-calculated and not need to be scaled. + + Parameters + ---------- + df_full : DataFrame + The dataframe containing full trajectories + feature_list : list + List of column names + + Returns + ------- + df_full : DataFrame + The dataframe with the added feature columns + """ + for feature in feature_list: + df = add_feature_at(df, "frame_transition", feature, feature) + return df + +def add_mean_features(df, + feature_list=['neighbor_avg_dxdt_48_volume_90um', + 'neighbor_avg_lrm_volume_90um', + 'neighbor_avg_lrm_height_90um', + 'neighbor_avg_lrm_xy_aspect_90um', + 'neighbor_avg_lrm_mesh_sa_90um', + 'neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um'] + ): + """ + Add mean feature measurements over the growth trajectory that are used in the linear regression analysis. + + Parameters + ---------- + df : DataFrame + The dataframe containing full trajectories + feature_list : list + List of column names + + Returns + ------- + df : DataFrame + The dataframe with the added mean feature columns + """ + multiplier_list = [get_plot_labels_for_metric(x)[0] for x in feature_list] + df = add_mean_feature_over_trajectory(df, feature_list, multiplier_list) + return df diff --git a/nuc_morph_analysis/lib/preprocessing/add_neighborhood_avg_features_lrm.py b/nuc_morph_analysis/lib/preprocessing/add_neighborhood_avg_features_lrm.py new file mode 100644 index 00000000..36525e5f --- /dev/null +++ b/nuc_morph_analysis/lib/preprocessing/add_neighborhood_avg_features_lrm.py @@ -0,0 +1,168 @@ +# %% +import pandas as pd +from tqdm import tqdm +import numpy as np +from multiprocessing import Pool, cpu_count +from nuc_morph_analysis.lib.preprocessing.load_data import get_dataset_pixel_size +from nuc_morph_analysis.lib.preprocessing import filter_data +from nuc_morph_analysis.lib.preprocessing.filter_data import all_timepoints_minimal_filtering + +LOCAL_RADIUS_LIST = [90, -1] +LOCAL_RADIUS_STR_LIST = ["90um", "whole_colony"] +NEIGHBOR_FEATURE_LIST = ["volume"] +NEIGHBOR_PREFIX = "neighbor_avg_lrm_" + +def run_in_parallel(args): + return get_neighbor_avg_at_t(*args) + + +def get_neighbor_avg_at_t(dft, local_radius_list, columns, min_neighbor_thresh=5): + """ + Compute the average value of the columns for each CellId at a given timepoint within a given colony + by finding its neighbors within a given radius=local_radius + + Parameters + ---------- + dft : pd.DataFrame + dataframe that minimally has the following columns: + ['index_sequence','track_id','centroid_x','centroid_y'] + with index = "CellId" + local_radius_list : list + list of integers that represent the radius in microns to use for finding neighbors + -1 signifies the whole colony + columns : list + list of column names to compute the average value for + min_neighbor_thresh : int + minimum number of neighbors required to compute the average value + + Returns + ------- + dft : pd.DataFrame + dataframe with the average value of the columns for each CellId at a given timepoint within a given colony + """ + + # now get the centroid values for all CellIds at that index sequence + centroid_xy = dft[["centroid_x", "centroid_y"]].values + # now compute pair-wise distances between all centroid_xy values + dists = np.linalg.norm(centroid_xy[:, None] - centroid_xy[None], axis=2) + + # initialize the new columns to be added to the dataframe + # to avoid the PerformanceWarning: DataFrame is highly fragmented occuring in the last line of this function + data = {} + for li, local_radius in enumerate(local_radius_list): + local_radius_str = LOCAL_RADIUS_STR_LIST[li] + for column in columns: + data[f"{NEIGHBOR_PREFIX}{column}_{local_radius_str}"] = np.nan + + dftnew = pd.DataFrame(data, index=dft.index) + + for li, local_radius in enumerate(local_radius_list): + local_radius_str = LOCAL_RADIUS_STR_LIST[li] + # now for each CellId find the neighbors within a circle with radius = radius + pix_size = get_dataset_pixel_size(dft.colony.values[0]) + radius = local_radius / pix_size # pixels + if local_radius < 0: + neighbors = np.ones(dists.shape, dtype=bool) + else: + neighbors = dists < radius + + # don't allow "self" to be a neighbor + np.fill_diagonal(neighbors, False) + + for column in columns: + col_vals1d = dft[column].values + col_vals2d = np.tile(col_vals1d, (col_vals1d.size, 1)).astype(float) + # now to make the diagonal values nan so that the cell of interest is not included in the average, but only the neighbors + col_vals2d[~neighbors] = np.nan + + # some rows could be all nan + # so we need to track the indices of the rows that are not all nan + # and only use those indices to update the dataframe + # this is important for avoiding an annoyint RuntimeWarning: Mean of empty slice + non_nan_neighbors_per_row = np.sum(~np.isnan(col_vals2d), axis=1) + + non_nan_rows = non_nan_neighbors_per_row >= min_neighbor_thresh + col_vals2d = col_vals2d[non_nan_rows] + + # now compute the average of the column values for each row (average of the neighbors) + # Compute the average value of the column for each neighborhood + col_vals_avg = np.nanmean(col_vals2d, axis=1) + + # Add the average values to the dataframe + # dft.loc[non_nan_rows,f'{prefix}{column}_{local_radius_str}'] = col_vals_avg + dftnew.loc[ + dft.index.values[non_nan_rows], f"{NEIGHBOR_PREFIX}{column}_{local_radius_str}" + ] = col_vals_avg + dft = dft.join(dftnew) + return dft + + +def run_script( + df, + num_workers=1, + feature_list=NEIGHBOR_FEATURE_LIST, + local_radius_list=LOCAL_RADIUS_LIST, + exclude_outliers=True, +): + """ + Determine average values of features within neighborhoods of defined radius around each cell + + Parameters + ---------- + df : pd.DataFrame + dataframe with index=CellId, and the following columns minimally: + ['index_sequence','track_id','centroid_x','centroid_y'] + feature_list + [x for x in df.columns if 'dxdt' in x] + num_workers : int, optional + number of workers to use for parallel processing. The default is 1. + feature_list : list, optional + list of column names that represent the features to proces. The default is NEIGHBOR_FEATURE_LIST + local_radius_list : list, optional + list of integers that represent the radius in microns to use for finding neighbors. The default is LOCAL_RADIUS_LIST. + -1 signifies the whole colony + exclude_outliers : bool, optional + whether to exclude outliers. The default is False. + + Returns + ------- + dforig : pd.DataFrame + original dataframe with the newly computed columns added + index of dataframe = CellId + """ + + dforig = df.copy() + original_columns = dforig.columns.tolist() + if exclude_outliers: + # to ensure that outlier data points are not used for the neighborhood avgs, filter out time point outliers here + df = all_timepoints_minimal_filtering(df) + + # also be sure to filter out non-interphase cells from neighborhood + # df = filter_data.filter_out_non_interphase_size_shape_flag(df) + # df = filter_data.filter_out_cells_entering_or_exiting_mitosis(df) + + for colony in df.colony.unique(): + dfi = df[df["colony"] == colony] + pass_cols = ["index_sequence", "colony", "track_id", "centroid_x", "centroid_y"] + + columns = feature_list + [x for x in dfi.columns if "dxdt" in x] + + # first find the unique index_sequence values + index_sequences = dfi["index_sequence"].unique() + dfin = dfi[ + pass_cols + columns + ] # reduce the size of the dataframe being passed in by only including necessary columns + + # run in parallel + args_list = [ + (dfin[dfin["index_sequence"] == t], local_radius_list, columns) for t in index_sequences + ] + num_workers = np.min([num_workers, cpu_count()]) + if num_workers == 1: + results = list(map(run_in_parallel, args_list)) + else: + results = list(Pool(num_workers).imap_unordered(run_in_parallel, args_list)) + dft = pd.concat(results) + new_cols = [x for x in dft.columns if x not in original_columns] + dforig.loc[dft.index, new_cols] = dft[new_cols] + return dforig + +# %% diff --git a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py index 30deef53..30eb05cf 100644 --- a/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py +++ b/nuc_morph_analysis/lib/preprocessing/global_dataset_filtering.py @@ -9,6 +9,7 @@ is_tp_outlier, add_features, add_neighborhood_avg_features, + add_neighborhood_avg_features_lrm, compute_change_over_time, ) from nuc_morph_analysis.analyses.volume import add_growth_features @@ -17,6 +18,7 @@ add_fov_touch_timepoint_for_colonies, ) from nuc_morph_analysis.analyses.height.add_colony_time import add_colony_time_all_datasets +from nuc_morph_analysis.lib.visualization.plotting_tools import get_plot_labels_for_metric from nuc_morph_analysis.lib.preprocessing import labeling_neighbors_helper def load_dataset_with_features( @@ -198,6 +200,9 @@ def process_all_tracks(df, dataset, remove_growth_outliers, num_workers): df = add_features.add_non_interphase_size_shape_flag(df) df = add_change_over_time(df) df = add_neighborhood_avg_features.run_script(df, num_workers=num_workers) + df = add_neighborhood_avg_features_lrm.run_script(df, num_workers=num_workers, + feature_list=["volume", "height", "xy_aspect", "mesh_sa", "2d_area_nuc_cell_ratio"], + exclude_outliers=True) if dataset == "all_baseline": df = add_colony_time_all_datasets(df) @@ -245,7 +250,7 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_features.add_location_at(df_full, frame, "y") df_full = add_features.add_time_at(df_full, frame, interval) df_full = add_features.add_colony_time_at(df_full, frame, interval) - + df_full = add_features.add_duration_in_frames(df_full, "Ff", "frame_transition") df_full = add_features.add_duration_in_frames(df_full, "frame_transition", "Fb") df_full = add_features.add_duration_in_frames(df_full, "Ff", "Fb") @@ -256,8 +261,15 @@ def process_full_tracks(df_all, thresh, pix_size, interval): df_full = add_growth_features.add_early_growth_rate(df_full, interval) df_full = add_growth_features.add_late_growth_rate_by_endpoints(df_full) df_full = add_growth_features.fit_tracks_to_time_powerlaw(df_full, "volume", interval) - + + # For LRM + df_full = add_features.add_lineage_features(df_full, feature_list=['volume_at_B', 'duration_BC', 'volume_at_C', 'delta_volume_BC']) + df_full = add_features.add_feature_at(df_full, "frame_transition", 'height', 'height_percentile', pix_size) + df_full = add_features.add_features_at_transition(df_full) + df_full = add_features.get_early_transient_gr_of_neighborhood(df_full, scale=get_plot_labels_for_metric('neighbor_avg_dxdt_48_volume_90um')[0]) df_full = add_features.sum_mitotic_events_along_full_track(df_full) + df_full = add_features.normalize_sum_events(df_full) + df_full = add_features.add_mean_features(df_full) # Add flag for use after merging back to main manifest df_full = add_features.add_full_track_flag(df_full) diff --git a/nuc_morph_analysis/lib/visualization/label_tables.py b/nuc_morph_analysis/lib/visualization/label_tables.py index df39dec6..108b9560 100644 --- a/nuc_morph_analysis/lib/visualization/label_tables.py +++ b/nuc_morph_analysis/lib/visualization/label_tables.py @@ -141,100 +141,126 @@ def get_scale_factor_table(dataset="all_baseline"): LABEL_TABLE = { # Times and durations ("sync_time_Ff", "index_sequence"): "Time", - "normalized_time": "Normalized Interphase Time", - "colony_time": "Aligned Colony Time", + "normalized_time": "Normalized interphase time", + "colony_time": "Aligned colony time", ("Ff", "frame_formation"): "Formation", - ("frame_inflection", "frame_transition"): "Transtion", + ("frame_inflection", "frame_transition"): "Transition", ("Fb", "frame_breakdown"): "Breakdown", - "time_at_B": "Movie Time at Transition", - "time_at_C": "Movie Time at Breakdown", - "colony_time_at_B": "Aligned Colony Time at Transition", - "colony_time_at_C": "Aligned Colony Time at Breakdown", - "duration_AB": "Rapid Expansion Duration", - ("duration_BC", "duration_BC_hr"): "Growth Duration", + "time_at_A": "Time at formation", + "time_at_B": "Starting movie time", + "time_at_C": "Movie time at breakdown", + "colony_time_at_A": "Aligned colony time at formation", + "colony_time_at_B": "Starting aligned colony time", + "colony_time_at_C": "Ending aligned colony time", + "duration_AB": "Rapid expansion duration", + ("duration_BC", "duration_BC_hr"): "Growth duration", # Volume "volume": "Volume", - "volume_at_A": "Volume at Formation", - "volume_at_B": "Starting Volume", - "volume_at_C": "Ending Volume", - "Volume_C": "Volume at Breakdown", - "volume_fold_change_BC": "Volume Fold-Change", - "volume_fold_change_fromB": "Volume Fold-Change relative to Starting Volume", - "delta_volume_BC": "Added Volume", - "difference_volume_at_B": "Difference in Starting Volume", - "difference_half_vol_at_C_and_B": "1/2 Mother Ending Volume - Daughter Starting Volume", - "avg_sister_volume_fold_change_BC": "Sisters Average Volume Fold-Change", - "avg_sister_volume_at_B": "Sisters Average Starting Volume", + "volume_at_A": "Volume at formation", + "volume_at_B": "Starting volume", + "volume_at_C": "Ending volume", + "Volume_C": "Volume at breakdown", + "volume_fold_change_BC": "Volume fold-change", + "volume_fold_change_fromB": "Volume fold-change relative to starting volume", + "delta_volume_BC": "Added volume", + "difference_volume_at_B": "Difference in starting volume", + "difference_half_vol_at_C_and_B": "1/2 mother ending volume - daughter starting volume", + "avg_sister_volume_fold_change_BC": "Sisters average volume fold-change", + "avg_sister_volume_at_B": "Sisters average starting volume", "volume_sub": "Change in volume", # Growth rates - "exp_growth_coeff_BC": "Exponetial Growth Coeff. B to C", - "linear_growth_rate_BC": "Late Growth Rate", - "tscale_linearityfit_volume": "Fitted Time Scaling Factor (\u03B1)", - "RMSE_linearityfit_volume": "Root Mean Squared Error", - "late_growth_rate_by_endpoints": "Growth Rate", + "exp_growth_coeff_BC": "Exponential growth coeff. B to C", + "linear_growth_rate_BC": "Late growth rate", + "tscale_linearityfit_volume": "Fitted time scaling factor (\u03B1)", + "RMSE_linearityfit_volume": "Root mean squared error", + "late_growth_rate_by_endpoints": "Growth rate", "dxdt_t2-dxdt_t1": "Late average transient growth rate - early average transient growth rate", # Height "height": "Height", - "avg_height": "Average Height", - "height_fold_change_BC": "Growth Height Fold-Change", - "height_at_B": "Starting Height", - "height_at_C": "Ending Height", + "avg_height": "Average height", + "height_fold_change_BC": "Growth height fold-change", + "height_at_B": "Starting height", + "height_at_C": "Ending height", # Surface Area - "mesh_sa": "Surface Area", - "SA_at_A": "Surface Area at A", - "SA_at_B": "Surface Area at B", - "SA_at_C": "Surface Area at C", - "delta_SA_BC": "\u0394Surface Area B to C", - "SA_fold_change_BC": "Surface Area Fold-Change B to C", - "SA_fold_change_fromB": "Surface Area Fold-Change", - "SA_vol_ratio": "SA/Volume", - "tscale_linearityfit_SA": "Fitted Surface Area Time Scaling (\u03B1)", - "RMSE_linearityfit_SA": "Root Mean Squared Error", + "mesh_sa": "Surface area", + "SA_at_A": "Surface area at A", + "SA_at_B": "Starting surface area", + "SA_at_C": "Ending surface area", + "delta_SA_BC": "\u0394Surface area B to C", + "SA_fold_change_BC": "Surface area fold-change B to C", + "SA_fold_change_fromB": "Surface area fold-change", + "SA_vol_ratio": "SA/volume", + "tscale_linearityfit_SA": "Fitted surface area time scaling (\u03B1)", + "RMSE_linearityfit_SA": "Root mean squared error", # Dimensions beyond height "length": "XY short axis width", "width": "XY long axis length", # Aspect Ratio - "xy_aspect": "XY Aspect Ratio", - "xz_aspect": "XZ Aspect Ratio", - "zy_aspect": "YZ Aspect Ratio", - "xz_aspect_fold_change_BC": "XZ Aspect Ratio Fold-Change B to C", - "avg_xz_aspect_ratio": "Average XZ Aspect Ratio", - "xz_aspect_at_B": "XZ Aspect Ratio at B", + "xy_aspect": "Xy aspect ratio", + "xz_aspect": "Xz aspect ratio", + "zy_aspect": "Yz aspect ratio", + "xz_aspect_fold_change_BC": "Xz aspect ratio fold-change B to C", + "avg_xz_aspect_ratio": "Average xz aspect ratio", + "xz_aspect_at_B": "Starting xz aspect ratio", # Colony Position "distance": "Distance", - "distance_from_centroid": "Distance From Centroid", - "normalized_distance_from_centroid": "Normalized Distance From Centroid", - "max_distance_from_centroid": "Max Distance From Centroid", - "colony_depth": "Colony Depth", - "normalized_colony_depth": "Normalized Colony Depth", - "max_colony_depth": "Max Colony Depth", - "avg_colony_depth": "Average Colony Depth", + "distance_from_centroid": "Distance from centroid", + "normalized_distance_from_centroid": "Normalized distance from centroid", + "max_distance_from_centroid": "Max distance from centroid", + "colony_depth": "Colony depth", + "normalized_colony_depth": "Normalized colony depth", + "max_colony_depth": "Max colony depth", + "avg_colony_depth": "Average colony depth", # Density - "colony_non_circularity": "Colony Non-circularity", - "colony_non_circularity_scaled": "Scaled Colony Non-circularity", - "avg_early_density": "Early Density", - "avg_late_density": "Late Density", + "colony_non_circularity": "Colony non-circularity", + "colony_non_circularity_scaled": "Scaled colony non-circularity", + "avg_early_density": "Early density", + "avg_late_density": "Late density", "density": "Density", - "avg_density": "Average Density", + "avg_density": "Average density", # Lineage "parent_id": "Parent ID", "family_id": "Family ID", + "sisters_volume_at_B": "Sisters starting volume", + "sisters_duration_BC": "Sisters growth duration", + "sisters_delta_volume_BC": "Sisters added volume", # Flags - "is_outlier": "Outlier Flag", - "is_tp_outlier": "Single Timepoint Outlier", - "is_outlier_track": "Outlier Track Flag", - "is_growth_outlier": "Growth Feature Outlier Flag", - "fov_edge": "FOV-edge Flag", - "termination": "Track Termination", - "is_full_track": "Full Interphase Track Flag", + "is_outlier": "Outlier flag", + "is_tp_outlier": "Single timepoint outlier", + "is_outlier_track": "Outlier track flag", + "is_growth_outlier": "Growth feature outlier flag", + "fov_edge": "FOV-edge flag", + "termination": "Track termination", + "is_full_track": "Full interphase track flag", # colony segmentations "colony_area": "area of colony (brightfield)", "nucleus_colony_area_ratio": "ratio of nuclear area to colony area", "seg_twoD_zMIP_area": "total projected nuclear area", + # LRM feats + "height_at_B": "Starting height", + "density_at_B": "Starting density", + "xy_aspect_at_B": "Starting XY aspect ratio", + "SA_vol_ratio_at_B": "Starting surface area/volume ratio", + "early_neighbor_avg_dxdt_48_volume_90um": "Neighborhood avg. ~starting transient growth rate", + 'neighbor_avg_lrm_volume_90um_at_B': "Neighborhood avg. starting volume", + 'neighbor_avg_lrm_height_90um_at_B': "Neighborhood avg. starting height", + 'neighbor_avg_lrm_density_90um_at_B': "Neighborhood avg. starting density", + 'neighbor_avg_lrm_xy_aspect_90um_at_B': "Neighborhood avg. starting XY aspect ratio", + 'neighbor_avg_lrm_mesh_sa_90um_at_B': "Neighborhood avg. starting surface area", + "mean_neighbor_avg_dxdt_48_volume_90um": "Neighborhood avg. mean transient growth rate", + 'mean_neighbor_avg_lrm_volume_90um': "Neighborhood avg. mean volume", + 'mean_neighbor_avg_lrm_height_90um': "Neighborhood avg. mean height", + 'mean_neighbor_avg_lrm_density_90um': "Neighborhood avg. mean density", + 'mean_neighbor_avg_lrm_xy_aspect_90um': "Neighborhood avg. mean XY aspect ratio", + 'mean_neighbor_avg_lrm_mesh_sa_90um': "Neighborhood avg. mean surface area", + 'mean_neighbor_avg_lrm_2d_area_nuc_cell_ratio_90um': "Neighborhood avg. mean density", + # mitotic and apoptotic neighbor columns "number_of_frame_of_breakdown_neighbors": "# of neighboring cells undergoing breakdown", "number_of_frame_of_formation_neighbors": "# of neighboring cells undergoing formation", "number_of_frame_of_death_neighbors": "# of neighboring cells undergoing death", + "normalized_sum_has_mitotic_neighbor": "Frequency of mitotic adjacent neighbors", + "normalized_sum_has_dying_neighbor": "Frequency of dying adjacent neighbors", # 2D area features "2d_area_nuc_cell_ratio": "Nucleus area/(Pseudo)cell area", "2d_area_nucleus": "Nuclear area", @@ -246,7 +272,7 @@ def get_scale_factor_table(dataset="all_baseline"): "2d_intensity_mean_edge" : "Average distance to (pseudo)cell edge", "2d_intensity_max_edge" : "Max distance to (pseudo)cell edge", } -# now add the dxdt columns + def convert_to_hr(bin_interval, dataset="all_baseline"): diff --git a/run_all_manuscript_workflows.py b/run_all_manuscript_workflows.py index 8694c539..7abf1180 100644 --- a/run_all_manuscript_workflows.py +++ b/run_all_manuscript_workflows.py @@ -58,6 +58,9 @@ def figureS1_segmentation_model_validation(): def supplemental_figure_cell_health(): import nuc_morph_analysis.analyses.cell_health.cell_health_workflow + + def supplemental_figure_linear_regression_model(): + import nuc_morph_analysis.analyses.linear_regression.supplemental_lrm_figure_workflow ALL_WORKFLOWS = get_jobs(Workflows)