serios refactoring

antonkulaga · Sep 20, 2020 · 863b5e0 · 863b5e0
1 parent b084fd1
commit 863b5e0
Show file tree

Hide file tree

Showing 9 changed files with 3,616 additions and 68,773 deletions.
diff --git a/environment.yaml b/environment.yaml
@@ -15,7 +15,7 @@ dependencies:
   - scikit-learn>=0.23.2
   - seaborn>=0.10.1
   - more-itertools>=8.4.0
-  - shap>=0.35.0
+  - shap>=0.36.0
   - lightgbm>=3.0.0
   - statsmodels>=0.11.1
   - optuna>=2.1.0

diff --git a/notebooks/stage_one_shap_selection.ipynb b/notebooks/stage_one_shap_selection.ipynb
diff --git a/notebooks/stage_two_shap_selection.ipynb b/notebooks/stage_two_shap_selection.ipynb
diff --git a/tune.py b/tune.py
@@ -74,7 +74,7 @@ def optimize(folds, hold_outs, locations, metrics, repeats, to_select, trait, tr
     from yspecies.partition import DataPartitioner, PartitionParameters
     from yspecies.selection import ShapSelector
     from yspecies.tuning import Tune
-    from yspecies.results import FeatureSummary, FeatureResults
+    from yspecies.explanations import FeatureSummary, FeatureResults
     import optuna
     from optuna import Trial
 

diff --git a/yspecies/results.py → yspecies/explanations.py b/yspecies/results.py → yspecies/explanations.py
@@ -1,15 +1,20 @@
-import matplotlib.pyplot as plt
-from loguru import logger
-from more_itertools import flatten
-from functools import cached_property
 from dataclasses import *
+from functools import cached_property
+from pathlib import Path
+
+import matplotlib.pyplot as plt
 import shap
+from more_itertools import flatten
+from sklearn.base import TransformerMixin
 
 import yspecies
-from yspecies.selection import Fold
-from yspecies.utils import *
+from yspecies.models import Metrics
 from yspecies.partition import ExpressionPartitions
-from yspecies.models import Metrics, BasicMetrics
+from yspecies.utils import *
+from loguru import logger
+from yspecies.selection import Fold
+
+from scipy.stats import kendalltau
 
 @dataclass(frozen=True)
 class FeatureResults:
@@ -49,7 +54,7 @@ def kendall_tau_abs_mean(self):
         return self.selected[f"kendall_tau_to_{self.to_predict}"].abs().mean()
 
     @property
-    def head(self) -> Fold:
+    def head(self):
         return self.folds[0]
 
     @cached_property
@@ -80,7 +85,8 @@ def metrics_df(self) -> pd.DataFrame:
 
     @cached_property
     def hold_out_metrics(self) -> pd.DataFrame:
-        return yspecies.selection.Metrics.to_dataframe([f.validation_metrics for f in self.folds]).join(pd.Series(data =self.partitions.hold_out_species * self.partitions.n_cv_folds, name="hold_out_species"))
+        return yspecies.selection.Metrics.to_dataframe([f.validation_metrics for f in self.folds])\
+            .join(pd.Series(data =self.partitions.hold_out_species * self.partitions.n_cv_folds, name="hold_out_species"))
 
 
     def __repr__(self):
@@ -133,9 +139,14 @@ def selected_extended(self):
         return self.selected.join(self.stable_shap_dataframe_T, how="left")
 
     @cached_property
-    def stable_shap_values(self):
+    def stable_shap_values(self) -> np.ndarray:
         return np.mean(np.nan_to_num(self.shap_values, 0.0), axis=0) if self.nan_as_zero else np.nanmean(self.shap_values, axis=0)
 
+    @cached_property
+    def stable_interaction_values(self):
+        return np.mean(np.nan_to_num(self.interaction_values, 0.0), axis=0) if self.nan_as_zero else np.nanmean(self.interaction_values, axis=0)
+
+
     @cached_property
     def shap_dataframes(self) -> List[np.ndarray]:
         return [f.shap_dataframe for f in self.folds]
@@ -144,9 +155,32 @@ def shap_dataframes(self) -> List[np.ndarray]:
     def shap_values(self) -> List[np.ndarray]:
         return [f.shap_values for f in self.folds]
 
+    @cached_property
+    def interaction_values(self) -> List[np.ndarray]:
+        return [f.interaction_values for f in self.folds]
+
     @cached_property
     def feature_names(self):
-        return self.partitions.data.genes_meta["symbol"].values
+        gene_names = self.partitions.data.genes_meta["symbol"].values
+        col = self.partitions.data.X.columns[-1]
+        return np.append(gene_names, col) if "encoded" in col else gene_names
+
+    @cached_property
+    def expected_values_mean(self):
+        return np.mean([f.expected_value for f in self.folds])
+
+    @cached_property
+    def expected_values(self):
+        return [f.expected_value for f in self.folds]
+
+    def make_figure(self, save: Path):
+        fig = plt.gcf()
+        if save is not None:
+            from IPython.display import set_matplotlib_formats
+            set_matplotlib_formats('svg')
+            plt.savefig(save)
+        plt.close()
+        return fig
 
     def _plot_(self, shap_values: List[np.ndarray] or np.ndarray, gene_names: bool = True, save: Path = None,
                max_display=None, title=None, layered_violin_max_num_bins = 20,
@@ -161,13 +195,33 @@ def _plot_(self, shap_values: List[np.ndarray] or np.ndarray, gene_names: bool =
                           plot_type=plot_type,
                           color=color, axis_color=axis_color, alpha=alpha
                           )
-        fig = plt.gcf()
-        if save is not None:
-            from IPython.display import set_matplotlib_formats
-            set_matplotlib_formats('svg')
-            plt.savefig(save)
-        plt.close()
-        return fig
+        return self.make_figure(save)
+
+    def _plot_decision_(self, expected_value: float, shap_values: List[np.ndarray] or np.ndarray, gene_names: bool = True, save: Path = None):
+        #shap.summary_plot(shap_values, self.partitions.X, show=False)
+        feature_names = None if gene_names is False else self.feature_names
+        min_max = (self.partitions.data.y.min(), self.partitions.data.y.max())
+        shap.decision_plot(expected_value, shap_values, xlim=min_max,  feature_names=feature_names.tolist(), show=False)
+        return self.make_figure(save)
+
+    def plot_decision(self, save: Path = None):
+        return self._plot_decision_(self.expected_values_mean, self.stable_shap_values, True, save)
+
+    def plot_fold_decision(self, num: int):
+        assert num < len(self.folds), "index should be withing folds range!"
+        f = self.folds[num]
+        self._plot_decision_(f.expected_value, f.shap_values)
+
+    def plot_dependency(self, feature: str, interaction_index:str = "auto", save: Path = None):
+        shap.dependence_plot(feature, self.stable_shap_values, self.partitions.X, feature_names=self.feature_names, interaction_index=interaction_index)
+        return self.make_figure(save)
+
+    def plot_interactions(self, save: Path = None):
+        return self._plot_decision_(self.expected_values_mean, self.stable_interaction_values, True, save)
+
+    def plot_fold_interactions(self, num: int, gene_names: bool = True, save: Path = None):
+        f = self.folds[num]
+        return self._plot_decision_(f.expected_value, f.interaction_values, gene_names, save)
 
     def plot(self, gene_names: bool = True, save: Path = None,
              title=None,  max_display=100, layered_violin_max_num_bins = 20,
@@ -377,7 +431,66 @@ def _plot_(self, shap_values: List[np.ndarray] or np.ndarray, gene_names: bool =
         return fig
 
     def plot(self, gene_names: bool = True, save: Path = None,
-             title=None,  max_display=100, layered_violin_max_num_bins = 20,
-             plot_type=None, color=None, axis_color="#333333", alpha=1, show=True, class_names=None, plot_size = None):
+             max_display=50, title=None, plot_size = 0.5, layered_violin_max_num_bins = 20,
+             plot_type=None, color=None, axis_color="#333333", alpha=1, class_names=None):
         return self._plot_(self.stable_shap_values, gene_names, save, max_display, title,
-                           layered_violin_max_num_bins, plot_type, color, axis_color, alpha, class_names, plot_size)
+                           layered_violin_max_num_bins, plot_type, color, axis_color, alpha, class_names = class_names, plot_size=plot_size)
+
+
+@dataclass
+class ShapSelector(TransformerMixin):
+
+    def fit(self, folds_with_params: Tuple[List[Fold], Dict], y=None) -> 'ShapSelector':
+        return self
+
+    @logger.catch
+    def transform(self, folds_with_params: Tuple[List[Fold], Dict]) -> FeatureResults:
+        folds, parameters = folds_with_params
+        fold_shap_values = [f.shap_values for f in folds]
+        partitions = folds[0].partitions
+        # calculate shap values out of fold
+        mean_shap_values = np.nanmean(fold_shap_values, axis=0)
+        shap_values_transposed = mean_shap_values.T
+        fold_number = partitions.n_cv_folds
+
+        X_transposed = partitions.X_T.values
+
+        select_by_shap = partitions.features.select_by == "shap"
+
+        score_name = 'shap_absolute_sum_to_' + partitions.features.to_predict if select_by_shap else f'{partitions.features.importance_type}_score_to_' + partitions.features.to_predict
+        kendal_tau_name = 'kendall_tau_to_' + partitions.features.to_predict
+
+        # get features that have stable weight across self.bootstraps
+        output_features_by_weight = []
+        for i, column in enumerate(folds[0].shap_dataframe.columns):
+            non_zero_cols = 0
+            cols = []
+            for f in folds:
+                weight = f.feature_weights[i] if select_by_shap else folds[0].shap_absolute_sum[column]
+                cols.append(weight)
+                if weight != 0:
+                    non_zero_cols += 1
+            if non_zero_cols == fold_number:
+                if 'ENSG' in partitions.X.columns[
+                    i]:  # TODO: change from hard-coded ENSG checkup to something more meaningful
+                    output_features_by_weight.append({
+                        'ensembl_id': partitions.X.columns[i],
+                        score_name: np.mean(cols),
+                        # 'name': partitions.X.columns[i], #ensemble_data.gene_name_of_gene_id(X.columns[i]),
+                        kendal_tau_name: kendalltau(shap_values_transposed[i], X_transposed[i], nan_policy='omit')[0]
+                    })
+        if(len(output_features_by_weight)==0):
+            logger.error(f"could not find genes which are in all folds,  creating empty dataframe instead!")
+            empty_selected = pd.DataFrame(columns=["symbol", score_name, kendal_tau_name])
+            return FeatureResults(empty_selected, folds, partitions, parameters)
+        selected_features = pd.DataFrame(output_features_by_weight)
+        selected_features = selected_features.set_index("ensembl_id")
+        if isinstance(partitions.data.genes_meta, pd.DataFrame):
+            selected_features = partitions.data.genes_meta.drop(columns=["species"]) \
+                .join(selected_features, how="inner") \
+                .sort_values(by=[score_name], ascending=False)
+        #selected_features.index = "ensembl_id"
+        results = FeatureResults(selected_features, folds, partitions, parameters)
+        logger.info(f"Metrics: \n{results.metrics_average}")
+        return results
+# from shap import Explanation
diff --git a/yspecies/helpers.py b/yspecies/helpers.py
@@ -6,8 +6,8 @@
 from yspecies.partition import DataPartitioner
 from yspecies.partition import PartitionParameters
 from yspecies.preprocess import DataExtractor
-from yspecies.results import FeatureSummary
-from yspecies.selection import CrossValidator, ShapSelector
+from yspecies.explanations import ShapSelector, FeatureSummary
+from yspecies.selection import CrossValidator
 from yspecies.tuning import MultiObjectiveResults
 from yspecies.workflow import TupleWith, Repeat, Collect
 
@@ -23,29 +23,27 @@ class PipelineFactory:
     def partition_parameters(self):
         return PartitionParameters(self.n_folds, self.n_hold_out, 2,   42)
 
-    def load_study_by_trait(self, trait: str, study_name: str = None):
-        path = self.locations.interim.optimization / (trait+".sqlite")
-        study_name = f"{trait}_r2_huber_kendall" if study_name is None else study_name
-        return self.load_study(path, study_name)
-
-    def load_study(self, path: Path, name: str):
+    def load_study(self, path: Path, study_name: str):
         url = f'sqlite:///' +str(path.absolute())
         print('loading (if exists) study from '+url)
         storage = optuna.storages.RDBStorage(
             url=url
             #engine_kwargs={'check_same_thread': False}
         )
-        return optuna.multi_objective.study.create_study(directions=['maximize', 'minimize', 'maximize'], storage=storage, study_name=name, load_if_exists = True)
+        return optuna.multi_objective.study.create_study(directions=['maximize', 'minimize', 'maximize'], storage=storage, study_name=study_name, load_if_exists = True)
 
-    def make_partitioning_shap_pipeline(self, trait: str, study_name: str = None):
+    def make_partition_shap_pipe(self, trait: str = None, study_name: str = None, study_path: Path = None):
+        assert trait is not None or study_path is not None, "either trait or study path should be not None"
         study_name = f"{trait}_r2_huber_kendall" if study_name is None else study_name
-        study = self.load_study_by_trait(trait, study_name)
+        study_path = self.locations.interim.optimization / (trait+".sqlite") if study_path is None else study_path
+        study = self.load_study(study_path, study_name)
         if len(study.get_pareto_front_trials())>0 :
             metrics, params = MultiObjectiveResults.from_study(study).best_metrics_params_r2()
             params["verbose"] = -1
             if "early_stopping_round" not in params:
                 params["early_stopping_round"] = 10
         else:
+            print("FALLING BACK TO DEFAULT PARAMETERS")
             params = {"bagging_fraction": 0.9522534844058304,
                       "boosting_type": "dart",
                       "objective": "regression",
@@ -67,8 +65,8 @@ def make_partitioning_shap_pipeline(self, trait: str, study_name: str = None):
         ]
         )
 
-    def make_shap_pipeline(self, trait: str, study_name: str = None):
-        partition_shap_pipe = self.make_partitioning_shap_pipeline(trait, study_name)
+    def make_shap_pipeline(self, trait: str = None, study_name: str = None, study_path: Path = None):
+        partition_shap_pipe = self.make_partition_shap_pipe(trait, study_name, study_path)
         return Pipeline(
             [
                 ('extractor', DataExtractor()),
@@ -77,8 +75,8 @@ def make_shap_pipeline(self, trait: str, study_name: str = None):
             ]
         )
 
-    def make_repeated_shap_pipeline(self, trait: str, study_name: str = None):
-        partition_shap_pipe = self.make_partitioning_shap_pipeline(trait, study_name)
+    def make_repeated_shap_pipeline(self, trait: str = None, study_name: str = None, study_path: Path = None):
+        partition_shap_pipe = self.make_partition_shap_pipe(trait, study_name, study_path = study_path)
         repeated_cv = Repeat(partition_shap_pipe, self.repeats, lambda x,i: (x[0], replace(x[1], seed=i)))
         return Pipeline(
             [

diff --git a/yspecies/pipelines.py b/yspecies/pipelines.py
@@ -4,7 +4,7 @@
 
 from yspecies.partition import DataPartitioner, PartitionParameters
 from yspecies.preprocess import DataExtractor
-from yspecies.results import FeatureSummary
+from yspecies.explanations import FeatureSummary
 from yspecies.selection import ShapSelector
 from yspecies.utils import *
 from yspecies.workflow import TupleWith, Repeat, Collect