Skip to content

Commit

Permalink
serios refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
antonkulaga committed Sep 20, 2020
1 parent b084fd1 commit 863b5e0
Show file tree
Hide file tree
Showing 9 changed files with 3,616 additions and 68,773 deletions.
2 changes: 1 addition & 1 deletion environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies:
- scikit-learn>=0.23.2
- seaborn>=0.10.1
- more-itertools>=8.4.0
- shap>=0.35.0
- shap>=0.36.0
- lightgbm>=3.0.0
- statsmodels>=0.11.1
- optuna>=2.1.0
Expand Down
2,746 changes: 2,697 additions & 49 deletions notebooks/stage_one_shap_selection.ipynb

Large diffs are not rendered by default.

69,352 changes: 740 additions & 68,612 deletions notebooks/stage_two_shap_selection.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def optimize(folds, hold_outs, locations, metrics, repeats, to_select, trait, tr
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.selection import ShapSelector
from yspecies.tuning import Tune
from yspecies.results import FeatureSummary, FeatureResults
from yspecies.explanations import FeatureSummary, FeatureResults
import optuna
from optuna import Trial

Expand Down
155 changes: 134 additions & 21 deletions yspecies/results.py → yspecies/explanations.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
import matplotlib.pyplot as plt
from loguru import logger
from more_itertools import flatten
from functools import cached_property
from dataclasses import *
from functools import cached_property
from pathlib import Path

import matplotlib.pyplot as plt
import shap
from more_itertools import flatten
from sklearn.base import TransformerMixin

import yspecies
from yspecies.selection import Fold
from yspecies.utils import *
from yspecies.models import Metrics
from yspecies.partition import ExpressionPartitions
from yspecies.models import Metrics, BasicMetrics
from yspecies.utils import *
from loguru import logger
from yspecies.selection import Fold

from scipy.stats import kendalltau

@dataclass(frozen=True)
class FeatureResults:
Expand Down Expand Up @@ -49,7 +54,7 @@ def kendall_tau_abs_mean(self):
return self.selected[f"kendall_tau_to_{self.to_predict}"].abs().mean()

@property
def head(self) -> Fold:
def head(self):
return self.folds[0]

@cached_property
Expand Down Expand Up @@ -80,7 +85,8 @@ def metrics_df(self) -> pd.DataFrame:

@cached_property
def hold_out_metrics(self) -> pd.DataFrame:
return yspecies.selection.Metrics.to_dataframe([f.validation_metrics for f in self.folds]).join(pd.Series(data =self.partitions.hold_out_species * self.partitions.n_cv_folds, name="hold_out_species"))
return yspecies.selection.Metrics.to_dataframe([f.validation_metrics for f in self.folds])\
.join(pd.Series(data =self.partitions.hold_out_species * self.partitions.n_cv_folds, name="hold_out_species"))


def __repr__(self):
Expand Down Expand Up @@ -133,9 +139,14 @@ def selected_extended(self):
return self.selected.join(self.stable_shap_dataframe_T, how="left")

@cached_property
def stable_shap_values(self):
def stable_shap_values(self) -> np.ndarray:
return np.mean(np.nan_to_num(self.shap_values, 0.0), axis=0) if self.nan_as_zero else np.nanmean(self.shap_values, axis=0)

@cached_property
def stable_interaction_values(self):
return np.mean(np.nan_to_num(self.interaction_values, 0.0), axis=0) if self.nan_as_zero else np.nanmean(self.interaction_values, axis=0)


@cached_property
def shap_dataframes(self) -> List[np.ndarray]:
return [f.shap_dataframe for f in self.folds]
Expand All @@ -144,9 +155,32 @@ def shap_dataframes(self) -> List[np.ndarray]:
def shap_values(self) -> List[np.ndarray]:
return [f.shap_values for f in self.folds]

@cached_property
def interaction_values(self) -> List[np.ndarray]:
return [f.interaction_values for f in self.folds]

@cached_property
def feature_names(self):
return self.partitions.data.genes_meta["symbol"].values
gene_names = self.partitions.data.genes_meta["symbol"].values
col = self.partitions.data.X.columns[-1]
return np.append(gene_names, col) if "encoded" in col else gene_names

@cached_property
def expected_values_mean(self):
return np.mean([f.expected_value for f in self.folds])

@cached_property
def expected_values(self):
return [f.expected_value for f in self.folds]

def make_figure(self, save: Path):
fig = plt.gcf()
if save is not None:
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')
plt.savefig(save)
plt.close()
return fig

def _plot_(self, shap_values: List[np.ndarray] or np.ndarray, gene_names: bool = True, save: Path = None,
max_display=None, title=None, layered_violin_max_num_bins = 20,
Expand All @@ -161,13 +195,33 @@ def _plot_(self, shap_values: List[np.ndarray] or np.ndarray, gene_names: bool =
plot_type=plot_type,
color=color, axis_color=axis_color, alpha=alpha
)
fig = plt.gcf()
if save is not None:
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')
plt.savefig(save)
plt.close()
return fig
return self.make_figure(save)

def _plot_decision_(self, expected_value: float, shap_values: List[np.ndarray] or np.ndarray, gene_names: bool = True, save: Path = None):
#shap.summary_plot(shap_values, self.partitions.X, show=False)
feature_names = None if gene_names is False else self.feature_names
min_max = (self.partitions.data.y.min(), self.partitions.data.y.max())
shap.decision_plot(expected_value, shap_values, xlim=min_max, feature_names=feature_names.tolist(), show=False)
return self.make_figure(save)

def plot_decision(self, save: Path = None):
return self._plot_decision_(self.expected_values_mean, self.stable_shap_values, True, save)

def plot_fold_decision(self, num: int):
assert num < len(self.folds), "index should be withing folds range!"
f = self.folds[num]
self._plot_decision_(f.expected_value, f.shap_values)

def plot_dependency(self, feature: str, interaction_index:str = "auto", save: Path = None):
shap.dependence_plot(feature, self.stable_shap_values, self.partitions.X, feature_names=self.feature_names, interaction_index=interaction_index)
return self.make_figure(save)

def plot_interactions(self, save: Path = None):
return self._plot_decision_(self.expected_values_mean, self.stable_interaction_values, True, save)

def plot_fold_interactions(self, num: int, gene_names: bool = True, save: Path = None):
f = self.folds[num]
return self._plot_decision_(f.expected_value, f.interaction_values, gene_names, save)

def plot(self, gene_names: bool = True, save: Path = None,
title=None, max_display=100, layered_violin_max_num_bins = 20,
Expand Down Expand Up @@ -377,7 +431,66 @@ def _plot_(self, shap_values: List[np.ndarray] or np.ndarray, gene_names: bool =
return fig

def plot(self, gene_names: bool = True, save: Path = None,
title=None, max_display=100, layered_violin_max_num_bins = 20,
plot_type=None, color=None, axis_color="#333333", alpha=1, show=True, class_names=None, plot_size = None):
max_display=50, title=None, plot_size = 0.5, layered_violin_max_num_bins = 20,
plot_type=None, color=None, axis_color="#333333", alpha=1, class_names=None):
return self._plot_(self.stable_shap_values, gene_names, save, max_display, title,
layered_violin_max_num_bins, plot_type, color, axis_color, alpha, class_names, plot_size)
layered_violin_max_num_bins, plot_type, color, axis_color, alpha, class_names = class_names, plot_size=plot_size)


@dataclass
class ShapSelector(TransformerMixin):

def fit(self, folds_with_params: Tuple[List[Fold], Dict], y=None) -> 'ShapSelector':
return self

@logger.catch
def transform(self, folds_with_params: Tuple[List[Fold], Dict]) -> FeatureResults:
folds, parameters = folds_with_params
fold_shap_values = [f.shap_values for f in folds]
partitions = folds[0].partitions
# calculate shap values out of fold
mean_shap_values = np.nanmean(fold_shap_values, axis=0)
shap_values_transposed = mean_shap_values.T
fold_number = partitions.n_cv_folds

X_transposed = partitions.X_T.values

select_by_shap = partitions.features.select_by == "shap"

score_name = 'shap_absolute_sum_to_' + partitions.features.to_predict if select_by_shap else f'{partitions.features.importance_type}_score_to_' + partitions.features.to_predict
kendal_tau_name = 'kendall_tau_to_' + partitions.features.to_predict

# get features that have stable weight across self.bootstraps
output_features_by_weight = []
for i, column in enumerate(folds[0].shap_dataframe.columns):
non_zero_cols = 0
cols = []
for f in folds:
weight = f.feature_weights[i] if select_by_shap else folds[0].shap_absolute_sum[column]
cols.append(weight)
if weight != 0:
non_zero_cols += 1
if non_zero_cols == fold_number:
if 'ENSG' in partitions.X.columns[
i]: # TODO: change from hard-coded ENSG checkup to something more meaningful
output_features_by_weight.append({
'ensembl_id': partitions.X.columns[i],
score_name: np.mean(cols),
# 'name': partitions.X.columns[i], #ensemble_data.gene_name_of_gene_id(X.columns[i]),
kendal_tau_name: kendalltau(shap_values_transposed[i], X_transposed[i], nan_policy='omit')[0]
})
if(len(output_features_by_weight)==0):
logger.error(f"could not find genes which are in all folds, creating empty dataframe instead!")
empty_selected = pd.DataFrame(columns=["symbol", score_name, kendal_tau_name])
return FeatureResults(empty_selected, folds, partitions, parameters)
selected_features = pd.DataFrame(output_features_by_weight)
selected_features = selected_features.set_index("ensembl_id")
if isinstance(partitions.data.genes_meta, pd.DataFrame):
selected_features = partitions.data.genes_meta.drop(columns=["species"]) \
.join(selected_features, how="inner") \
.sort_values(by=[score_name], ascending=False)
#selected_features.index = "ensembl_id"
results = FeatureResults(selected_features, folds, partitions, parameters)
logger.info(f"Metrics: \n{results.metrics_average}")
return results
# from shap import Explanation
28 changes: 13 additions & 15 deletions yspecies/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from yspecies.partition import DataPartitioner
from yspecies.partition import PartitionParameters
from yspecies.preprocess import DataExtractor
from yspecies.results import FeatureSummary
from yspecies.selection import CrossValidator, ShapSelector
from yspecies.explanations import ShapSelector, FeatureSummary
from yspecies.selection import CrossValidator
from yspecies.tuning import MultiObjectiveResults
from yspecies.workflow import TupleWith, Repeat, Collect

Expand All @@ -23,29 +23,27 @@ class PipelineFactory:
def partition_parameters(self):
return PartitionParameters(self.n_folds, self.n_hold_out, 2, 42)

def load_study_by_trait(self, trait: str, study_name: str = None):
path = self.locations.interim.optimization / (trait+".sqlite")
study_name = f"{trait}_r2_huber_kendall" if study_name is None else study_name
return self.load_study(path, study_name)

def load_study(self, path: Path, name: str):
def load_study(self, path: Path, study_name: str):
url = f'sqlite:///' +str(path.absolute())
print('loading (if exists) study from '+url)
storage = optuna.storages.RDBStorage(
url=url
#engine_kwargs={'check_same_thread': False}
)
return optuna.multi_objective.study.create_study(directions=['maximize', 'minimize', 'maximize'], storage=storage, study_name=name, load_if_exists = True)
return optuna.multi_objective.study.create_study(directions=['maximize', 'minimize', 'maximize'], storage=storage, study_name=study_name, load_if_exists = True)

def make_partitioning_shap_pipeline(self, trait: str, study_name: str = None):
def make_partition_shap_pipe(self, trait: str = None, study_name: str = None, study_path: Path = None):
assert trait is not None or study_path is not None, "either trait or study path should be not None"
study_name = f"{trait}_r2_huber_kendall" if study_name is None else study_name
study = self.load_study_by_trait(trait, study_name)
study_path = self.locations.interim.optimization / (trait+".sqlite") if study_path is None else study_path
study = self.load_study(study_path, study_name)
if len(study.get_pareto_front_trials())>0 :
metrics, params = MultiObjectiveResults.from_study(study).best_metrics_params_r2()
params["verbose"] = -1
if "early_stopping_round" not in params:
params["early_stopping_round"] = 10
else:
print("FALLING BACK TO DEFAULT PARAMETERS")
params = {"bagging_fraction": 0.9522534844058304,
"boosting_type": "dart",
"objective": "regression",
Expand All @@ -67,8 +65,8 @@ def make_partitioning_shap_pipeline(self, trait: str, study_name: str = None):
]
)

def make_shap_pipeline(self, trait: str, study_name: str = None):
partition_shap_pipe = self.make_partitioning_shap_pipeline(trait, study_name)
def make_shap_pipeline(self, trait: str = None, study_name: str = None, study_path: Path = None):
partition_shap_pipe = self.make_partition_shap_pipe(trait, study_name, study_path)
return Pipeline(
[
('extractor', DataExtractor()),
Expand All @@ -77,8 +75,8 @@ def make_shap_pipeline(self, trait: str, study_name: str = None):
]
)

def make_repeated_shap_pipeline(self, trait: str, study_name: str = None):
partition_shap_pipe = self.make_partitioning_shap_pipeline(trait, study_name)
def make_repeated_shap_pipeline(self, trait: str = None, study_name: str = None, study_path: Path = None):
partition_shap_pipe = self.make_partition_shap_pipe(trait, study_name, study_path = study_path)
repeated_cv = Repeat(partition_shap_pipe, self.repeats, lambda x,i: (x[0], replace(x[1], seed=i)))
return Pipeline(
[
Expand Down
2 changes: 1 addition & 1 deletion yspecies/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.preprocess import DataExtractor
from yspecies.results import FeatureSummary
from yspecies.explanations import FeatureSummary
from yspecies.selection import ShapSelector
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
Expand Down
Loading

0 comments on commit 863b5e0

Please sign in to comment.