def _to_cat(X):
    """Return ``X`` unchanged.

    Serves as the final "to_category" step of the categorical pipeline built
    in ``BoostedDPDTModel._preprocess``. Despite its name it performs no
    conversion.
    """
    # NOTE(review): presumably a module-level function (not a lambda) so the
    # fitted FunctionTransformer stays picklable - confirm before renaming.
    return X


class BoostedDPDTModel(AbstractModel):
    """AutoGluon model wrapper around ``dpdt.AdaBoostDPDT``.

    Only classification is supported ("binary" and "multiclass"). Features are
    imputed/encoded by a scikit-learn ``ColumnTransformer`` that is built and
    fitted lazily on the first ``_preprocess`` call.
    """

    ag_key = "BOOSTEDDPDT"
    ag_name = "boosted_dpdt"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Built and fitted on the first `_preprocess` call, then reused.
        self._preprocessor = None

    def get_model_cls(self):
        """Return the estimator class to instantiate for ``self.problem_type``.

        Raises:
            AssertionError: if the problem type is neither "binary" nor
                "multiclass".
        """
        # Imported lazily so this module can be imported without `dpdt`.
        from dpdt import AdaBoostDPDT

        if self.problem_type in ["binary", "multiclass"]:
            model_cls = AdaBoostDPDT
        else:
            raise AssertionError(f"Unsupported problem_type: {self.problem_type}")
        return model_cls

    def _preprocess(self, X, **kwargs):
        """Impute and encode ``X`` and return the transformed pandas frame.

        On the first call the transformer is created and fitted on ``X``;
        later calls only transform.

        * numeric columns: mean imputation plus missing-value indicators
        * object/category columns: ordinal encoding (unknown categories map to
          -1), constant imputation with -1 plus indicators, then ``_to_cat``
          (an identity step)
        * any other column is passed through untouched
        """
        X = super()._preprocess(X, **kwargs)
        if self._preprocessor is None:
            import numpy as np
            from sklearn.compose import ColumnTransformer, make_column_selector
            from sklearn.impute import SimpleImputer
            from sklearn.pipeline import Pipeline
            from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

            categorical_pipeline = Pipeline(
                [
                    (
                        "encoder",
                        OrdinalEncoder(
                            handle_unknown="use_encoded_value", unknown_value=-1
                        ),
                    ),
                    (
                        "imputer",
                        SimpleImputer(
                            strategy="constant", add_indicator=True, fill_value=-1
                        ),
                    ),
                    (
                        "to_category",
                        FunctionTransformer(_to_cat),
                    ),
                ]
            ).set_output(transform="pandas")

            self._preprocessor = ColumnTransformer(
                transformers=[
                    (
                        "num",
                        SimpleImputer(strategy="mean", add_indicator=True),
                        make_column_selector(dtype_include=np.number),
                    ),
                    (
                        "cat",
                        categorical_pipeline,
                        make_column_selector(dtype_include=["object", "category"]),
                    ),
                ],
                remainder="passthrough",
            ).set_output(transform="pandas")
            self._preprocessor.fit(X)

        return self._preprocessor.transform(X)

    def _fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        num_cpus: int = 1,
        time_limit: float | None = None,
        **kwargs,
    ):
        """Instantiate the estimator from the model params and fit it.

        Args:
            X: training features; preprocessed via ``self.preprocess`` first.
            y: training labels.
            num_cpus: ``n_jobs`` is set to ``"best"`` when more than one CPU
                is granted, otherwise to ``num_cpus`` itself.
            time_limit: forwarded unchanged to the estimator constructor.
            **kwargs: accepted for interface compatibility; unused here.
        """
        model_cls = self.get_model_cls()
        hyp = self._get_model_params()

        # NOTE(review): assumes the pinned dpdt version accepts `n_jobs` and
        # `time_limit` constructor arguments - confirm against upstream.
        self.model = model_cls(
            **hyp,
            n_jobs="best" if num_cpus > 1 else num_cpus,
            time_limit=time_limit,
        )
        X = self.preprocess(X)
        self.model = self.model.fit(
            X=X,
            y=y,
        )

    def _set_default_params(self):
        """Register default hyperparameters (fixed seed, 1000 estimators)."""
        default_params = {
            "random_state": 42,
            "n_estimators": 1000,
        }
        for param, val in default_params.items():
            self._set_default_param_value(param, val)

    @classmethod
    def supported_problem_types(cls) -> list[str] | None:
        """Return the problem types this model can be fitted on."""
        return ["binary", "multiclass"]

    def _get_default_resources(self) -> tuple[int, int]:
        """Return ``(num_cpus, num_gpus)``: physical core count and no GPU."""
        # logical=False is faster in training
        num_cpus = ResourceManager.get_cpu_count_psutil(logical=False)
        num_gpus = 0
        return num_cpus, num_gpus

    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
        """Estimate fit memory for ``X`` using this model's hyperparameters."""
        hyperparameters = self._get_model_params()
        return self.estimate_memory_usage_static(
            X=X,
            problem_type=self.problem_type,
            num_classes=self.num_classes,
            hyperparameters=hyperparameters,
            **kwargs,
        )

    @classmethod
    def _estimate_memory_usage_static(
        cls,
        *,
        X: pd.DataFrame,
        hyperparameters: dict | None = None,
        **kwargs,
    ) -> int:
        """Heuristically estimate the memory needed to fit on ``X``.

        The estimate is ``40 * a * b * approx_mem(X)`` plus a flat 300 MB,
        where ``a`` and ``b`` are the first two entries of the
        ``cart_nodes_list`` hyperparameter. When the hyperparameter is absent
        the factors default to ``2.5`` and ``1``.

        Args:
            X: data whose approximate in-memory size scales the estimate.
            hyperparameters: model hyperparameters; ``None`` means none set.
            **kwargs: ignored.

        Returns:
            The estimate, truncated to ``int``.
        """
        # TODO: add a callback that stops when running out of memory.
        if hyperparameters is None:
            hyperparameters = {}

        cart_nodes = hyperparameters.get("cart_nodes_list")
        if cart_nodes is None or len(cart_nodes) == 0:
            # Same factors the estimate has always used when nothing is set.
            first_factor, second_factor = 2.5, 1
        else:
            first_factor = cart_nodes[0]
            # A one-element list used to raise IndexError here; treat the
            # missing second entry as a neutral factor instead.
            second_factor = cart_nodes[1] if len(cart_nodes) > 1 else 1

        dataset_size_mem_est = (
            40
            * first_factor
            * second_factor
            * get_approximate_df_mem_usage(X).sum()
        )
        baseline_overhead_mem_est = 3e8  # 300 MB generic overhead

        return int(dataset_size_mem_est + baseline_overhead_mem_est)

    @classmethod
    def _class_tags(cls):
        """Advertise that memory usage can be estimated without an instance."""
        return {"can_estimate_memory_usage_static": True}

    def _more_tags(self) -> dict:
        """DPDT does not yet support refit full."""
        return {"can_refit_full": False}
def generate_configs_bdpdt(num_random_configs=200):
    """Sample random hyperparameter configurations for ``BoostedDPDTModel``.

    Every hyperparameter is drawn from its own fixed-seed
    ``numpy.random.RandomState``. This yields the same values as reseeding the
    global NumPy generator with that seed, but leaves the caller's global
    random state untouched.

    Args:
        num_random_configs: number of configurations to generate.

    Returns:
        A list of ``num_random_configs`` dicts, each passed through
        ``convert_numpy_dtypes``.
    """
    # TODO: transform this to a ConfigSpace configuration space or similar
    # TODO: and/or switch to better random seed logic
    n = num_random_configs

    # learning_rate: log-normal with mu = log(0.01), sigma = log(10.0)
    learning_rate_samples = np.random.RandomState(42).lognormal(
        mean=float(np.log(0.01)), sigma=float(np.log(10.0)), size=n
    )

    # min_samples_leaf: log-uniform over [1.5, 50.5], quantized with step q
    min_val = 1.5
    max_val = 50.5
    q = 1
    log_uniform_samples = np.random.RandomState(43).uniform(
        np.log(min_val), np.log(max_val), size=n
    )
    min_samples_leaf_samples = np.round(np.exp(log_uniform_samples) / q) * q
    min_samples_leaf_samples = np.clip(
        min_samples_leaf_samples, min_val, max_val
    ).astype(int)

    # min_weight_fraction_leaf: values [0.0, 0.01], probabilities [0.95, 0.05]
    min_weight_fraction_leaf_samples = np.random.RandomState(44).choice(
        [0.0, 0.01], size=n, p=[0.95, 0.05]
    )

    # max_features: values ["sqrt", "log2", 10000], probabilities
    # [0.5, 0.25, 0.25]. NumPy coerces the mixed list to strings, so the
    # integer option is parsed back when the configs are assembled below.
    max_features_samples = np.random.RandomState(45).choice(
        ["sqrt", "log2", 10000], size=n, p=[0.5, 0.25, 0.25]
    )

    max_depth_samples = np.random.RandomState(46).choice(
        [2, 3], size=n, p=[0.4, 0.6]
    )

    min_samples_split_samples = np.random.RandomState(47).choice(
        [2, 3], size=n, p=[0.95, 0.05]
    )

    min_impurity_decrease_samples = np.random.RandomState(48).choice(
        [0, 0.01, 0.02, 0.05], size=n, p=[0.85, 0.05, 0.05, 0.05]
    )

    # cart_nodes_list: uniform pick among a few fixed layouts
    choices = [[8, 4], [4, 8], [16, 2], [4, 4, 2]]
    indices = np.random.RandomState(49).choice(len(choices), size=n)
    cart_nodes_list = [choices[i] for i in indices]

    configs = []
    for (
        learning_rate,
        max_depth,
        min_samples_split,
        min_impurity_decrease,
        cart_nodes,
        min_samples_leaf,
        min_weight_fraction_leaf,
        raw_max_features,
    ) in zip(
        learning_rate_samples,
        max_depth_samples,
        min_samples_split_samples,
        min_impurity_decrease_samples,
        cart_nodes_list,
        min_samples_leaf_samples,
        min_weight_fraction_leaf_samples,
        max_features_samples,
    ):
        try:
            max_features = int(raw_max_features)
        except ValueError:
            # "sqrt" / "log2" stay as strings.
            max_features = raw_max_features
        configs.append(
            {
                "learning_rate": learning_rate,
                "max_depth": max_depth,
                "min_samples_split": min_samples_split,
                "min_impurity_decrease": min_impurity_decrease,
                "cart_nodes_list": cart_nodes,
                "min_samples_leaf": min_samples_leaf,
                "min_weight_fraction_leaf": min_weight_fraction_leaf,
                "max_features": max_features,
            }
        )

    return [convert_numpy_dtypes(config) for config in configs]


# Config generator for BoostedDPDTModel: the random search space above plus
# one manual config using the model defaults (the empty dict).
gen_boosteddpdt = CustomAGConfigGenerator(
    model_cls=BoostedDPDTModel,
    search_space_func=generate_configs_bdpdt,
    manual_configs=[{}],
)


if __name__ == "__main__":
    print(generate_configs_bdpdt(3))
def test_dpdt():
    """Smoke-test ``BoostedDPDTModel`` through AutoGluon's ``FitHelper``.

    The test is skipped rather than failed when an ``ImportError`` is raised
    anywhere inside the ``try`` block, i.e. by the imports or while
    ``verify_model`` runs.
    """
    # Tiny model so the check stays fast.
    tiny_hyperparameters = {"n_estimators": 2, "cart_nodes_list": (4, 3)}

    try:
        from autogluon.tabular.testing import FitHelper
        from tabrepo.benchmark.models.ag import BoostedDPDTModel

        FitHelper.verify_model(
            model_cls=BoostedDPDTModel,
            model_hyperparameters=tiny_hyperparameters,
        )
    except ImportError as err:
        pytest.skip(
            f"Import Error, skipping test... "
            f"Ensure you have the proper dependencies installed to run this test:\n"
            f"{err}"
        )