6 changes: 6 additions & 0 deletions setup.py
@@ -41,6 +41,11 @@
"modernnca": [
"category_encoders",
],
"dpdt": [
# TODO: pypi package is not available yet
"git+https://github.com/KohlerHECTOR/DPDTreeEstimator.git",
# used hash: a74791d2190da27b43accd4da9e7d141380326ea
],
}

benchmark_requires = []
@@ -53,6 +58,7 @@
"tabdpt",
"tabm",
"modernnca",
"dpdt",
]:
benchmark_requires += extras_require[extra_package]
benchmark_requires = list(set(benchmark_requires))
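
Note: a bare VCS URL is not a valid PEP 508 requirement string, so pip will reject this extra as written; the usual form is a `name @ url` direct reference. A minimal sketch of the pinned entry, assuming the distribution installed by DPDTreeEstimator is named `dpdt` (matching `from dpdt import AdaBoostDPDT` in dpdt_model.py below):

    # Sketch only: the distribution name "dpdt" is an assumption; the trailing
    # @<hash> pins the install to the commit recorded in the comment above.
    "dpdt": [
        "dpdt @ git+https://github.com/KohlerHECTOR/DPDTreeEstimator.git"
        "@a74791d2190da27b43accd4da9e7d141380326ea",
    ],
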
2 changes: 2 additions & 0 deletions tabrepo/benchmark/models/ag/__init__.py
@@ -8,6 +8,7 @@
from tabrepo.benchmark.models.ag.tabm.tabm_model import TabMModel
from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_client_model import TabPFNV2ClientModel
from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_model import TabPFNV2Model
from tabrepo.benchmark.models.ag.dpdt.dpdt_model import BoostedDPDTModel

__all__ = [
"ExplainableBoostingMachineModel",
@@ -18,4 +19,5 @@
"TabMModel",
"TabPFNV2ClientModel",
"TabPFNV2Model",
"BoostedDPDTModel"
]
Empty file added tabrepo/benchmark/models/ag/dpdt/__init__.py
Empty file.
161 changes: 161 additions & 0 deletions tabrepo/benchmark/models/ag/dpdt/dpdt_model.py
@@ -0,0 +1,161 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
from autogluon.common.utils.resource_utils import ResourceManager
from autogluon.core.models import AbstractModel

if TYPE_CHECKING:
import pandas as pd


def _to_cat(X):
    # Module-level (picklable) identity transform used as a named pipeline
    # step; the data is returned unchanged.
    return X


class BoostedDPDTModel(AbstractModel):
ag_key = "BOOSTEDDPDT"
ag_name = "boosted_dpdt"

def __init__(self, **kwargs):
super().__init__(**kwargs)
self._preprocessor = None

def get_model_cls(self):
from dpdt import AdaBoostDPDT

if self.problem_type in ["binary", "multiclass"]:
model_cls = AdaBoostDPDT
else:
raise AssertionError(f"Unsupported problem_type: {self.problem_type}")
return model_cls

def _preprocess(self, X, **kwargs):
X = super()._preprocess(X, **kwargs)
if self._preprocessor is None:
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

categorical_pipeline = Pipeline(
[
(
"encoder",
OrdinalEncoder(
handle_unknown="use_encoded_value", unknown_value=-1
),
),
(
"imputer",
SimpleImputer(
strategy="constant", add_indicator=True, fill_value=-1
),
),
(
"to_category",
FunctionTransformer(_to_cat),
),
]
).set_output(transform="pandas")

self._preprocessor = ColumnTransformer(
transformers=[
(
"num",
SimpleImputer(strategy="mean", add_indicator=True),
make_column_selector(dtype_include=np.number),
),
(
"cat",
categorical_pipeline,
make_column_selector(dtype_include=["object", "category"]),
),
],
remainder="passthrough",
).set_output(transform="pandas")
self._preprocessor.fit(X)

return self._preprocessor.transform(X)

def _fit(
self,
X: pd.DataFrame,
y: pd.Series,
num_cpus: int = 1,
time_limit: float | None = None,
**kwargs,
):
model_cls = self.get_model_cls()
hyp = self._get_model_params()

self.model = model_cls(
**hyp,
n_jobs="best" if num_cpus > 1 else num_cpus,
time_limit=time_limit,
)
X = self.preprocess(X)
self.model = self.model.fit(
X=X,
y=y,
)

def _set_default_params(self):
default_params = {
"random_state": 42,
"n_estimators": 1000,
}
for param, val in default_params.items():
self._set_default_param_value(param, val)

@classmethod
def supported_problem_types(cls) -> list[str] | None:
return ["binary", "multiclass"]

def _get_default_resources(self) -> tuple[int, int]:
# logical=False is faster in training
num_cpus = ResourceManager.get_cpu_count_psutil(logical=False)
num_gpus = 0
return num_cpus, num_gpus

def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
hyperparameters = self._get_model_params()
return self.estimate_memory_usage_static(
X=X,
problem_type=self.problem_type,
num_classes=self.num_classes,
hyperparameters=hyperparameters,
**kwargs,
)

@classmethod
def _estimate_memory_usage_static(
cls,
*,
X: pd.DataFrame,
hyperparameters: dict | None = None,
**kwargs,
) -> int:
        # TODO: add a callback that stops when running out of memory.
        if hyperparameters is None:
            hyperparameters = {}

        # Rough heuristic: scale the frame's memory footprint by the first two
        # entries of `cart_nodes_list`. Fetch the list once so a single-entry
        # list cannot raise an IndexError; defaults match the prior behavior.
        cart_nodes_list = hyperparameters.get("cart_nodes_list", [2.5, 1])
        branching_factor = cart_nodes_list[0] * (
            cart_nodes_list[1] if len(cart_nodes_list) > 1 else 1
        )
        dataset_size_mem_est = (
            40 * branching_factor * get_approximate_df_mem_usage(X).sum()
        )
baseline_overhead_mem_est = 3e8 # 300 MB generic overhead

return int(dataset_size_mem_est + baseline_overhead_mem_est)

@classmethod
def _class_tags(cls):
return {"can_estimate_memory_usage_static": True}

def _more_tags(self) -> dict:
"""DPDT does not yet support refit full."""
return {"can_refit_full": False}
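
For reference, the preprocessing that `_preprocess` builds can be exercised on its own. A minimal sketch on an illustrative two-column frame (only pandas and scikit-learn needed; column names are made up, and the identity `to_category` step is omitted since `_to_cat` returns its input unchanged):

    import numpy as np
    import pandas as pd
    from sklearn.compose import ColumnTransformer, make_column_selector
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OrdinalEncoder

    X = pd.DataFrame({"age": [25.0, np.nan, 40.0], "color": ["red", np.nan, "blue"]})

    categorical = Pipeline(
        [
            ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
            ("imputer", SimpleImputer(strategy="constant", add_indicator=True, fill_value=-1)),
        ]
    ).set_output(transform="pandas")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="mean", add_indicator=True),
             make_column_selector(dtype_include=np.number)),
            ("cat", categorical, make_column_selector(dtype_include=["object", "category"])),
        ],
        remainder="passthrough",
    ).set_output(transform="pandas")

    # Numeric NaNs become the column mean plus a missing indicator; categorical
    # NaNs are ordinal-encoded, then imputed to -1 with their own indicator.
    print(preprocessor.fit_transform(X))
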
2 changes: 2 additions & 0 deletions tabrepo/benchmark/models/model_register.py
@@ -13,6 +13,7 @@
TabMModel,
TabPFNV2ClientModel,
TabPFNV2Model,
BoostedDPDTModel,
)

tabrepo_model_register: ModelRegistry = copy.deepcopy(ag_model_registry)
@@ -26,6 +27,7 @@
TabDPTModel,
TabMModel,
ModernNCAModel,
BoostedDPDTModel,
]

for _model_cls in _models_to_add:
Empty file added tabrepo/models/dpdt/__init__.py
Empty file.
97 changes: 97 additions & 0 deletions tabrepo/models/dpdt/generate.py
@@ -0,0 +1,97 @@
from __future__ import annotations

import numpy as np

from tabrepo.benchmark.models.ag.dpdt.dpdt_model import BoostedDPDTModel
from tabrepo.models.utils import convert_numpy_dtypes
from tabrepo.utils.config_utils import CustomAGConfigGenerator


def generate_configs_bdpdt(num_random_configs=200):
# TODO: transform this to a ConfigSpace configuration space or similar
# TODO: and/or switch to better random seed logic

    # Sample `num_random_configs` learning rates from a log-normal distribution
# Parameters: mu = log(0.01), sigma = log(10.0)
np.random.seed(42) # For reproducibility
mu = float(np.log(0.01))
sigma = float(np.log(10.0))
samples = np.random.lognormal(mean=mu, sigma=sigma, size=num_random_configs)

    # Sample `num_random_configs` values for min_samples_leaf from a quantized
    # log-uniform (q_log_uniform_values) distribution
# Parameters: min=1.5, max=50.5, q=1
np.random.seed(43)
min_val = 1.5
max_val = 50.5
q = 1
# Generate log-uniform samples and quantize
log_min = np.log(min_val)
log_max = np.log(max_val)
log_uniform_samples = np.random.uniform(log_min, log_max, size=num_random_configs)
min_samples_leaf_samples = np.round(np.exp(log_uniform_samples) / q) * q
min_samples_leaf_samples = np.clip(
min_samples_leaf_samples, min_val, max_val
).astype(int)

    # Sample `num_random_configs` values for min_weight_fraction_leaf
# Values: [0.0, 0.01], probabilities: [0.95, 0.05]
np.random.seed(44)
min_weight_fraction_leaf_samples = np.random.choice(
[0.0, 0.01], size=num_random_configs, p=[0.95, 0.05]
)

    # Sample `num_random_configs` values for max_features
# Values: ["sqrt", "log2", 10000], probabilities: [0.5, 0.25, 0.25]
np.random.seed(45)
max_features_samples = np.random.choice(
["sqrt", "log2", 10000], size=num_random_configs, p=[0.5, 0.25, 0.25]
)

np.random.seed(46)
max_depth_samples = np.random.choice([2, 3], size=num_random_configs, p=[0.4, 0.6])

np.random.seed(47)
min_samples_split = np.random.choice(
[2, 3], size=num_random_configs, p=[0.95, 0.05]
)

np.random.seed(48)
min_impurity_decrease_samples = np.random.choice(
[0, 0.01, 0.02, 0.05], size=num_random_configs, p=[0.85, 0.05, 0.05, 0.05]
)

np.random.seed(49)
choices = [[8, 4], [4, 8], [16, 2], [4, 4, 2]]
indices = np.random.choice(len(choices), size=num_random_configs)
cart_nodes_list = [choices[i] for i in indices]

configs = []
for i in range(num_random_configs):
        try:
            # np.random.choice returns a string array here, so 10000 arrives
            # as "10000"; convert numeric strings back to int and keep
            # "sqrt"/"log2" as-is.
            max_features = int(max_features_samples[i])
        except ValueError:
            max_features = max_features_samples[i]
config = {
"learning_rate": samples[i],
"max_depth": max_depth_samples[i],
"min_samples_split": min_samples_split[i],
"min_impurity_decrease": min_impurity_decrease_samples[i],
"cart_nodes_list": cart_nodes_list[i],
"min_samples_leaf": min_samples_leaf_samples[i],
"min_weight_fraction_leaf": min_weight_fraction_leaf_samples[i],
"max_features": max_features,
}
configs.append(config)

return [convert_numpy_dtypes(config) for config in configs]


gen_boosteddpdt = CustomAGConfigGenerator(
model_cls=BoostedDPDTModel,
search_space_func=generate_configs_bdpdt,
manual_configs=[{}],
)


if __name__ == "__main__":
print(generate_configs_bdpdt(3))
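
As a sanity check on the learning-rate prior: a log-normal with mu = log(0.01) and sigma = log(10.0) has median exp(mu) = 0.01, and one sigma corresponds to a multiplicative factor of 10, so roughly 68% of sampled learning rates fall in [0.001, 0.1]. A quick verification sketch:

    import numpy as np

    # Illustrative seed; generate.py above seeds np.random directly instead.
    rng = np.random.default_rng(0)
    s = rng.lognormal(mean=np.log(0.01), sigma=np.log(10.0), size=100_000)
    print(np.median(s))                      # ~0.01
    print(np.mean((s > 0.001) & (s < 0.1)))  # ~0.68
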
1 change: 1 addition & 0 deletions tabrepo/models/utils.py
@@ -46,6 +46,7 @@ def get_configs_generator_from_name(model_name: str):
# "TabPFN": lambda: importlib.import_module("tabrepo.models.tabpfn.generate").gen_tabpfn, # not supported in TabArena
"TabPFNv2": lambda: importlib.import_module("tabrepo.models.tabpfnv2.generate").gen_tabpfnv2,
"XGBoost": lambda: importlib.import_module("tabrepo.models.xgboost.generate").gen_xgboost,
"BoostedDPDT": lambda: importlib.import_module("tabrepo.models.dpdt.generate").gen_boosteddpdt,
}

if model_name not in name_to_import_map:
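
With this entry in place, the generator should be resolvable by name. A hypothetical lookup (the exact return value depends on how get_configs_generator_from_name unwraps the lambda in the map above):

    from tabrepo.models.utils import get_configs_generator_from_name

    # Expected to resolve to the gen_boosteddpdt generator defined in
    # tabrepo/models/dpdt/generate.py.
    gen = get_configs_generator_from_name("BoostedDPDT")
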
17 changes: 17 additions & 0 deletions tst/benchmark/models/test_dpdt.py
@@ -0,0 +1,17 @@
import pytest


def test_dpdt():
    model_hyperparameters = {"n_estimators": 2, "cart_nodes_list": (4, 3)}

try:
from autogluon.tabular.testing import FitHelper
from tabrepo.benchmark.models.ag import BoostedDPDTModel
model_cls = BoostedDPDTModel
FitHelper.verify_model(model_cls=model_cls, model_hyperparameters=model_hyperparameters)
    except ImportError as err:
        pytest.skip(
            "ImportError raised, skipping test... "
            "Ensure you have the proper dependencies installed to run this test:\n"
            f"{err}"
        )