6 changes: 6 additions & 0 deletions setup.py
@@ -41,6 +41,11 @@
"modernnca": [
"category_encoders",
],
"dpdt": [
# TODO: pypi package is not available yet
"git+https://github.com/KohlerHECTOR/DPDTreeEstimator.git",
# used hash: a74791d2190da27b43accd4da9e7d141380326ea
],
}

benchmark_requires = []
@@ -53,6 +58,7 @@
"tabdpt",
"tabm",
"modernnca",
"dpdt",
]:
benchmark_requires += extras_require[extra_package]
benchmark_requires = list(set(benchmark_requires))
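
Note: a bare VCS URL is not a valid PEP 508 requirement string, so pip will reject this extra as written; the usual form is a `name @ url` direct reference. A minimal sketch of the pinned entry, assuming the distribution installed by DPDTreeEstimator is named `dpdt` (matching `from dpdt import AdaBoostDPDT` in dpdt_model.py below):

    # Sketch only: the distribution name "dpdt" is an assumption; the trailing
    # @<hash> pins the install to the commit recorded in the comment above.
    "dpdt": [
        "dpdt @ git+https://github.com/KohlerHECTOR/DPDTreeEstimator.git"
        "@a74791d2190da27b43accd4da9e7d141380326ea",
    ],
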
2 changes: 2 additions & 0 deletions tabrepo/benchmark/models/ag/__init__.py
@@ -8,6 +8,7 @@
from tabrepo.benchmark.models.ag.tabm.tabm_model import TabMModel
from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_client_model import TabPFNV2ClientModel
from tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_model import TabPFNV2Model
from tabrepo.benchmark.models.ag.dpdt.dpdt_model import BoostedDPDTModel

__all__ = [
"ExplainableBoostingMachineModel",
@@ -18,4 +19,5 @@
"TabMModel",
"TabPFNV2ClientModel",
"TabPFNV2Model",
"BoostedDPDTModel"
]
Empty file added tabrepo/benchmark/models/ag/dpdt/__init__.py
Empty file.
161 changes: 161 additions & 0 deletions tabrepo/benchmark/models/ag/dpdt/dpdt_model.py
@@ -0,0 +1,161 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
from autogluon.common.utils.resource_utils import ResourceManager
from autogluon.core.models import AbstractModel

if TYPE_CHECKING:
import pandas as pd


def _to_cat(X):
    # Module-level (picklable) identity transform used as a named pipeline
    # step; the data is returned unchanged.
    return X


class BoostedDPDTModel(AbstractModel):
ag_key = "BOOSTEDDPDT"
ag_name = "boosted_dpdt"

def __init__(self, **kwargs):
super().__init__(**kwargs)
self._preprocessor = None

def get_model_cls(self):
from dpdt import AdaBoostDPDT

if self.problem_type in ["binary", "multiclass"]:
model_cls = AdaBoostDPDT
else:
raise AssertionError(f"Unsupported problem_type: {self.problem_type}")
return model_cls

def _preprocess(self, X, **kwargs):
X = super()._preprocess(X, **kwargs)
if self._preprocessor is None:
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

categorical_pipeline = Pipeline(
[
(
"encoder",
OrdinalEncoder(
handle_unknown="use_encoded_value", unknown_value=-1
),
),
(
"imputer",
SimpleImputer(
strategy="constant", add_indicator=True, fill_value=-1
),
),
(
"to_category",
FunctionTransformer(_to_cat),
),
]
).set_output(transform="pandas")

self._preprocessor = ColumnTransformer(
transformers=[
(
"num",
SimpleImputer(strategy="mean", add_indicator=True),
make_column_selector(dtype_include=np.number),
),
(
"cat",
categorical_pipeline,
make_column_selector(dtype_include=["object", "category"]),
),
],
remainder="passthrough",
).set_output(transform="pandas")
self._preprocessor.fit(X)

return self._preprocessor.transform(X)

def _fit(
self,
X: pd.DataFrame,
y: pd.Series,
num_cpus: int = 1,
time_limit: float | None = None,
**kwargs,
):
model_cls = self.get_model_cls()
hyp = self._get_model_params()

self.model = model_cls(
**hyp,
n_jobs="best" if num_cpus > 1 else num_cpus,
time_limit=time_limit,
)
X = self.preprocess(X)
self.model = self.model.fit(
X=X,
y=y,
)

def _set_default_params(self):
default_params = {
"random_state": 42,
"n_estimators": 1000,
}
for param, val in default_params.items():
self._set_default_param_value(param, val)

@classmethod
def supported_problem_types(cls) -> list[str] | None:
return ["binary", "multiclass"]

def _get_default_resources(self) -> tuple[int, int]:
# logical=False is faster in training
num_cpus = ResourceManager.get_cpu_count_psutil(logical=False)
num_gpus = 0
return num_cpus, num_gpus

def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
hyperparameters = self._get_model_params()
return self.estimate_memory_usage_static(
X=X,
problem_type=self.problem_type,
num_classes=self.num_classes,
hyperparameters=hyperparameters,
**kwargs,
)

@classmethod
def _estimate_memory_usage_static(
cls,
*,
X: pd.DataFrame,
hyperparameters: dict | None = None,
**kwargs,
) -> int:
        # TODO: add a callback that stops when running out of memory.
        if hyperparameters is None:
            hyperparameters = {}

        # Rough heuristic: scale the frame's memory footprint by the first two
        # entries of `cart_nodes_list`. Fetch the list once so a single-entry
        # list cannot raise an IndexError; defaults match the prior behavior.
        cart_nodes_list = hyperparameters.get("cart_nodes_list", [2.5, 1])
        branching_factor = cart_nodes_list[0] * (
            cart_nodes_list[1] if len(cart_nodes_list) > 1 else 1
        )
        dataset_size_mem_est = (
            40 * branching_factor * get_approximate_df_mem_usage(X).sum()
        )
baseline_overhead_mem_est = 3e8 # 300 MB generic overhead

return int(dataset_size_mem_est + baseline_overhead_mem_est)

@classmethod
def _class_tags(cls):
return {"can_estimate_memory_usage_static": True}

def _more_tags(self) -> dict:
"""DPDT does not yet support refit full."""
return {"can_refit_full": False}
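
For reference, the preprocessing that `_preprocess` builds can be exercised on its own. A minimal sketch on an illustrative two-column frame (only pandas and scikit-learn needed; column names are made up, and the identity `to_category` step is omitted since `_to_cat` returns its input unchanged):

    import numpy as np
    import pandas as pd
    from sklearn.compose import ColumnTransformer, make_column_selector
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OrdinalEncoder

    X = pd.DataFrame({"age": [25.0, np.nan, 40.0], "color": ["red", np.nan, "blue"]})

    categorical = Pipeline(
        [
            ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
            ("imputer", SimpleImputer(strategy="constant", add_indicator=True, fill_value=-1)),
        ]
    ).set_output(transform="pandas")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="mean", add_indicator=True),
             make_column_selector(dtype_include=np.number)),
            ("cat", categorical, make_column_selector(dtype_include=["object", "category"])),
        ],
        remainder="passthrough",
    ).set_output(transform="pandas")

    # Numeric NaNs become the column mean plus a missing indicator; categorical
    # NaNs are ordinal-encoded, then imputed to -1 with their own indicator.
    print(preprocessor.fit_transform(X))
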
2 changes: 2 additions & 0 deletions tabrepo/benchmark/models/model_register.py
@@ -13,6 +13,7 @@
TabMModel,
TabPFNV2ClientModel,
TabPFNV2Model,
BoostedDPDTModel,
)

tabrepo_model_register: ModelRegistry = copy.deepcopy(ag_model_registry)
@@ -26,6 +27,7 @@
TabDPTModel,
TabMModel,
ModernNCAModel,
BoostedDPDTModel,
]

for _model_cls in _models_to_add:
Empty file added tabrepo/models/dpdt/__init__.py
Empty file.
97 changes: 97 additions & 0 deletions tabrepo/models/dpdt/generate.py
@@ -0,0 +1,97 @@
from __future__ import annotations

import numpy as np

from tabrepo.benchmark.models.ag.dpdt.dpdt_model import BoostedDPDTModel
from tabrepo.models.utils import convert_numpy_dtypes
from tabrepo.utils.config_utils import CustomAGConfigGenerator


def generate_configs_bdpdt(num_random_configs=200):
# TODO: transform this to a ConfigSpace configuration space or similar
# TODO: and/or switch to better random seed logic

    # Sample `num_random_configs` learning rates from a log-normal distribution
# Parameters: mu = log(0.01), sigma = log(10.0)
np.random.seed(42) # For reproducibility
mu = float(np.log(0.01))
sigma = float(np.log(10.0))
samples = np.random.lognormal(mean=mu, sigma=sigma, size=num_random_configs)

    # Sample `num_random_configs` values for min_samples_leaf from a quantized
    # log-uniform (q_log_uniform_values) distribution
# Parameters: min=1.5, max=50.5, q=1
np.random.seed(43)
min_val = 1.5
max_val = 50.5
q = 1
# Generate log-uniform samples and quantize
log_min = np.log(min_val)
log_max = np.log(max_val)
log_uniform_samples = np.random.uniform(log_min, log_max, size=num_random_configs)
min_samples_leaf_samples = np.round(np.exp(log_uniform_samples) / q) * q
min_samples_leaf_samples = np.clip(
min_samples_leaf_samples, min_val, max_val
).astype(int)

    # Sample `num_random_configs` values for min_weight_fraction_leaf
# Values: [0.0, 0.01], probabilities: [0.95, 0.05]
np.random.seed(44)
min_weight_fraction_leaf_samples = np.random.choice(
[0.0, 0.01], size=num_random_configs, p=[0.95, 0.05]
)

    # Sample `num_random_configs` values for max_features
# Values: ["sqrt", "log2", 10000], probabilities: [0.5, 0.25, 0.25]
np.random.seed(45)
max_features_samples = np.random.choice(
["sqrt", "log2", 10000], size=num_random_configs, p=[0.5, 0.25, 0.25]
)

np.random.seed(46)
max_depth_samples = np.random.choice([2, 3], size=num_random_configs, p=[0.4, 0.6])

np.random.seed(47)
min_samples_split = np.random.choice(
[2, 3], size=num_random_configs, p=[0.95, 0.05]
)

np.random.seed(48)
min_impurity_decrease_samples = np.random.choice(
[0, 0.01, 0.02, 0.05], size=num_random_configs, p=[0.85, 0.05, 0.05, 0.05]
)

np.random.seed(49)
choices = [[8, 4], [4, 8], [16, 2], [4, 4, 2]]
indices = np.random.choice(len(choices), size=num_random_configs)
cart_nodes_list = [choices[i] for i in indices]

configs = []
for i in range(num_random_configs):
        try:
            # np.random.choice returns a string array here, so 10000 arrives
            # as "10000"; convert numeric strings back to int and keep
            # "sqrt"/"log2" as-is.
            max_features = int(max_features_samples[i])
        except ValueError:
            max_features = max_features_samples[i]
config = {
"learning_rate": samples[i],
"max_depth": max_depth_samples[i],
"min_samples_split": min_samples_split[i],
"min_impurity_decrease": min_impurity_decrease_samples[i],
"cart_nodes_list": cart_nodes_list[i],
"min_samples_leaf": min_samples_leaf_samples[i],
"min_weight_fraction_leaf": min_weight_fraction_leaf_samples[i],
"max_features": max_features,
}
configs.append(config)

return [convert_numpy_dtypes(config) for config in configs]


gen_boosteddpdt = CustomAGConfigGenerator(
model_cls=BoostedDPDTModel,
search_space_func=generate_configs_bdpdt,
manual_configs=[{}],
)


if __name__ == "__main__":
print(generate_configs_bdpdt(3))
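
As a sanity check on the learning-rate prior: a log-normal with mu = log(0.01) and sigma = log(10.0) has median exp(mu) = 0.01, and one sigma corresponds to a multiplicative factor of 10, so roughly 68% of sampled learning rates fall in [0.001, 0.1]. A quick verification sketch:

    import numpy as np

    # Illustrative seed; generate.py above seeds np.random directly instead.
    rng = np.random.default_rng(0)
    s = rng.lognormal(mean=np.log(0.01), sigma=np.log(10.0), size=100_000)
    print(np.median(s))                      # ~0.01
    print(np.mean((s > 0.001) & (s < 0.1)))  # ~0.68
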
1 change: 1 addition & 0 deletions tabrepo/models/utils.py
@@ -46,6 +46,7 @@ def get_configs_generator_from_name(model_name: str):
# "TabPFN": lambda: importlib.import_module("tabrepo.models.tabpfn.generate").gen_tabpfn, # not supported in TabArena
"TabPFNv2": lambda: importlib.import_module("tabrepo.models.tabpfnv2.generate").gen_tabpfnv2,
"XGBoost": lambda: importlib.import_module("tabrepo.models.xgboost.generate").gen_xgboost,
"BoostedDPDT": lambda: importlib.import_module("tabrepo.models.dpdt.generate").gen_boosteddpdt,
}

if model_name not in name_to_import_map:
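
With this entry in place, the generator should be resolvable by name. A hypothetical lookup (the exact return value depends on how get_configs_generator_from_name unwraps the lambda in the map above):

    from tabrepo.models.utils import get_configs_generator_from_name

    # Expected to resolve to the gen_boosteddpdt generator defined in
    # tabrepo/models/dpdt/generate.py.
    gen = get_configs_generator_from_name("BoostedDPDT")
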
17 changes: 17 additions & 0 deletions tst/benchmark/models/test_dpdt.py
@@ -0,0 +1,17 @@
import pytest


def test_dpdt():
    model_hyperparameters = {"n_estimators": 2, "cart_nodes_list": (4, 3)}

try:
from autogluon.tabular.testing import FitHelper
from tabrepo.benchmark.models.ag import BoostedDPDTModel
model_cls = BoostedDPDTModel
FitHelper.verify_model(model_cls=model_cls, model_hyperparameters=model_hyperparameters)
    except ImportError as err:
        pytest.skip(
            "ImportError raised, skipping test... "
            "Ensure you have the proper dependencies installed to run this test:\n"
            f"{err}"
        )