From 117339dfbcb280674467dc7a889907f061330aa3 Mon Sep 17 00:00:00 2001 From: Alex Labach Date: Thu, 18 Sep 2025 15:24:59 -0400 Subject: [PATCH 1/8] Update TabDPT to v1.1, add search space, and improve hyperparameters Additional changes: - Load TabDPT from PyPI instead of Github - Remove workaround for batch size 1 - Use new default download method (huggingface-hub) instead of custom caching code --- scripts/run_generate_all_configs.py | 2 +- setup.py | 6 +- .../models/ag/tabdpt/tabdpt_model.py | 89 ++----------------- tabrepo/models/tabdpt/generate.py | 8 +- 4 files changed, 15 insertions(+), 90 deletions(-) diff --git a/scripts/run_generate_all_configs.py b/scripts/run_generate_all_configs.py index be8011b6..c08f5453 100644 --- a/scripts/run_generate_all_configs.py +++ b/scripts/run_generate_all_configs.py @@ -44,7 +44,7 @@ experiments_tabicl = gen_tabicl.generate_all_bag_experiments(num_random_configs=0) experiments_tabpfnv2 = gen_tabpfnv2.generate_all_bag_experiments(num_random_configs=n_random_configs) - experiments_tabdpt = gen_tabdpt.generate_all_bag_experiments(num_random_configs=0) + experiments_tabdpt = gen_tabdpt.generate_all_bag_experiments(num_random_configs=n_random_configs) experiments_modernnca = gen_modernnca.generate_all_bag_experiments(num_random_configs=n_random_configs) # Dummy (constant predictor) diff --git a/setup.py b/setup.py index 933dcb7e..98b9597d 100644 --- a/setup.py +++ b/setup.py @@ -30,11 +30,7 @@ "pytabkit>=1.5.0,<2.0", ], "tabdpt": [ - # TODO: pypi package is not available yet - # FIXME: newest version (1.1) has (unnecessary) strict version requirements - # that are not compatible with autogluon, so we stick to the old hash - "tabdpt @ git+https://github.com/layer6ai-labs/TabDPT.git@9699d9592b61c5f70fc88f5531cdb87b40cbedf5", - # used hash: 9699d9592b61c5f70fc88f5531cdb87b40cbedf5 + "tabdpt>=1.1.4" ], "tabm": [ "torch", diff --git a/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py 
b/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py index 3b6eb0fc..c9b10720 100644 --- a/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py +++ b/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py @@ -1,11 +1,6 @@ from __future__ import annotations import math -import os -import shutil -import sys -import warnings -from pathlib import Path from typing import TYPE_CHECKING from autogluon.common.utils.resource_utils import ResourceManager @@ -49,14 +44,15 @@ def _fit( model_cls = TabDPTClassifier if self.problem_type in [BINARY, MULTICLASS] else TabDPTRegressor hps = self._get_model_params() - self._predict_hps = dict(seed=42, context_size=1024) + self._predict_hps = { + k:v for k,v in hps.items() if k in ('context_size', 'permute_classes', 'temperature') + } + self._predict_hps['seed'] = 42 X = self.preprocess(X) y = y.to_numpy() self.model = model_cls( - path=self._download_and_get_model_path(), device=device, - use_flash=self._use_flash(), - **hps, + use_flash=self._use_flash() ) self.model.fit(X=X, y=y) @@ -76,23 +72,6 @@ def _use_flash() -> bool: return True - @staticmethod - def _download_and_get_model_path() -> str: - # We follow TabPFN-logic for model caching as /tmp is not a persistent cache location. 
- from tabdpt.estimator import TabDPTEstimator - from tabdpt.utils import download_model - - model_dir = _user_cache_dir(platform=sys.platform, appname="tabdpt") - model_dir.mkdir(exist_ok=True, parents=True) - - final_model_path = model_dir / Path(TabDPTEstimator._DEFAULT_CHECKPOINT_PATH).name - - if not final_model_path.exists(): - model_path = Path(download_model()) # downloads to /tmp - shutil.copy(model_path, final_model_path) # copy to user cache dir - - return str(final_model_path) - def _get_default_resources(self) -> tuple[int, int]: # Use only physical cores for better performance based on benchmarks num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True) @@ -110,17 +89,10 @@ def get_minimum_resources(self, is_gpu_available: bool = False) -> dict[str, int def _predict_proba(self, X, **kwargs) -> np.ndarray: X = self.preprocess(X, **kwargs) - # Fix bug in TabDPt where batches of length 1 crash the prediction. - # - We set the inference size such that there are no batches of length 1. 
- math.ceil(len(X) / self.model.inf_batch_size) - last_batch_size = len(X) % self.model.inf_batch_size - if last_batch_size == 1: - self.model.inf_batch_size += 1 - if self.problem_type in [REGRESSION]: return self.model.predict(X, **self._predict_hps) - y_pred_proba = self.model.predict_proba(X, **self._predict_hps) + y_pred_proba = self.model.ensemble_predict_proba(X, **self._predict_hps) return self._convert_proba_to_unified_form(y_pred_proba) def _preprocess(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame: @@ -150,52 +122,3 @@ def _get_default_ag_args_ensemble(cls, **kwargs) -> dict: } default_ag_args_ensemble.update(extra_ag_args_ensemble) return default_ag_args_ensemble - -# Vendored from TabPFNv2 Code -def _user_cache_dir(platform: str, appname: str = "tabpfn") -> Path: - use_instead_path = (Path.cwd() / ".tabpfn_models").resolve() - - # https://docs.python.org/3/library/sys.html#sys.platform - if platform == "win32": - # Honestly, I don't want to do what `platformdirs` does: - # https://github.com/tox-dev/platformdirs/blob/b769439b2a3b70769a93905944a71b3e63ef4823/src/platformdirs/windows.py#L252-L265 - APPDATA_PATH = os.environ.get("APPDATA", "") - if APPDATA_PATH.strip() != "": - return Path(APPDATA_PATH) / appname - - warnings.warn( - "Could not find APPDATA environment variable to get user cache dir," - " but detected platform 'win32'." - f" Defaulting to a path '{use_instead_path}'." - " If you would prefer, please specify a directory when creating" - " the model.", - UserWarning, - stacklevel=2, - ) - return use_instead_path - - if platform == "darwin": - return Path.home() / "Library" / "Caches" / appname - - # TODO: Not entirely sure here, Python doesn't explicitly list - # all of these and defaults to the underlying operating system - # if not sure. - linux_likes = ("freebsd", "linux", "netbsd", "openbsd") - if any(platform.startswith(linux) for linux in linux_likes): - # The reason to use "" as default is that the env var could exist but be empty. 
- # We catch all this with the `.strip() != ""` below - XDG_CACHE_HOME = os.environ.get("XDG_CACHE_HOME", "") - if XDG_CACHE_HOME.strip() != "": - return Path(XDG_CACHE_HOME) / appname - return Path.home() / ".cache" / appname - - warnings.warn( - f"Unknown platform '{platform}' to get user cache dir." - f" Defaulting to a path at the execution site '{use_instead_path}'." - " If you would prefer, please specify a directory when creating" - " the model.", - UserWarning, - stacklevel=2, - ) - return use_instead_path - diff --git a/tabrepo/models/tabdpt/generate.py b/tabrepo/models/tabdpt/generate.py index fc66043d..486a4bc6 100644 --- a/tabrepo/models/tabdpt/generate.py +++ b/tabrepo/models/tabdpt/generate.py @@ -1,5 +1,6 @@ from __future__ import annotations +from autogluon.common.space import Real, Categorical from tabrepo.benchmark.models.ag.tabdpt.tabdpt_model import TabDPTModel from tabrepo.utils.config_utils import ConfigGenerator @@ -8,9 +9,14 @@ # Default config with refit after cross-validation. 
{"ag_args_ensemble": {"refit_folds": True}}, ] +search_space = { + 'temperature': Real(0.05, 1.25, default=0.3), + 'context_size': Categorical(2048, 768, 256), + 'permute_classes': Categorical(True, False) +} gen_tabdpt = ConfigGenerator( - model_cls=TabDPTModel, manual_configs=manual_configs, search_space={} + model_cls=TabDPTModel, manual_configs=manual_configs, search_space=search_space ) if __name__ == "__main__": From 07ae91a602b6f5039cf2a93dacb0b34033c11875 Mon Sep 17 00:00:00 2001 From: Alex Labach Date: Fri, 26 Sep 2025 17:18:20 -0400 Subject: [PATCH 2/8] Update temperature ranges after v1.1.5 softmax fix --- tabrepo/models/tabdpt/generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tabrepo/models/tabdpt/generate.py b/tabrepo/models/tabdpt/generate.py index 486a4bc6..9a4cde89 100644 --- a/tabrepo/models/tabdpt/generate.py +++ b/tabrepo/models/tabdpt/generate.py @@ -10,7 +10,7 @@ {"ag_args_ensemble": {"refit_folds": True}}, ] search_space = { - 'temperature': Real(0.05, 1.25, default=0.3), + 'temperature': Real(0.05, 1.5, default=0.8), 'context_size': Categorical(2048, 768, 256), 'permute_classes': Categorical(True, False) } From 925fa30b8c227dd627e48ed2a1c95608acb76a5f Mon Sep 17 00:00:00 2001 From: Alex Labach Date: Sun, 28 Sep 2025 19:42:51 -0400 Subject: [PATCH 3/8] Bump TabDPT version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 98b9597d..b296fb16 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ "pytabkit>=1.5.0,<2.0", ], "tabdpt": [ - "tabdpt>=1.1.4" + "tabdpt>=1.1.5" ], "tabm": [ "torch", From 1feef81ad091bc14a29c6cfd01a5a8fd2794c920 Mon Sep 17 00:00:00 2001 From: Alex Labach Date: Sun, 28 Sep 2025 19:44:27 -0400 Subject: [PATCH 4/8] Fix supported hp filter for TabDPTRegressor --- tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py 
b/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py index c9b10720..15dfca9d 100644 --- a/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py +++ b/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py @@ -42,10 +42,13 @@ def _fit( from tabdpt import TabDPTClassifier, TabDPTRegressor model_cls = TabDPTClassifier if self.problem_type in [BINARY, MULTICLASS] else TabDPTRegressor + supported_hps = ('context_size', 'permute_classes', 'temperature') \ + if model_cls is TabDPTClassifier \ + else ('context_size',) hps = self._get_model_params() self._predict_hps = { - k:v for k,v in hps.items() if k in ('context_size', 'permute_classes', 'temperature') + k:v for k,v in hps.items() if k in supported_hps } self._predict_hps['seed'] = 42 X = self.preprocess(X) From 4f104046f7de2102bee79710f620620f2c24aadb Mon Sep 17 00:00:00 2001 From: Alex Labach Date: Fri, 10 Oct 2025 17:58:36 -0400 Subject: [PATCH 5/8] Add more tabdpt hyperparameters, update version --- setup.py | 2 +- tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py | 11 ++++++++--- tabrepo/models/tabdpt/generate.py | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 1008548e..0f5232cc 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ "pytabkit>=1.5.0,<2.0", ], "tabdpt": [ - "tabdpt>=1.1.5" + "tabdpt>=1.1.6" ], "tabm": [ "torch", diff --git a/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py b/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py index 15dfca9d..ed1e1db4 100644 --- a/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py +++ b/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py @@ -42,20 +42,25 @@ def _fit( from tabdpt import TabDPTClassifier, TabDPTRegressor model_cls = TabDPTClassifier if self.problem_type in [BINARY, MULTICLASS] else TabDPTRegressor - supported_hps = ('context_size', 'permute_classes', 'temperature') \ + supported_predict_hps = ('context_size', 'permute_classes', 'temperature') \ if model_cls is TabDPTClassifier \ else ('context_size',) 
hps = self._get_model_params() self._predict_hps = { - k:v for k,v in hps.items() if k in supported_hps + k:v for k,v in hps.items() if k in supported_predict_hps } self._predict_hps['seed'] = 42 X = self.preprocess(X) y = y.to_numpy() self.model = model_cls( device=device, - use_flash=self._use_flash() + use_flash=self._use_flash(), + normalizer=hps.get("normalizer", "standard"), + missing_indicators=hps.get("missing_indicators", False), + clip_sigma=hps.get("clip_sigma", 4), + feature_reduction=hps.get("feature_reduction", "pca"), + faiss_metric=hps.get("faiss_metric", "l2") ) self.model.fit(X=X, y=y) diff --git a/tabrepo/models/tabdpt/generate.py b/tabrepo/models/tabdpt/generate.py index 9a4cde89..b5bf9565 100644 --- a/tabrepo/models/tabdpt/generate.py +++ b/tabrepo/models/tabdpt/generate.py @@ -10,9 +10,14 @@ {"ag_args_ensemble": {"refit_folds": True}}, ] search_space = { - 'temperature': Real(0.05, 1.5, default=0.8), + 'temperature': Categorical(0.8, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0, 1.25, 1.5), 'context_size': Categorical(2048, 768, 256), - 'permute_classes': Categorical(True, False) + 'permute_classes': Categorical(True, False), + 'normalizer': Categorical("standard", None, "minmax", "robust", "power", "quantile-uniform", "quantile-normal", "log1p"), + 'missing_indicators': Categorical(False, True), + 'clip_sigma': Categorical(4, 2, 6, 8), + 'feature_reduction': Categorical("pca", "subsample"), + 'faiss_metric': Categorical("l2", "ip") } gen_tabdpt = ConfigGenerator( From 4346c6c1c21194c2c66c8c262b4766b639004755 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Thu, 16 Oct 2025 11:00:23 +0200 Subject: [PATCH 6/8] maint: linting and formatting add: new random seed logic for TabDPT --- .../models/ag/tabdpt/tabdpt_model.py | 42 ++++++++++++------- tabrepo/models/tabdpt/generate.py | 36 +++++++++------- tst/benchmark/models/test_tabdpt.py | 8 +++- 3 files changed, 54 insertions(+), 32 deletions(-) diff --git
a/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py b/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py index ed1e1db4..9ec6e156 100644 --- a/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py +++ b/tabrepo/benchmark/models/ag/tabdpt/tabdpt_model.py @@ -1,6 +1,5 @@ from __future__ import annotations -import math from typing import TYPE_CHECKING from autogluon.common.utils.resource_utils import ResourceManager @@ -41,16 +40,20 @@ def _fit( ) from tabdpt import TabDPTClassifier, TabDPTRegressor - model_cls = TabDPTClassifier if self.problem_type in [BINARY, MULTICLASS] else TabDPTRegressor - supported_predict_hps = ('context_size', 'permute_classes', 'temperature') \ - if model_cls is TabDPTClassifier \ - else ('context_size',) + model_cls = ( + TabDPTClassifier + if self.problem_type in [BINARY, MULTICLASS] + else TabDPTRegressor + ) + supported_predict_hps = ( + ("context_size", "permute_classes", "temperature") + if model_cls is TabDPTClassifier + else ("context_size",) + ) hps = self._get_model_params() - self._predict_hps = { - k:v for k,v in hps.items() if k in supported_predict_hps - } - self._predict_hps['seed'] = 42 + self._predict_hps = {k: v for k, v in hps.items() if k in supported_predict_hps} + self._predict_hps["seed"] = self.random_seed X = self.preprocess(X) y = y.to_numpy() self.model = model_cls( @@ -60,10 +63,16 @@ def _fit( missing_indicators=hps.get("missing_indicators", False), clip_sigma=hps.get("clip_sigma", 4), feature_reduction=hps.get("feature_reduction", "pca"), - faiss_metric=hps.get("faiss_metric", "l2") + faiss_metric=hps.get("faiss_metric", "l2"), ) self.model.fit(X=X, y=y) + def _get_random_seed_from_hyperparameters( + self, hyperparameters: dict + ) -> int | None | str: + return hyperparameters.get("seed", "N/A") + + @staticmethod def _use_flash() -> bool: """Detect if torch's native flash attention is available on the current machine.""" @@ -75,10 +84,7 @@ def _use_flash() -> bool: device = torch.device("cuda:0") capability = 
torch.cuda.get_device_capability(device) - if capability == (7, 5): - return False - - return True + return capability != (7, 5) def _get_default_resources(self) -> tuple[int, int]: # Use only physical cores for better performance based on benchmarks @@ -88,7 +94,9 @@ def _get_default_resources(self) -> tuple[int, int]: return num_cpus, num_gpus - def get_minimum_resources(self, is_gpu_available: bool = False) -> dict[str, int | float]: + def get_minimum_resources( + self, is_gpu_available: bool = False + ) -> dict[str, int | float]: return { "num_cpus": 1, "num_gpus": 1 if is_gpu_available else 0, @@ -111,7 +119,9 @@ def _preprocess(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame: self._feature_generator.fit(X=X) if self._feature_generator.features_in: X = X.copy() - X[self._feature_generator.features_in] = self._feature_generator.transform(X=X) + X[self._feature_generator.features_in] = self._feature_generator.transform( + X=X + ) return X.to_numpy() @classmethod diff --git a/tabrepo/models/tabdpt/generate.py b/tabrepo/models/tabdpt/generate.py index b5bf9565..cc0cf3fc 100644 --- a/tabrepo/models/tabdpt/generate.py +++ b/tabrepo/models/tabdpt/generate.py @@ -1,27 +1,35 @@ from __future__ import annotations -from autogluon.common.space import Real, Categorical +from autogluon.common.space import Categorical + from tabrepo.benchmark.models.ag.tabdpt.tabdpt_model import TabDPTModel from tabrepo.utils.config_utils import ConfigGenerator name = "TabDPT" -manual_configs = [ - # Default config with refit after cross-validation. 
- {"ag_args_ensemble": {"refit_folds": True}}, -] search_space = { - 'temperature': Categorical(0.8, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0, 1.25, 1.5), - 'context_size': Categorical(2048, 768, 256), - 'permute_classes': Categorical(True, False), - 'normalizer': Categorical("standard", None, "minmax", "robust", "power", "quantile-uniform", "quantile-normal", "log1p"), - 'missing_indicators': Categorical(False, True), - 'clip_sigma': Categorical(4, 2, 6, 8), - 'feature_reduction': Categorical("pca", "subsample"), - 'faiss_metric': Categorical("l2", "ip") + "temperature": Categorical( + 0.8, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 1.0, 1.25, 1.5 + ), + "context_size": Categorical(2048, 768, 256), + "permute_classes": Categorical(True, False), + "normalizer": Categorical( + "standard", + None, + "minmax", + "robust", + "power", + "quantile-uniform", + "quantile-normal", + "log1p", + ), + "missing_indicators": Categorical(False, True), + "clip_sigma": Categorical(4, 2, 6, 8), + "feature_reduction": Categorical("pca", "subsample"), + "faiss_metric": Categorical("l2", "ip"), } gen_tabdpt = ConfigGenerator( - model_cls=TabDPTModel, manual_configs=manual_configs, search_space=search_space + model_cls=TabDPTModel, manual_configs=[{}], search_space=search_space ) if __name__ == "__main__": diff --git a/tst/benchmark/models/test_tabdpt.py b/tst/benchmark/models/test_tabdpt.py index 890ca88d..12e7ee11 100644 --- a/tst/benchmark/models/test_tabdpt.py +++ b/tst/benchmark/models/test_tabdpt.py @@ -1,5 +1,6 @@ -import pytest +from __future__ import annotations +import pytest def test_tabdpt(): @@ -8,8 +9,11 @@ def test_tabdpt(): try: from autogluon.tabular.testing import FitHelper from tabrepo.benchmark.models.ag.tabdpt.tabdpt_model import TabDPTModel + model_cls = TabDPTModel - FitHelper.verify_model(model_cls=model_cls, model_hyperparameters=model_hyperparameters) + FitHelper.verify_model( + model_cls=model_cls, model_hyperparameters=model_hyperparameters + ) except 
ImportError as err: pytest.skip( f"Import Error, skipping test... " From b0a6da6066d3a627e786f542b0d2d45ca680d2c9 Mon Sep 17 00:00:00 2001 From: Alex Labach Date: Thu, 16 Oct 2025 15:53:22 -0400 Subject: [PATCH 7/8] Bump TabDPT version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a8776344..ef25ce51 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ "pytabkit>=1.5.0,<2.0", ], "tabdpt": [ - "tabdpt>=1.1.6" + "tabdpt>=1.1.7" ], "tabm": [ "torch", From c6b4af7d113c9b59f5089e83e500a4b18dae6e36 Mon Sep 17 00:00:00 2001 From: LennartPurucker Date: Mon, 27 Oct 2025 11:21:40 +0100 Subject: [PATCH 8/8] add: final state of tabdpt --- setup.py | 2 +- tabrepo/utils/config_utils.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a8776344..13a59573 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ "pytabkit>=1.5.0,<2.0", ], "tabdpt": [ - "tabdpt>=1.1.6" + "tabdpt>=1.1.6", ], "tabm": [ "torch", diff --git a/tabrepo/utils/config_utils.py b/tabrepo/utils/config_utils.py index 09dc8722..f5828ad2 100644 --- a/tabrepo/utils/config_utils.py +++ b/tabrepo/utils/config_utils.py @@ -98,6 +98,7 @@ def generate_all_bag_experiments( name_id_suffix: str = "", add_seed: Literal["static", "fold-wise", "fold-config-wise"] = "static", method_kwargs: dict | None = None, + **kwargs, ) -> list: """Generate experiments with bagging models for the search space. 
@@ -119,7 +120,7 @@ def generate_all_bag_experiments( runner by `method_kwargs=dict(init_kwargs=dict(path="./my_custom_path"))` """ configs = self.generate_all_configs_lst(num_random_configs=num_random_configs, name_id_suffix=name_id_suffix) - experiments = generate_bag_experiments(model_cls=self.model_cls, configs=configs, name_suffix_from_ag_args=True, add_seed=add_seed, method_kwargs=method_kwargs) + experiments = generate_bag_experiments(model_cls=self.model_cls, configs=configs, name_suffix_from_ag_args=True, add_seed=add_seed, method_kwargs=method_kwargs, **kwargs) return experiments