diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index 24fc6bbf1d..07ad9366a2 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -27,7 +27,7 @@ jobs: submodules: recursive - name: Setup Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: 3.8 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2b5d32a4f4..83510c5483 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -28,7 +28,7 @@ jobs: submodules: recursive - name: Setup Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: 3.8 diff --git a/.github/workflows/generate-baselines.yml b/.github/workflows/generate-baselines.yml index 337fdd269e..5149dd57d8 100644 --- a/.github/workflows/generate-baselines.yml +++ b/.github/workflows/generate-baselines.yml @@ -59,7 +59,7 @@ jobs: # value: The python version used by the installed system - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ steps.python-version.outputs.value }} @@ -109,7 +109,7 @@ jobs: # results_path: path to the benchmark results - name: Upload Results as Artifact - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: baselines path: | diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml index 2d28dd1eae..c7e5b94438 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -25,7 +25,7 @@ jobs: submodules: recursive - name: Setup Python 3.7 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: 3.7 diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index de29e860fc..794157f602 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -26,7 +26,7 @@ env: pytest-args: >- --forked --durations=20 - --timeout=300 + --timeout=600 --timeout-method=thread -s @@ -79,7 +79,7 @@ jobs: submodules: recursive - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ -150,7 +150,7 @@ jobs: - name: Upload coverage if: matrix.code-cov && always() - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3 with: fail_ci_if_error: true verbose: true diff --git a/.github/workflows/regressions.yml b/.github/workflows/regressions.yml index 70be44beff..8bb0addcf4 100644 --- a/.github/workflows/regressions.yml +++ b/.github/workflows/regressions.yml @@ -142,7 +142,7 @@ jobs: # value: The python version used by the installed system - name: Setup Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: ${{ steps.python-version.outputs.value }} @@ -206,7 +206,7 @@ jobs: # value: The python version used by the installed system - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ steps.python-version.outputs.value }} @@ -250,7 +250,7 @@ jobs: # - baseline_regression_x_x_x.csv - name: Download workflow artifacts - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: path: artifacts @@ -307,7 +307,7 @@ jobs: # compared_means: path to the results of regression test vs baseline - name: Upload all results together as an artifact - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: ${{ github.repository_owner }}_${{ steps.extract.outputs.branch }}_${{ github.sha }} 
path: | @@ -327,7 +327,7 @@ jobs: && github.event.action == 'labeled' && github.event.label.name == 'regression-tests' ) - uses: peter-evans/find-comment@v1 + uses: peter-evans/find-comment@v2 id: comment_finder with: issue-number: ${{ github.event.pull_request.number }} diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml index b4bb87fafd..5d24ae0627 100644 --- a/.github/workflows/stale.yaml +++ b/.github/workflows/stale.yaml @@ -9,7 +9,7 @@ jobs: stale: runs-on: ubuntu-latest steps: - - uses: actions/stale@v4 + - uses: actions/stale@v5 with: days-before-stale: 60 days-before-close: 7 diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 12e80b8e4e..278cd5c146 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -48,6 +48,7 @@ BaseShuffleSplit, _RepeatedSplits, ) +from sklearn.pipeline import Pipeline from sklearn.utils import check_random_state from sklearn.utils.validation import check_is_fitted from smac.callbacks import IncorporateRunResultCallback @@ -120,6 +121,7 @@ ) from autosklearn.util.parallel import preload_modules from autosklearn.util.single_thread_client import SingleThreadedClient +from autosklearn.util.smac_wrap import SMACCallback, SmacRunCallback from autosklearn.util.stopwatch import StopWatch import unittest.mock @@ -235,7 +237,7 @@ def __init__( logging_config: Optional[Mapping] = None, metrics: Sequence[Scorer] | None = None, scoring_functions: Optional[list[Scorer]] = None, - get_trials_callback: Optional[IncorporateRunResultCallback] = None, + get_trials_callback: SMACCallback | None = None, dataset_compression: bool | Mapping[str, Any] = True, allow_string_features: bool = True, ): @@ -243,7 +245,7 @@ def __init__( if isinstance(disable_evaluator_output, Iterable): disable_evaluator_output = list(disable_evaluator_output) # Incase iterator - allowed = set(["model", "cv_model", "y_optimization", "y_test", "y_valid"]) + allowed = set(["model", "cv_model", "y_optimization", "y_test"]) unknown = allowed - set(disable_evaluator_output) if any(unknown): raise ValueError( @@ -264,6 +266,15 @@ def __init__( memory_limit=memory_limit, ) + # If we got something callable for `get_trials_callback`, wrap it so SMAC + # will accept it. 
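# --- Illustrative aside, not part of the patch ----------------------------
# The SmacRunCallback wrapper applied below lives in the new
# autosklearn/util/smac_wrap.py, which is not shown in this diff. A minimal
# sketch of such a wrapper, assuming SMAC's IncorporateRunResultCallback
# interface (smbo, run_info, result, time_left) and that a False return
# value stops the optimization; the real implementation may differ:

from smac.callbacks import IncorporateRunResultCallback


class SmacRunCallback(IncorporateRunResultCallback):
    def __init__(self, f):
        # Plain callable supplied by the user via `get_trials_callback`
        self.f = f

    def __call__(self, smbo, run_info, result, time_left):
        # Forward to the user callable; its return value is handed back to
        # SMAC, which treats False as a request to stop the optimization.
        return self.f(smbo, run_info, result, time_left)
# ---------------------------------------------------------------------------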
+ if ( + get_trials_callback is not None + and callable(get_trials_callback) + and not isinstance(get_trials_callback, IncorporateRunResultCallback) + ): + get_trials_callback = SmacRunCallback(get_trials_callback) + self._delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit @@ -646,273 +657,282 @@ def fit( # By default try to use the TCP logging port or get a new port self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT - self._logger = self._get_logger(dataset_name) - # The first thing we have to do is create the logger to update the backend - self._backend.setup_logger(self._logger_port) + # Once we start the logging server, it starts in a new process + # If an error occurs then we want to make sure that we exit cleanly + # and shut it down, else it might hang + # https://github.com/automl/auto-sklearn/issues/1480 + try: + self._logger = self._get_logger(dataset_name) - if not only_return_configuration_space: - # If only querying the configuration space, we do not save the start time - # The start time internally checks for the fit() method to execute only once - # But this does not apply when only querying the configuration space - self._backend.save_start_time(self._seed) + # The first thing we have to do is create the logger to update the backend + self._backend.setup_logger(self._logger_port) - self._stopwatch = StopWatch() + if not only_return_configuration_space: + # If only querying the configuration space, we do not save the start + # time The start time internally checks for the fit() method to execute + # only once but this does not apply when only querying the configuration + # space + self._backend.save_start_time(self._seed) - # Make sure that input is valid - # Performs Ordinal one hot encoding to the target - # both for train and test data - self.InputValidator = InputValidator( - is_classification=is_classification, - feat_type=feat_type, - logger_port=self._logger_port, - allow_string_features=self.allow_string_features, - ) - self.InputValidator.fit(X_train=X, y_train=y, X_test=X_test, y_test=y_test) - X, y = self.InputValidator.transform(X, y) + self._stopwatch = StopWatch() - if X_test is not None and y_test is not None: - X_test, y_test = self.InputValidator.transform(X_test, y_test) + # Make sure that input is valid + # Performs Ordinal one hot encoding to the target + # both for train and test data + self.InputValidator = InputValidator( + is_classification=is_classification, + feat_type=feat_type, + logger_port=self._logger_port, + allow_string_features=self.allow_string_features, + ) + self.InputValidator.fit(X_train=X, y_train=y, X_test=X_test, y_test=y_test) + X, y = self.InputValidator.transform(X, y) - # We don't support size reduction on pandas type object yet - if ( - self._dataset_compression is not None - and not isinstance(X, pd.DataFrame) - and not (isinstance(y, pd.Series) or isinstance(y, pd.DataFrame)) - ): - methods = self._dataset_compression["methods"] - memory_allocation = self._dataset_compression["memory_allocation"] - - # Remove precision reduction if we can't perform it - if "precision" in methods and X.dtype not in supported_precision_reductions: - methods = [method for method in methods if method != "precision"] - - with warnings_to(self._logger): - X, y = reduce_dataset_size_if_too_large( - X=X, - y=y, - memory_limit=self._memory_limit, - is_classification=is_classification, - random_state=self._seed, - operations=methods, 
- memory_allocation=memory_allocation, - ) + if X_test is not None and y_test is not None: + X_test, y_test = self.InputValidator.transform(X_test, y_test) - # Check the re-sampling strategy - try: + # We don't support size reduction on pandas type object yet + if ( + self._dataset_compression is not None + and not isinstance(X, pd.DataFrame) + and not (isinstance(y, pd.Series) or isinstance(y, pd.DataFrame)) + ): + methods = self._dataset_compression["methods"] + memory_allocation = self._dataset_compression["memory_allocation"] + + # Remove precision reduction if we can't perform it + if ( + "precision" in methods + and X.dtype not in supported_precision_reductions + ): + methods = [method for method in methods if method != "precision"] + + with warnings_to(self._logger): + X, y = reduce_dataset_size_if_too_large( + X=X, + y=y, + memory_limit=self._memory_limit, + is_classification=is_classification, + random_state=self._seed, + operations=methods, + memory_allocation=memory_allocation, + ) + + # Check the re-sampling strategy self._check_resampling_strategy( X=X, y=y, task=self._task, ) - except Exception as e: - self._fit_cleanup() - raise e - # Reset learnt stuff - self.models_ = None - self.cv_models_ = None - self.ensemble_ = None - - # The metric must exist as of this point - # It can be provided in the constructor, or automatically - # defined in the estimator fit call - if isinstance(self._metrics, Sequence): - for entry in self._metrics: - if not isinstance(entry, Scorer): - raise ValueError( - "Metric {entry} must be instance of autosklearn.metrics.Scorer." - ) - else: - raise ValueError( - "Metric must be a sequence of instances of " - "autosklearn.metrics.Scorer." - ) - - # If no dask client was provided, we create one, so that we can - # start a ensemble process in parallel to smbo optimize - if self._dask_client is None and ( - self._ensemble_class is not None - or self._n_jobs is not None - and self._n_jobs > 1 - ): - self._create_dask_client() - else: - self._is_dask_client_internally_created = False - - self._dataset_name = dataset_name - self._stopwatch.start(self._dataset_name) + # Reset learnt stuff + self.models_ = None + self.cv_models_ = None + self.ensemble_ = None - # Take the feature types from the validator - self._feat_type = self.InputValidator.feature_validator.feat_type + # The metric must exist as of this point + # It can be provided in the constructor, or automatically + # defined in the estimator fit call + if isinstance(self._metrics, Sequence): + for entry in self._metrics: + if not isinstance(entry, Scorer): + raise ValueError( + f"Metric {entry} must be instance of" + " autosklearn.metrics.Scorer." + ) + else: + raise ValueError( + "Metric must be a sequence of instances of " + "autosklearn.metrics.Scorer." 
+ ) - self._log_fit_setup() + # If no dask client was provided, we create one, so that we can + # start a ensemble process in parallel to smbo optimize + if self._dask_client is None and ( + self._ensemble_class is not None + or self._n_jobs is not None + and self._n_jobs > 1 + ): + self._create_dask_client() + else: + self._is_dask_client_internally_created = False - # == Pickle the data manager to speed up loading - with self._stopwatch.time("Save Datamanager"): - datamanager = XYDataManager( - X, - y, - X_test=X_test, - y_test=y_test, - task=self._task, - feat_type=self._feat_type, - dataset_name=dataset_name, - ) + self._dataset_name = dataset_name + self._stopwatch.start(self._dataset_name) - self._backend._make_internals_directory() - self._label_num = datamanager.info["label_num"] - - self._backend.save_datamanager(datamanager) - - # = Create a searchspace - # Do this before One Hot Encoding to make sure that it creates a - # search space for a dense classifier even if one hot encoding would - # make it sparse (tradeoff; if one hot encoding would make it sparse, - # densifier and truncatedSVD would probably lead to a MemoryError, - # like this we can't use some of the preprocessing methods in case - # the data became sparse) - with self._stopwatch.time("Create Search space"): - self.configuration_space, configspace_path = self._create_search_space( - self._backend.temporary_directory, - self._backend, - datamanager, - include=self._include, - exclude=self._exclude, - ) + # Take the feature types from the validator + self._feat_type = self.InputValidator.feature_validator.feat_type - if only_return_configuration_space: - self._fit_cleanup() - return self.configuration_space - - # == Perform dummy predictions - with self._stopwatch.time("Dummy predictions"): - self.num_run += 1 - self._do_dummy_prediction() - - # == RUN ensemble builder - # Do this before calculating the meta-features to make sure that the - # dummy predictions are actually included in the ensemble even if - # calculating the meta-features takes very long - with self._stopwatch.time("Run Ensemble Builder"): - - elapsed_time = self._stopwatch.time_since(self._dataset_name, "start") - - time_left_for_ensembles = max(0, self._time_for_task - elapsed_time) - proc_ensemble = None - if time_left_for_ensembles <= 0: - # Fit only raises error when an ensemble class is given but - # time_left_for_ensembles is zero. - if self._ensemble_class is not None: - raise ValueError( - "Not starting ensemble builder because there " - "is no time left. Try increasing the value " - "of time_left_for_this_task." - ) - elif self._ensemble_class is None: - self._logger.info( - "Not starting ensemble builder because no ensemble class is given." 
- ) - else: - self._logger.info( - "Start Ensemble with %5.2fsec time left" % time_left_for_ensembles - ) + self._log_fit_setup() - proc_ensemble = EnsembleBuilderManager( - start_time=time.time(), - time_left_for_ensembles=time_left_for_ensembles, - backend=copy.deepcopy(self._backend), - dataset_name=dataset_name, + # == Pickle the data manager to speed up loading + with self._stopwatch.time("Save Datamanager"): + datamanager = XYDataManager( + X, + y, + X_test=X_test, + y_test=y_test, task=self._task, - metrics=self._metrics, - ensemble_class=self._ensemble_class, - ensemble_kwargs=self._ensemble_kwargs, - ensemble_nbest=self._ensemble_nbest, - max_models_on_disc=self._max_models_on_disc, - seed=self._seed, - precision=self.precision, - max_iterations=self._max_ensemble_build_iterations, - read_at_most=self._read_at_most, - memory_limit=self._memory_limit, - random_state=self._seed, - logger_port=self._logger_port, - pynisher_context=self._multiprocessing_context, + feat_type=self._feat_type, + dataset_name=dataset_name, ) - # kill the datamanager as it will be re-loaded anyways from sub processes - try: - del self._datamanager - except Exception: - pass + self._backend._make_internals_directory() + self._label_num = datamanager.info["label_num"] + + self._backend.save_datamanager(datamanager) + + # = Create a searchspace + # Do this before One Hot Encoding to make sure that it creates a + # search space for a dense classifier even if one hot encoding would + # make it sparse (tradeoff; if one hot encoding would make it sparse, + # densifier and truncatedSVD would probably lead to a MemoryError, + # like this we can't use some of the preprocessing methods in case + # the data became sparse) + with self._stopwatch.time("Create Search space"): + self.configuration_space, configspace_path = self._create_search_space( + self._backend.temporary_directory, + self._backend, + datamanager, + include=self._include, + exclude=self._exclude, + ) - # => RUN SMAC - with self._stopwatch.time("Run SMAC"): - elapsed_time = self._stopwatch.time_since(self._dataset_name, "start") - time_left = self._time_for_task - elapsed_time - - if self._logger: - self._logger.info("Start SMAC with %5.2fsec time left" % time_left) - if time_left <= 0: - self._logger.warning("Not starting SMAC because there is no time left.") - _proc_smac = None - self._budget_type = None - else: - if ( - self._per_run_time_limit is None - or self._per_run_time_limit > time_left - ): - self._logger.warning( - "Time limit for a single run is higher than total time " - "limit. Capping the limit for a single run to the total " - "time given to SMAC (%f)" % time_left + if only_return_configuration_space: + return self.configuration_space + + # == Perform dummy predictions + with self._stopwatch.time("Dummy predictions"): + self.num_run += 1 + self._do_dummy_prediction() + + # == RUN ensemble builder + # Do this before calculating the meta-features to make sure that the + # dummy predictions are actually included in the ensemble even if + # calculating the meta-features takes very long + with self._stopwatch.time("Run Ensemble Builder"): + + elapsed_time = self._stopwatch.time_since(self._dataset_name, "start") + + time_left_for_ensembles = max(0, self._time_for_task - elapsed_time) + proc_ensemble = None + if time_left_for_ensembles <= 0: + # Fit only raises error when an ensemble class is given but + # time_left_for_ensembles is zero. 
+ if self._ensemble_class is not None: + raise ValueError( + "Not starting ensemble builder because there " + "is no time left. Try increasing the value " + "of time_left_for_this_task." + ) + elif self._ensemble_class is None: + self._logger.info( + "No ensemble buildin because no ensemble class was given." ) - per_run_time_limit = time_left else: - per_run_time_limit = self._per_run_time_limit + self._logger.info( + "Start Ensemble with %5.2fsec time left" + % time_left_for_ensembles + ) + + proc_ensemble = EnsembleBuilderManager( + start_time=time.time(), + time_left_for_ensembles=time_left_for_ensembles, + backend=copy.deepcopy(self._backend), + dataset_name=dataset_name, + task=self._task, + metrics=self._metrics, + ensemble_class=self._ensemble_class, + ensemble_kwargs=self._ensemble_kwargs, + ensemble_nbest=self._ensemble_nbest, + max_models_on_disc=self._max_models_on_disc, + seed=self._seed, + precision=self.precision, + max_iterations=self._max_ensemble_build_iterations, + read_at_most=self._read_at_most, + memory_limit=self._memory_limit, + random_state=self._seed, + logger_port=self._logger_port, + pynisher_context=self._multiprocessing_context, + ) - # Make sure that at least 2 models are created for the ensemble process - num_models = time_left // per_run_time_limit - if num_models < 2: - per_run_time_limit = time_left // 2 + # kill the datamanager as it will be re-loaded anyways from sub processes + try: + del self._datamanager + except Exception: + pass + + # => RUN SMAC + with self._stopwatch.time("Run SMAC"): + elapsed_time = self._stopwatch.time_since(self._dataset_name, "start") + time_left = self._time_for_task - elapsed_time + + if self._logger: + self._logger.info("Start SMAC with %5.2fsec time left" % time_left) + if time_left <= 0: self._logger.warning( - "Capping the per_run_time_limit to {} to have " - "time for a least 2 models in each process.".format( - per_run_time_limit - ) + "Not starting SMAC because there is no time left." ) + _proc_smac = None + self._budget_type = None + else: + if ( + self._per_run_time_limit is None + or self._per_run_time_limit > time_left + ): + self._logger.warning( + "Time limit for a single run is higher than total time " + "limit. 
Capping the limit for a single run to the total " + "time given to SMAC (%f)" % time_left + ) + per_run_time_limit = time_left + else: + per_run_time_limit = self._per_run_time_limit + + # At least 2 models are created for the ensemble process + num_models = time_left // per_run_time_limit + if num_models < 2: + per_run_time_limit = time_left // 2 + self._logger.warning( + "Capping the per_run_time_limit to {} to have " + "time for a least 2 models in each process.".format( + per_run_time_limit + ) + ) - _proc_smac = AutoMLSMBO( - config_space=self.configuration_space, - dataset_name=self._dataset_name, - backend=self._backend, - total_walltime_limit=time_left, - func_eval_time_limit=per_run_time_limit, - memory_limit=self._memory_limit, - data_memory_limit=self._data_memory_limit, - stopwatch=self._stopwatch, - n_jobs=self._n_jobs, - dask_client=self._dask_client, - start_num_run=self.num_run, - num_metalearning_cfgs=self._initial_configurations_via_metalearning, - config_file=configspace_path, - seed=self._seed, - metadata_directory=self._metadata_directory, - metrics=self._metrics, - resampling_strategy=self._resampling_strategy, - resampling_strategy_args=self._resampling_strategy_arguments, - include=self._include, - exclude=self._exclude, - disable_file_output=self._disable_evaluator_output, - get_smac_object_callback=self._get_smac_object_callback, - smac_scenario_args=self._smac_scenario_args, - scoring_functions=self._scoring_functions, - port=self._logger_port, - pynisher_context=self._multiprocessing_context, - ensemble_callback=proc_ensemble, - trials_callback=self._get_trials_callback, - ) + n_meta_configs = self._initial_configurations_via_metalearning + _proc_smac = AutoMLSMBO( + config_space=self.configuration_space, + dataset_name=self._dataset_name, + backend=self._backend, + total_walltime_limit=time_left, + func_eval_time_limit=per_run_time_limit, + memory_limit=self._memory_limit, + data_memory_limit=self._data_memory_limit, + stopwatch=self._stopwatch, + n_jobs=self._n_jobs, + dask_client=self._dask_client, + start_num_run=self.num_run, + num_metalearning_cfgs=n_meta_configs, + config_file=configspace_path, + seed=self._seed, + metadata_directory=self._metadata_directory, + metrics=self._metrics, + resampling_strategy=self._resampling_strategy, + resampling_strategy_args=self._resampling_strategy_arguments, + include=self._include, + exclude=self._exclude, + disable_file_output=self._disable_evaluator_output, + get_smac_object_callback=self._get_smac_object_callback, + smac_scenario_args=self._smac_scenario_args, + scoring_functions=self._scoring_functions, + port=self._logger_port, + pynisher_context=self._multiprocessing_context, + ensemble_callback=proc_ensemble, + trials_callback=self._get_trials_callback, + ) - try: ( self.runhistory_, self.trajectory_, @@ -928,42 +948,49 @@ def fit( ] with open(trajectory_filename, "w") as fh: json.dump(saveable_trajectory, fh) - except Exception as e: - self._logger.exception(e) - raise - - self._logger.info("Starting shutdown...") - # Wait until the ensemble process is finished to avoid shutting down - # while the ensemble builder tries to access the data - if proc_ensemble is not None: - self.ensemble_performance_history = list(proc_ensemble.history) - - if len(proc_ensemble.futures) > 0: - # Now we need to wait for the future to return as it cannot be cancelled - # while it is running: https://stackoverflow.com/a/49203129 - self._logger.info( - "Ensemble script still running, waiting for it to finish." 
- ) - result = proc_ensemble.futures.pop().result() - if result: - ensemble_history, _ = result - self.ensemble_performance_history.extend(ensemble_history) - self._logger.info("Ensemble script finished, continue shutdown.") - - # save the ensemble performance history file - if len(self.ensemble_performance_history) > 0: - pd.DataFrame(self.ensemble_performance_history).to_json( - os.path.join( - self._backend.internals_directory, "ensemble_history.json" + + self._logger.info("Starting shutdown...") + # Wait until the ensemble process is finished to avoid shutting down + # while the ensemble builder tries to access the data + if proc_ensemble is not None: + self.ensemble_performance_history = list(proc_ensemble.history) + + if len(proc_ensemble.futures) > 0: + # Now we wait for the future to return as it cannot be cancelled + # while it is running: https://stackoverflow.com/a/49203129 + self._logger.info( + "Ensemble script still running, waiting for it to finish." + ) + result = proc_ensemble.futures.pop().result() + if result: + ensemble_history, _ = result + self.ensemble_performance_history.extend(ensemble_history) + self._logger.info("Ensemble script finished, continue shutdown.") + + # save the ensemble performance history file + if len(self.ensemble_performance_history) > 0: + pd.DataFrame(self.ensemble_performance_history).to_json( + os.path.join( + self._backend.internals_directory, "ensemble_history.json" + ) ) - ) - if load_models: - self._logger.info("Loading models...") - self._load_models() - self._logger.info("Finished loading models...") + if load_models: + self._logger.info("Loading models...") + self._load_models() + self._logger.info("Finished loading models...") + + # The whole logic above from where we begin the logging server is capture + # in a try: finally: so that if something goes wrong, we at least close + # down the logging server, preventing it from hanging and not closing + # until ctrl+c is pressed + except Exception as e: + # This will be called before the _fit_cleanup + self._logger.exception(e) + raise e + finally: + self._fit_cleanup() - self._fit_cleanup() self.fitted = True return self @@ -1447,6 +1474,7 @@ def predict(self, X, batch_size=None, n_jobs=1): # Each process computes predictions in chunks of batch_size rows. 
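# --- Illustrative aside, not part of the patch ----------------------------
# The predict() signature shown just above takes batch_size and n_jobs; a
# minimal usage sketch at the estimator level (data and numbers made up):

import numpy as np

from autosklearn.classification import AutoSklearnClassifier

automl = AutoSklearnClassifier(time_left_for_this_task=60)
automl.fit(np.random.rand(200, 5), np.random.randint(0, 2, 200))

# Predict in chunks of 1000 rows, with two jobs computing chunks in parallel
y_hat = automl.predict(np.random.rand(5000, 5), batch_size=1000, n_jobs=2)
# ---------------------------------------------------------------------------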
try: for i, tmp_model in enumerate(self.models_.values()): + # TODO, modify this if isinstance(tmp_model, (DummyRegressor, DummyClassifier)): check_is_fitted(tmp_model) else: @@ -1502,6 +1530,7 @@ def fit_ensemble( ensemble_nbest: Optional[int] = None, ensemble_class: Optional[AbstractEnsemble] = EnsembleSelection, ensemble_kwargs: Optional[Dict[str, Any]] = None, + metrics: Scorer | Sequence[Scorer] | None = None, ): check_is_fitted(self) @@ -1532,6 +1561,10 @@ def fit_ensemble( else: self._is_dask_client_internally_created = False + metrics = metrics if metrics is not None else self._metrics + if not isinstance(metrics, Sequence): + metrics = [metrics] + # Use the current thread to start the ensemble builder process # The function ensemble_builder_process will internally create a ensemble # builder in the provide dask client @@ -1541,7 +1574,7 @@ def fit_ensemble( backend=copy.deepcopy(self._backend), dataset_name=dataset_name if dataset_name else self._dataset_name, task=task if task else self._task, - metrics=self._metrics, + metrics=metrics if metrics is not None else self._metrics, ensemble_class=( ensemble_class if ensemble_class is not None else self._ensemble_class ), @@ -1652,20 +1685,12 @@ def _load_best_individual_model(self): return ensemble def _load_pareto_set(self) -> Sequence[VotingClassifier | VotingRegressor]: - if len(self._metrics) <= 1: - raise ValueError("Pareto set is only available for two or more metrics.") - - if self._ensemble_class is not None: + if self.ensemble_ is None: self.ensemble_ = self._backend.load_ensemble(self._seed) - else: - self.ensemble_ = None # If no ensemble is loaded we cannot do anything if not self.ensemble_: - - raise ValueError( - "Pareto set can only be accessed if an ensemble is available." - ) + raise ValueError("Pareto set only available if ensemble can be loaded.") if isinstance(self.ensemble_, AbstractMultiObjectiveEnsemble): pareto_set = self.ensemble_.get_pareto_set() @@ -1691,8 +1716,10 @@ def _load_pareto_set(self) -> Sequence[VotingClassifier | VotingRegressor]: estimators=None, voting="soft", ) + kind = "classifier" else: voter = VotingRegressor(estimators=None) + kind = "regeressor" if self._resampling_strategy in ("cv", "cv-iterative-fit"): models = self._backend.load_cv_models_by_identifiers(identifiers) @@ -1705,8 +1732,32 @@ def _load_pareto_set(self) -> Sequence[VotingClassifier | VotingRegressor]: weight_vector = [] estimators = [] for identifier in identifiers: - weight_vector.append(weights[identifier]) - estimators.append(models[identifier]) + estimator = models[identifier] + weight = weights[identifier] + + # Kind of hacky, really the dummy models should + # act like everything else does. Doing this is + # required so that the VotingClassifier/Regressor + # can use it as intended + if not isinstance(estimator, Pipeline): + if kind == "classifier": + steps = [ + ("data_preprocessor", None), + ("balancing", None), + ("feature_preprocessor", None), + (kind, estimator), + ] + else: + steps = [ + ("data_preprocessor", None), + ("feature_preprocessor", None), + (kind, estimator), + ] + + estimator = Pipeline(steps=steps) + + weight_vector.append(weight) + estimators.append(estimator) voter.estimators = estimators voter.estimators_ = estimators @@ -2123,7 +2174,7 @@ def show_models(self) -> dict[int, Any]: ensemble_dict = {} - if self._ensemble_class is not None: + if self._ensemble_class is None: warnings.warn( "No models in the ensemble. Kindly provide an ensemble class." 
) @@ -2138,10 +2189,10 @@ def has_key(rv, key): return rv.additional_info and key in rv.additional_info table_dict = {} - for rkey, rval in self.runhistory_.data.items(): - if has_key(rval, "num_run"): - model_id = rval.additional_info["num_run"] - table_dict[model_id] = {"model_id": model_id, "cost": rval.cost} + for run_key, run_val in self.runhistory_.data.items(): + if has_key(run_val, "num_run"): + model_id = run_val.additional_info["num_run"] + table_dict[model_id] = {"model_id": model_id, "cost": run_val.cost} # Checking if the dictionary is empty if not table_dict: diff --git a/autosklearn/ensemble_building/builder.py b/autosklearn/ensemble_building/builder.py index 487332cbe1..50f69eb35a 100644 --- a/autosklearn/ensemble_building/builder.py +++ b/autosklearn/ensemble_building/builder.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Dict, Iterable, Sequence, Type, cast +from typing import Any, Iterable, Mapping, Sequence, Type, cast import logging.handlers import multiprocessing @@ -46,7 +46,7 @@ def __init__( task_type: int, metrics: Sequence[Scorer], ensemble_class: Type[AbstractEnsemble] = EnsembleSelection, - ensemble_kwargs: Dict[str, Any] | None = None, + ensemble_kwargs: Mapping[str, Any] | None = None, ensemble_nbest: int | float = 50, max_models_on_disc: int | float | None = 100, seed: int = 1, @@ -71,9 +71,11 @@ def __init__( metrics: Sequence[Scorer] Metrics to optimize the ensemble for. These must be non-duplicated. - ensemble_class + ensemble_class: Type[AbstractEnsemble] + Implementation of the ensemble algorithm. - ensemble_kwargs + ensemble_kwargs: Mapping[str, Any] | None + Arguments passed to the constructor of the ensemble algorithm. ensemble_nbest: int | float = 50 @@ -169,6 +171,8 @@ def __init__( self.validation_performance_ = np.inf # Data we may need + # TODO: The test data is needlessly loaded but automl_common has no concept of + # these and is perhaps too rigid datamanager: XYDataManager = self.backend.load_datamanager() self._X_test: SUPPORTED_FEAT_TYPES | None = datamanager.data.get("X_test", None) self._y_test: np.ndarray | None = datamanager.data.get("Y_test", None) @@ -442,6 +446,17 @@ def main( self.logger.debug("Found no runs") raise RuntimeError("Found no runs") + # We load in `X_data` if we need it + if any(m._needs_X for m in self.metrics): + ensemble_X_data = self.X_data("ensemble") + + if ensemble_X_data is None: + msg = "No `X_data` for 'ensemble' which was required by metrics" + self.logger.debug(msg) + raise RuntimeError(msg) + else: + ensemble_X_data = None + # Calculate the loss for those that require it requires_update = self.requires_loss_update(runs) if self.read_at_most is not None: @@ -450,9 +465,7 @@ def main( for run in requires_update: run.record_modified_times() # So we don't count as modified next time run.losses = { - metric.name: self.loss( - run, metric=metric, X_data=self.X_data("ensemble") - ) + metric.name: self.loss(run, metric=metric, X_data=ensemble_X_data) for metric in self.metrics } @@ -549,15 +562,14 @@ def main( return self.ensemble_history, self.ensemble_nbest targets = cast(np.ndarray, self.targets("ensemble")) # Sure they exist - X_data = self.X_data("ensemble") ensemble = self.fit_ensemble( candidates=candidates, - X_data=X_data, targets=targets, runs=runs, ensemble_class=self.ensemble_class, ensemble_kwargs=self.ensemble_kwargs, + X_data=ensemble_X_data, task=self.task_type, metrics=self.metrics, precision=self.precision, @@ -587,7 +599,15 @@ def main( run_preds = 
[r.predictions(kind, precision=self.precision) for r in models] pred = ensemble.predict(run_preds) - X_data = self.X_data(kind) + + if any(m._needs_X for m in self.metrics): + X_data = self.X_data(kind) + if X_data is None: + msg = f"No `X` data for '{kind}' which was required by metrics" + self.logger.debug(msg) + raise RuntimeError(msg) + else: + X_data = None scores = calculate_scores( solution=pred_targets, @@ -597,10 +617,19 @@ def main( X_data=X_data, scoring_functions=None, ) + + # TODO only one metric in history + # + # We should probably return for all metrics but this makes + # automl::performance_history a lot more complicated, will + # tackle in a future PR + first_metric = self.metrics[0] performance_stamp[f"ensemble_{score_name}_score"] = scores[ - self.metrics[0].name + first_metric.name ] - self.ensemble_history.append(performance_stamp) + + # Add the performance stamp to the history + self.ensemble_history.append(performance_stamp) # Lastly, delete any runs that need to be deleted. We save this as the last step # so that we have an ensemble saved that is up to date. If we do not do so, @@ -805,13 +834,13 @@ def candidate_selection( def fit_ensemble( self, - candidates: list[Run], - X_data: SUPPORTED_FEAT_TYPES, - targets: np.ndarray, + candidates: Sequence[Run], + runs: Sequence[Run], *, - runs: list[Run], + targets: np.ndarray | None = None, ensemble_class: Type[AbstractEnsemble] = EnsembleSelection, - ensemble_kwargs: Dict[str, Any] | None = None, + ensemble_kwargs: Mapping[str, Any] | None = None, + X_data: SUPPORTED_FEAT_TYPES | None = None, task: int | None = None, metrics: Sequence[Scorer] | None = None, precision: int | None = None, @@ -825,24 +854,24 @@ def fit_ensemble( Parameters ---------- - candidates: list[Run] + candidates: Sequence[Run] List of runs to build an ensemble from - X_data: SUPPORTED_FEAT_TYPES - The base level data. + runs: Sequence[Run] + List of all runs (also pruned ones and dummy runs) - targets: np.ndarray + targets: np.ndarray | None = None The targets to build the ensemble with - runs: list[Run] - List of all runs (also pruned ones and dummy runs) - - ensemble_class: AbstractEnsemble + ensemble_class: Type[AbstractEnsemble] Implementation of the ensemble algorithm. - ensemble_kwargs: Dict[str, Any] + ensemble_kwargs: Mapping[str, Any] | None Arguments passed to the constructor of the ensemble algorithm. + X_data: SUPPORTED_FEAT_TYPES | None = None + The base level data. 
+ task: int | None = None The kind of task performed @@ -859,24 +888,42 @@ def fit_ensemble( ------- AbstractEnsemble """ - task = task if task is not None else self.task_type + # Validate we have targets if None specified + if targets is None: + targets = self.targets("ensemble") + if targets is None: + path = self.backend._get_targets_ensemble_filename() + raise ValueError(f"`fit_ensemble` could not find any targets at {path}") + ensemble_class = ( ensemble_class if ensemble_class is not None else self.ensemble_class ) - ensemble_kwargs = ( - ensemble_kwargs if ensemble_kwargs is not None else self.ensemble_kwargs - ) - ensemble_kwargs = ensemble_kwargs if ensemble_kwargs is not None else {} - metrics = metrics if metrics is not None else self.metrics - rs = random_state if random_state is not None else self.random_state - ensemble = ensemble_class( - task_type=task, - metrics=metrics, - random_state=rs, - backend=self.backend, - **ensemble_kwargs, - ) # type: AbstractEnsemble + # Create the ensemble_kwargs, favouring in order: + # 1) function kwargs, 2) function params 3) init_kwargs 4) init_params + + # Collect func params in dict if they're not None + params = { + k: v + for k, v in [ + ("task_type", task), + ("metrics", metrics), + ("random_state", random_state), + ] + if v is not None + } + + kwargs = { + "backend": self.backend, + "task_type": self.task_type, + "metrics": self.metrics, + "random_state": self.random_state, + **(self.ensemble_kwargs or {}), + **params, + **(ensemble_kwargs or {}), + } + + ensemble = ensemble_class(**kwargs) # type: AbstractEnsemble self.logger.debug(f"Fitting ensemble on {len(candidates)} models") start_time = time.time() @@ -995,7 +1042,8 @@ def loss( self, run: Run, metric: Scorer, - X_data: SUPPORTED_FEAT_TYPES, + *, + X_data: SUPPORTED_FEAT_TYPES | None = None, kind: str = "ensemble", ) -> float: """Calculate the loss for a run @@ -1008,6 +1056,9 @@ def loss( metric: Scorer The metric to calculate the loss of + X_data: SUPPORTED_FEAT_TYPES | None = None + Any X_data required to be passed to the metric + kind: str = "ensemble" The kind of targets to use for the run diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 7144fcc39c..5afd8c597c 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -38,6 +38,7 @@ from autosklearn.ensembles.ensemble_selection import EnsembleSelection from autosklearn.metrics import Scorer from autosklearn.pipeline.base import BasePipeline +from autosklearn.util.smac_wrap import SMACCallback class AutoSklearnEstimator(BaseEstimator): @@ -69,7 +70,7 @@ def __init__( metric: Scorer | Sequence[Scorer] | None = None, scoring_functions: Optional[List[Scorer]] = None, load_models: bool = True, - get_trials_callback=None, + get_trials_callback: SMACCallback | None = None, dataset_compression: Union[bool, Mapping[str, Any]] = True, allow_string_features: bool = True, ): @@ -261,8 +262,8 @@ def __init__( list are: * ``'y_optimization'`` : do not save the predictions for the - optimization/validation set, which would later on be used to build - an ensemble. + optimization set, which would later on be used to build an ensemble. + * ``model`` : do not save any model files smac_scenario_args : dict, optional (None) @@ -301,10 +302,19 @@ def __init__( Whether to load the models after fitting Auto-sklearn. get_trials_callback: callable - Callback function to create an object of subclass defined in module - `smac.callbacks `_. - This is an advanced feature. 
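# --- Illustrative aside, not part of the patch ----------------------------
# The rewritten docstring below describes get_trials_callback as a plain
# callable (smac.SMBO, smac.RunInfo, smac.RunValue, time_left) -> bool | None.
# A minimal early-stopping sketch under that contract (the loss threshold is
# made up; returning False is assumed to ask SMAC to stop, as with
# IncorporateRunResultCallback):

from autosklearn.classification import AutoSklearnClassifier


def stop_when_good_enough(smbo, run_info, run_value, time_left):
    # run_value.cost is the minimized loss of the run that just finished
    if run_value.cost < 0.05:
        return False  # request early stop
    return None  # keep optimizing


automl = AutoSklearnClassifier(
    time_left_for_this_task=120,
    get_trials_callback=stop_when_good_enough,
)
# ---------------------------------------------------------------------------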
Use only if you are familiar with - `SMAC `_. + A callable with the following definition. + + * (smac.SMBO, smac.RunInfo, smac.RunValue, time_left: float) -> bool | None + + This will be called after SMAC, the underlying optimizer for autosklearn, + finishes training each run. + + You can use this to record your own information about the optimization + process. You can also use this to enable a early stopping based on some + critera. + + See the example: + :ref:`Early Stopping And Callbacks `. dataset_compression: Union[bool, Mapping[str, Any]] = True We compress datasets so that they fit into some predefined amount of memory. @@ -601,6 +611,7 @@ def fit_ensemble( ensemble_kwargs: Optional[Dict[str, Any]] = None, ensemble_nbest: Optional[int] = None, ensemble_class: Optional[AbstractEnsemble] = EnsembleSelection, + metrics: Scorer | Sequence[Scorer] | None = None, ): """Fit an ensemble to models trained during an optimization process. @@ -650,12 +661,13 @@ def fit_ensemble( to obtain only use the single best model instead of an ensemble. + metrics: Scorer | Sequence[Scorer] | None = None + A metric or list of metrics to score the ensemble with + Returns ------- self - """ - # User specified `ensemble_size` explicitly, warn them about deprecation if ensemble_size is not None: # Keep consistent behaviour @@ -708,6 +720,7 @@ def fit_ensemble( ensemble_nbest=ensemble_nbest, ensemble_class=ensemble_class, ensemble_kwargs=ensemble_kwargs, + metrics=metrics, ) return self @@ -1041,31 +1054,31 @@ def additional_info_has_key(rv, key): return rv.additional_info and key in rv.additional_info model_runs = {} - for rkey, rval in self.automl_.runhistory_.data.items(): - if not additional_info_has_key(rval, "num_run"): + for run_key, run_val in self.automl_.runhistory_.data.items(): + if not additional_info_has_key(run_val, "num_run"): continue else: - model_key = rval.additional_info["num_run"] + model_key = run_val.additional_info["num_run"] model_run = { - "model_id": rval.additional_info["num_run"], - "seed": rkey.seed, - "budget": rkey.budget, - "duration": rval.time, - "config_id": rkey.config_id, - "start_time": rval.starttime, - "end_time": rval.endtime, - "status": str(rval.status), - "train_loss": rval.additional_info["train_loss"] - if additional_info_has_key(rval, "train_loss") + "model_id": run_val.additional_info["num_run"], + "seed": run_key.seed, + "budget": run_key.budget, + "duration": run_val.time, + "config_id": run_key.config_id, + "start_time": run_val.starttime, + "end_time": run_val.endtime, + "status": str(run_val.status), + "train_loss": run_val.additional_info["train_loss"] + if additional_info_has_key(run_val, "train_loss") else None, - "config_origin": rval.additional_info["configuration_origin"] - if additional_info_has_key(rval, "configuration_origin") + "config_origin": run_val.additional_info["configuration_origin"] + if additional_info_has_key(run_val, "configuration_origin") else None, } if num_metrics == 1: - model_run["cost"] = rval.cost + model_run["cost"] = run_val.cost else: - for cost_idx, cost in enumerate(rval.cost): + for cost_idx, cost in enumerate(run_val.cost): model_run[f"cost_{cost_idx}"] = cost model_runs[model_key] = model_run diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index aace158c00..ba17513ae0 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -71,7 +71,7 @@ def fit_predict_try_except_decorator( # File "auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 616, 
in fit_predict_and_loss, # noqa E501 # status=status # File "auto-sklearn/autosklearn/evaluation/abstract_evaluator.py", line 320, in finish_up # noqa E501 - # self.queue.put(rval_dict) + # self.queue.put(return_value_dict) # File "miniconda/3-4.5.4/envs/autosklearn/lib/python3.7/multiprocessing/queues.py", line 87, in put # noqa E501 # self._start_thread() # File "miniconda/3-4.5.4/envs/autosklearn/lib/python3.7/multiprocessing/queues.py", line 170, in _start_thread # noqa E501 @@ -230,14 +230,7 @@ def __init__( self.memory_limit = memory_limit dm = self.backend.load_datamanager() - if "X_valid" in dm.data and "Y_valid" in dm.data: - self._get_validation_loss = True - else: - self._get_validation_loss = False - if "X_test" in dm.data and "Y_test" in dm.data: - self._get_test_loss = True - else: - self._get_test_loss = False + self._get_test_loss = "X_test" in dm.data and "Y_test" in dm.data self.port = port self.pynisher_context = pynisher_context @@ -533,21 +526,6 @@ def run( additional_run_info["train_learning_curve"] = train_learning_curve additional_run_info["learning_curve_runtime"] = learning_curve_runtime - if self._get_validation_loss: - validation_learning_curve = ( - autosklearn.evaluation.util.extract_learning_curve( - info, - "validation_loss", - ) - ) - if len(validation_learning_curve) > 1: - additional_run_info[ - "validation_learning_curve" - ] = validation_learning_curve - additional_run_info[ - "learning_curve_runtime" - ] = learning_curve_runtime - if self._get_test_loss: test_learning_curve = ( autosklearn.evaluation.util.extract_learning_curve( diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index efd87c6cc3..b97f588a45 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -220,8 +220,6 @@ def __init__( self.include = include self.exclude = exclude - self.X_valid = self.datamanager.data.get("X_valid") - self.y_valid = self.datamanager.data.get("Y_valid") self.X_test = self.datamanager.data.get("X_test") self.y_test = self.datamanager.data.get("Y_test") @@ -359,7 +357,6 @@ def finish_up( loss: Union[Dict[str, float], float], train_loss: Optional[Dict[str, float]], opt_pred: np.ndarray, - valid_pred: np.ndarray, test_pred: np.ndarray, additional_run_info: Optional[TYPE_ADDITIONAL_INFO], file_output: bool, @@ -382,19 +379,12 @@ def finish_up( self.duration = time.time() - self.starttime if file_output: - file_out_loss, additional_run_info_ = self.file_output( - opt_pred, - valid_pred, - test_pred, - ) + file_out_loss, additional_run_info_ = self.file_output(opt_pred, test_pred) else: file_out_loss = None additional_run_info_ = {} - validation_loss, test_loss = self.calculate_auxiliary_losses( - valid_pred, - test_pred, - ) + test_loss = self.calculate_auxiliary_losses(test_pred) if file_out_loss is not None: return self.duration, file_out_loss, self.seed, additional_run_info_ @@ -424,59 +414,38 @@ def finish_up( additional_run_info["train_loss"] = [ train_loss[metric.name] for metric in self.metrics ] - if validation_loss is not None: - additional_run_info["validation_loss"] = validation_loss if test_loss is not None: additional_run_info["test_loss"] = test_loss - rval_dict = { + return_value_dict = { "loss": loss, "additional_run_info": additional_run_info, "status": status, } if final_call: - rval_dict["final_queue_element"] = True + return_value_dict["final_queue_element"] = True - self.queue.put(rval_dict) + self.queue.put(return_value_dict) return 
self.duration, loss_, self.seed, additional_run_info_ def calculate_auxiliary_losses( self, - Y_valid_pred: np.ndarray, - Y_test_pred: np.ndarray, - ) -> Tuple[Optional[float | Sequence[float]], Optional[float | Sequence[float]]]: - if Y_valid_pred is not None: - if self.y_valid is not None: - validation_loss: Optional[Union[float, Dict[str, float]]] = self._loss( - self.y_valid, Y_valid_pred - ) - if len(self.metrics) == 1: - validation_loss = validation_loss[self.metrics[0].name] - else: - validation_loss = None - else: - validation_loss = None + Y_test_pred: np.ndarray | None, + ) -> float | dict[str, float] | None: + if Y_test_pred is None or self.y_test is None: + return None - if Y_test_pred is not None: - if self.y_test is not None: - test_loss: Optional[Union[float, Dict[str, float]]] = self._loss( - self.y_test, Y_test_pred - ) - if len(self.metrics) == 1: - test_loss = test_loss[self.metrics[0].name] - else: - test_loss = None - else: - test_loss = None + test_loss = self._loss(self.y_test, Y_test_pred) + if len(self.metrics) == 1: + test_loss = test_loss[self.metrics[0].name] - return validation_loss, test_loss + return test_loss def file_output( self, Y_optimization_pred: np.ndarray, - Y_valid_pred: np.ndarray, Y_test_pred: np.ndarray, - ) -> Tuple[Optional[float], Dict[str, Union[str, int, float, List, Dict, Tuple]]]: + ) -> tuple[float | None, dict[str, Any]]: # Abort if self.Y_optimization is None # self.Y_optimization can be None if we use partial-cv, then, # obviously no output should be saved. @@ -496,12 +465,7 @@ def file_output( ) # Abort if predictions contain NaNs - for y, s in [ - # Y_train_pred deleted here. Fix unittest accordingly. - [Y_optimization_pred, "optimization"], - [Y_valid_pred, "validation"], - [Y_test_pred, "test"], - ]: + for y, s in [(Y_optimization_pred, "optimization"), (Y_test_pred, "test")]: if y is not None and not np.all(np.isfinite(y)): return ( 1.0, @@ -553,14 +517,13 @@ def file_output( budget=self.budget, model=self.model if "model" not in self.disable_file_output else None, cv_model=models if "cv_model" not in self.disable_file_output else None, + # TODO: below line needs to be deleted once backend is updated + valid_predictions=None, ensemble_predictions=( Y_optimization_pred if "y_optimization" not in self.disable_file_output else None ), - valid_predictions=( - Y_valid_pred if "y_valid" not in self.disable_file_output else None - ), test_predictions=( Y_test_pred if "y_test" not in self.disable_file_output else None ), diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index e76186aa06..d624c1a44d 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -67,7 +67,6 @@ def fit_predict_and_loss(self) -> None: loss=loss, train_loss=None, opt_pred=Y_pred, - valid_pred=None, test_pred=None, file_output=False, final_call=True, @@ -78,7 +77,6 @@ def fit_predict_and_loss(self) -> None: def predict_and_loss( self, train: bool = False ) -> Tuple[Union[Dict[str, float], float], np.array, Any, Any]: - if train: Y_pred = self.predict_function( self.X_train, self.model, self.task_type, self.Y_train diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index a8433c2136..f19db473bf 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -316,7 +316,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_train_pred = [None] * self.num_cv_folds 
Y_optimization_pred = [None] * self.num_cv_folds - Y_valid_pred = [None] * self.num_cv_folds Y_test_pred = [None] * self.num_cv_folds train_splits = [None] * self.num_cv_folds @@ -417,7 +416,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: **fit_params_array[i], ) - (train_pred, opt_pred, valid_pred, test_pred) = self._predict( + (train_pred, opt_pred, test_pred) = self._predict( model, train_indices=train_indices, test_indices=test_indices, @@ -425,7 +424,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_train_pred[i] = train_pred Y_optimization_pred[i] = opt_pred - Y_valid_pred[i] = valid_pred Y_test_pred[i] = test_pred train_splits[i] = train_indices @@ -499,20 +497,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: X_targets = concat_data(X_targets, num_cv_folds=self.num_cv_folds) Y_targets = concat_data(Y_targets, num_cv_folds=self.num_cv_folds) - if self.X_valid is not None: - Y_valid_preds = np.array( - [ - Y_valid_pred[i] - for i in range(self.num_cv_folds) - if Y_valid_pred[i] is not None - ] - ) - # Average the predictions of several models - if len(Y_valid_preds.shape) == 3: - Y_valid_preds = np.nanmean(Y_valid_preds, axis=0) - else: - Y_valid_preds = None - if self.X_test is not None: Y_test_preds = np.array( [ @@ -544,7 +528,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: loss=opt_loss, train_loss=train_loss, opt_pred=Y_optimization_pred_concat, - valid_pred=Y_valid_preds, test_pred=Y_test_preds, additional_run_info=additional_run_info, file_output=True, @@ -558,7 +541,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_train_pred = [None] * self.num_cv_folds Y_optimization_pred = [None] * self.num_cv_folds - Y_valid_pred = [None] * self.num_cv_folds Y_test_pred = [None] * self.num_cv_folds train_splits = [None] * self.num_cv_folds @@ -586,7 +568,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ( train_pred, opt_pred, - valid_pred, test_pred, additional_run_info, ) = self._partial_fit_and_predict_standard( @@ -599,7 +580,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ( train_pred, opt_pred, - valid_pred, test_pred, additional_run_info, ) = self._partial_fit_and_predict_budget( @@ -622,7 +602,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_train_pred[i] = train_pred Y_optimization_pred[i] = opt_pred - Y_valid_pred[i] = valid_pred Y_test_pred[i] = test_pred train_splits[i] = train_split @@ -683,18 +662,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: X_targets = concat_data(X_targets, num_cv_folds=self.num_cv_folds) Y_targets = concat_data(Y_targets, num_cv_folds=self.num_cv_folds) - if self.X_valid is not None: - Y_valid_pred = np.array( - [ - Y_valid_pred[i] - for i in range(self.num_cv_folds) - if Y_valid_pred[i] is not None - ] - ) - # Average the predictions of several models - if len(np.shape(Y_valid_pred)) == 3: - Y_valid_pred = np.nanmean(Y_valid_pred, axis=0) - if self.X_test is not None: Y_test_pred = np.array( [ @@ -746,7 +713,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: loss=opt_loss, train_loss=train_loss, opt_pred=Y_optimization_pred, - valid_pred=Y_valid_pred if self.X_valid is not None else None, test_pred=Y_test_pred if self.X_test is not None else None, additional_run_info=additional_run_info, file_output=True, @@ -793,7 +759,6 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No ( train_pred, opt_pred, - valid_pred, 
test_pred, additional_run_info, ) = self._partial_fit_and_predict_standard( @@ -819,7 +784,6 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No loss=loss, train_loss=train_loss, opt_pred=opt_pred, - valid_pred=valid_pred, test_pred=test_pred, file_output=False, final_call=True, @@ -883,12 +847,7 @@ def _partial_fit_and_predict_iterative( n_iter=n_iter, **fit_params, ) - ( - Y_train_pred, - Y_optimization_pred, - Y_valid_pred, - Y_test_pred, - ) = self._predict( + (Y_train_pred, Y_optimization_pred, Y_test_pred,) = self._predict( model, train_indices=train_indices, test_indices=test_indices, @@ -921,7 +880,6 @@ def _partial_fit_and_predict_iterative( loss=loss, train_loss=train_loss, opt_pred=Y_optimization_pred, - valid_pred=Y_valid_pred, test_pred=Y_test_pred, additional_run_info=additional_run_info, file_output=file_output, @@ -936,7 +894,6 @@ def _partial_fit_and_predict_iterative( ( Y_train_pred, Y_optimization_pred, - Y_valid_pred, Y_test_pred, additional_run_info, ) = self._partial_fit_and_predict_standard( @@ -962,7 +919,6 @@ def _partial_fit_and_predict_iterative( loss=loss, train_loss=train_loss, opt_pred=Y_optimization_pred, - valid_pred=Y_valid_pred, test_pred=Y_test_pred, additional_run_info=additional_run_info, file_output=file_output, @@ -980,7 +936,6 @@ def _partial_fit_and_predict_standard( ) -> Tuple[ PIPELINE_DATA_DTYPE, # train_pred PIPELINE_DATA_DTYPE, # opt_pred - PIPELINE_DATA_DTYPE, # valid_pred PIPELINE_DATA_DTYPE, # test_pred TYPE_ADDITIONAL_INFO, ]: @@ -1020,7 +975,7 @@ def _partial_fit_and_predict_standard( else self.Y_train[train_indices] ) - train_pred, opt_pred, valid_pred, test_pred = self._predict( + train_pred, opt_pred, test_pred = self._predict( model=model, train_indices=train_indices, test_indices=test_indices, @@ -1029,7 +984,6 @@ def _partial_fit_and_predict_standard( return ( train_pred, opt_pred, - valid_pred, test_pred, additional_run_info, ) @@ -1043,7 +997,6 @@ def _partial_fit_and_predict_budget( ) -> Tuple[ PIPELINE_DATA_DTYPE, # train_pred PIPELINE_DATA_DTYPE, # opt_pred - PIPELINE_DATA_DTYPE, # valid_pred PIPELINE_DATA_DTYPE, # test_pred TYPE_ADDITIONAL_INFO, ]: @@ -1073,7 +1026,7 @@ def _partial_fit_and_predict_budget( task_type=self.task_type, ) - train_pred, opt_pred, valid_pred, test_pred = self._predict( + train_pred, opt_pred, test_pred = self._predict( model, train_indices=train_indices, test_indices=test_indices, @@ -1088,19 +1041,13 @@ def _partial_fit_and_predict_budget( return ( train_pred, opt_pred, - valid_pred, test_pred, additional_run_info, ) def _predict( self, model: BaseEstimator, test_indices: List[int], train_indices: List[int] - ) -> Tuple[ - PIPELINE_DATA_DTYPE, - PIPELINE_DATA_DTYPE, - PIPELINE_DATA_DTYPE, - PIPELINE_DATA_DTYPE, - ]: + ) -> Tuple[PIPELINE_DATA_DTYPE, PIPELINE_DATA_DTYPE, PIPELINE_DATA_DTYPE]: train_pred = self.predict_function( self.X_train.iloc[train_indices] if hasattr(self.X_train, "iloc") @@ -1123,14 +1070,6 @@ def _predict( else self.Y_train[train_indices], ) - if self.X_valid is not None: - X_valid = self.X_valid.copy() - valid_pred = self.predict_function( - X_valid, model, self.task_type, self.Y_train[train_indices] - ) - else: - valid_pred = None - if self.X_test is not None: X_test = self.X_test.copy() test_pred = self.predict_function( @@ -1144,7 +1083,7 @@ def _predict( else: test_pred = None - return train_pred, opt_pred, valid_pred, test_pred + return train_pred, opt_pred, test_pred def get_splitter( self, D: AbstractDataManager diff --git 
a/autosklearn/evaluation/util.py b/autosklearn/evaluation/util.py index c249c8be1c..158825786b 100644 --- a/autosklearn/evaluation/util.py +++ b/autosklearn/evaluation/util.py @@ -12,19 +12,19 @@ def read_queue( stack = [] while True: try: - rval = queue_.get(timeout=1) + return_value = queue_.get(timeout=1) except queue.Empty: break # Check if there is a special placeholder value which tells us that # we don't have to wait until the queue times out in order to # retrieve the final value! - if "final_queue_element" in rval: - del rval["final_queue_element"] + if "final_queue_element" in return_value: + del return_value["final_queue_element"] do_break = True else: do_break = False - stack.append(rval) + stack.append(return_value) if do_break: break diff --git a/autosklearn/experimental/selector.py b/autosklearn/experimental/selector.py index 125cba6125..b854c7b440 100644 --- a/autosklearn/experimental/selector.py +++ b/autosklearn/experimental/selector.py @@ -297,17 +297,17 @@ def _predict( wins = wins / np.sum(wins) predictions[X.index[x_idx]] = wins - rval = { + return_value = { task_id: { strategy: predictions[task_id][strategy_idx] for strategy_idx, strategy in enumerate(self.strategies_) } for task_id in X.index } - rval = pd.DataFrame(rval).transpose().astype(float) - rval = rval[self.strategies_] - rval = rval.fillna(0.0) - return rval + return_value = pd.DataFrame(return_value).transpose().astype(float) + return_value = return_value[self.strategies_] + return_value = return_value.fillna(0.0) + return return_value def fit_pairwise_model(self, X, y, weights, rng, configuration): raise NotImplementedError() @@ -346,14 +346,14 @@ def fit( ) -> None: self.X_ = X self.strategies_ = y.columns - self.rval_ = np.array( + self.return_value_ = np.array( [ (len(self.strategies_) - self.default_strategies.index(strategy) - 1) / (len(self.strategies_) - 1) for strategy in self.strategies_ ] ) - self.rval_ = self.rval_ / np.sum(self.rval_) + self.return_value_ = self.return_value_ / np.sum(self.return_value_) self.selector.fit(X, y, minima, maxima) def _predict( @@ -377,7 +377,7 @@ def _predict( prediction.loc[task_id] = pd.Series( { strategy: value - for strategy, value in zip(self.strategies_, self.rval_) + for strategy, value in zip(self.strategies_, self.return_value_) } ) diff --git a/autosklearn/info.py b/autosklearn/info.py new file mode 100644 index 0000000000..a2c4318497 --- /dev/null +++ b/autosklearn/info.py @@ -0,0 +1,205 @@ +""" +This module servers as an introspection point for things users might +want to programatically query about autosklearn. 
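A brief aside on the read_queue() helper renamed a few hunks above: it drains a result queue and stops early when an item carries the "final_queue_element" marker. Below is a minimal sketch of the producer side of that convention, assuming nothing beyond what the hunk shows; the names produce and results_queue are illustrative and not taken from the patch.

from __future__ import annotations

import queue


def produce(results_queue: queue.Queue, losses: list[float]) -> None:
    """Push one result dict per finished run and flag the last one."""
    for i, loss in enumerate(losses):
        item = {"loss": loss, "status": "SUCCESS"}
        if i == len(losses) - 1:
            # The marker lets read_queue() return immediately instead of
            # waiting for its 1-second queue.Empty timeout; read_queue()
            # strips the key again before returning the item.
            item["final_queue_element"] = True
        results_queue.put(item)


q: queue.Queue = queue.Queue()
produce(q, [0.3, 0.2, 0.1])
# read_queue(q) would now return the three result dicts without blocking.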
+""" +from __future__ import annotations + +from typing import Any, Generic, Type, TypeVar + +from dataclasses import dataclass + +from typing_extensions import Literal + +from autosklearn.pipeline.components.base import ( + AutoSklearnClassificationAlgorithm, + AutoSklearnComponent, + AutoSklearnPreprocessingAlgorithm, + AutoSklearnRegressionAlgorithm, +) +from autosklearn.pipeline.components.classification import ClassifierChoice +from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice +from autosklearn.pipeline.components.feature_preprocessing import ( + FeaturePreprocessorChoice, +) +from autosklearn.pipeline.components.regression import RegressorChoice +from autosklearn.pipeline.constants import DATASET_PROPERTIES_TO_STRING + +# Something that is a type that inherits from AutoSklearnComponent +T = TypeVar("T", bound=Type[AutoSklearnComponent]) + + +def _translate_properties( + props: dict[str, Any], + kind: Literal["classifier", "regressor", "f_preprocessor", "d_preprocessor"], +) -> dict[str, Any]: + """Converts supported inputs and outputs to strings""" + # This is information is conveyed implicitly by being a regressor/classifier ... + delwords = ["handles_regression", "handles_classification"] + + # Covered by input type, duplicated info + delwords += ["handles_sparse", "handles_dense"] + + # Words we rename (from, to) + popwords: list[tuple[str, str]] = [ + ("input", "supported_inputs"), + ("output", "output_kind"), + ("is_deterministic", "deterministic"), + ] + + if kind in ["classifier", "f_preprocessor", "d_preprocessor"]: + delwords += ["handles_multioutput"] + + if kind in ["regressor", "f_preprocessor", "d_preprocessor"]: + delwords += ["handles_multiclass", "handles_multilabel"] + + for word in delwords: + if word in props: + del props[word] + + for frm, to in popwords: + props[to] = props.pop(frm) + + props["supported_inputs"] = [ + DATASET_PROPERTIES_TO_STRING[k] for k in props["supported_inputs"] + ] + props["output_kind"] = DATASET_PROPERTIES_TO_STRING[props["output_kind"][0]] + + return props + + +@dataclass +class _ComponentInfo(Generic[T]): + type: T # cls is not possible due to @dataclass conversion + name: str + shortname: str + output_kind: str + supported_inputs: list[str] + deterministic: bool = False + + +@dataclass +class RegressorInfo(_ComponentInfo[Type[AutoSklearnRegressionAlgorithm]]): + handles_multioutput: bool = False + prefers_data_normalized: bool = False + + +@dataclass +class ClassifierInfo(_ComponentInfo[Type[AutoSklearnClassificationAlgorithm]]): + handles_binary: bool = True # We assume all components support this + handles_multiclass: bool = False + handles_multilabel: bool = False + handles_multilabel_multiclass = False + + +@dataclass +class FeaturePreprocessorInfo(_ComponentInfo[Type[AutoSklearnPreprocessingAlgorithm]]): + pass + + +@dataclass +class DataPreprocessorInfo(_ComponentInfo[Type[AutoSklearnPreprocessingAlgorithm]]): + # There should be more here but our DataPreprocessing part of the pipeline doesn't + # pick up on it because there's on FeatTypeSplit available which further has + # subcomponents with extra properties + pass + + +@dataclass +class ComponentsInfo: + classifiers: dict[str, ClassifierInfo] + regressors: dict[str, RegressorInfo] + feature_preprocessors: dict[str, FeaturePreprocessorInfo] + data_preprocessors: dict[str, DataPreprocessorInfo] + + +def classifiers() -> dict[str, ClassifierInfo]: + """Get information about the classifiers available to auto-sklearn + + Returns + ------- + 
dict[str, ClassifierInfo] + The dict of classifiers and some info about them + """ + return { + name: ClassifierInfo( + **{ + "type": cls, + **_translate_properties(cls.get_properties(), "classifier"), + } + ) + for name, cls in ClassifierChoice.get_components().items() + } + + +def regressors() -> dict[str, RegressorInfo]: + """Get information about the regressors available to auto-sklearn + + Returns + ------- + dict[str, RegressorInfo] + The dict of regressors and some info about them + """ + return { + name: RegressorInfo( + **{"type": cls, **_translate_properties(cls.get_properties(), "regressor")}, + ) + for name, cls in RegressorChoice.get_components().items() + } + + +def feature_preprocessors() -> dict[str, FeaturePreprocessorInfo]: + """Get information about the feature preprocessors available to auto-sklearn + + Returns + ------- + dict[str, FeaturePreprocessorInfo] + The dict of feature preprocessors and some info about them + """ + return { + name: FeaturePreprocessorInfo( + **{ + "type": cls, + **_translate_properties(cls.get_properties(), "f_preprocessor"), + } + ) + for name, cls in FeaturePreprocessorChoice.get_components().items() + } + + +def data_preprocessors() -> dict[str, DataPreprocessorInfo]: + """Get information about the data preprocessors available to auto-sklearn + + Returns + ------- + dict[str, DataPreprocessorInfo] + The dict of data preprocessors and some info about them + """ + return { + name: DataPreprocessorInfo( + **{ + "type": cls, + **_translate_properties(cls.get_properties(), "d_preprocessor"), + } + ) + for name, cls in DataPreprocessorChoice.get_components().items() + } + + +def components() -> ComponentsInfo: + """Get information about all of the components available to auto-sklearn + + Returns + ------- + ComponentsInfo + A dataclass with the items + * classifiers + * regressors + * feature_preprocessors + * data_preprocessors + """ + return ComponentsInfo( + classifiers=classifiers(), + regressors=regressors(), + feature_preprocessors=feature_preprocessors(), + data_preprocessors=data_preprocessors(), + ) diff --git a/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py b/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py index f6c10c95d2..f49ed8ccab 100644 --- a/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py +++ b/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py @@ -122,7 +122,7 @@ def kNearestDatasets(self, x, k=1, return_distance=False): assert k == neighbor_indices.shape[1] - rval = [ + return_value = [ self.metafeatures.index[i] # Neighbor indices is 2d, each row is the indices for one # dataset in x. 
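Since the new autosklearn/info.py module shown above is meant as a user-facing introspection point, here is a hedged usage sketch of it. The component key "random_forest" and the concrete property values are assumptions about the installed components, not something this patch guarantees.

from autosklearn.info import classifiers, components

# Which built-in classifiers report multilabel support?
multilabel = sorted(
    name for name, info in classifiers().items() if info.handles_multilabel
)
print(multilabel)

# Everything at once, as the ComponentsInfo dataclass defined above.
all_info = components()
print(len(all_info.classifiers), "classifiers,", len(all_info.regressors), "regressors")

# Each entry records its supported input kinds and its output kind.
rf = all_info.classifiers["random_forest"]  # assumed component name
print(rf.supported_inputs, "->", rf.output_kind)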
@@ -130,9 +130,9 @@ def kNearestDatasets(self, x, k=1, return_distance=False): ] if return_distance is False: - return rval + return return_value else: - return rval, distances[0] + return return_value, distances[0] def kBestSuggestions(self, x, k=1, exclude_double_configurations=True): assert type(x) == pd.Series diff --git a/autosklearn/pipeline/base.py b/autosklearn/pipeline/base.py index 93c73b4716..3a13364ea6 100644 --- a/autosklearn/pipeline/base.py +++ b/autosklearn/pipeline/base.py @@ -495,15 +495,15 @@ def __repr__(self): dataset_properties_string.append("}") dataset_properties_string = "".join(dataset_properties_string) - rval = "%s(%s,\n%s)" % ( + return_value = "%s(%s,\n%s)" % ( class_name, configuration, dataset_properties_string, ) else: - rval = "%s(%s)" % (class_name, configuration_string) + return_value = "%s(%s)" % (class_name, configuration_string) - return rval + return return_value def _get_pipeline_steps(self, dataset_properties): raise NotImplementedError() diff --git a/autosklearn/pipeline/components/base.py b/autosklearn/pipeline/components/base.py index c4a95df08c..7f7adc91b5 100644 --- a/autosklearn/pipeline/components/base.py +++ b/autosklearn/pipeline/components/base.py @@ -1,4 +1,4 @@ -from typing import Dict +from __future__ import annotations import importlib import inspect @@ -10,7 +10,7 @@ from autosklearn.pipeline.constants import SPARSE -_addons = dict() # type: Dict[str, 'ThirdPartyComponents'] +_addons: dict[str, ThirdPartyComponents] = {} def find_components(package, directory, base_class): diff --git a/autosklearn/pipeline/components/data_preprocessing/__init__.py b/autosklearn/pipeline/components/data_preprocessing/__init__.py index 5693efd441..c63a80679f 100644 --- a/autosklearn/pipeline/components/data_preprocessing/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/__init__.py @@ -12,14 +12,16 @@ AutoSklearnChoice, AutoSklearnPreprocessingAlgorithm, ThirdPartyComponents, + _addons, find_components, ) -classifier_directory = os.path.split(__file__)[0] -_preprocessors = find_components( - __package__, classifier_directory, AutoSklearnPreprocessingAlgorithm +data_preprocessing_directory = os.path.split(__file__)[0] +_data_preprocessors = find_components( + __package__, data_preprocessing_directory, AutoSklearnPreprocessingAlgorithm ) -_addons = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) +additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) +_addons["data_preprocessing"] = additional_components def add_preprocessor(preprocessor: Type[AutoSklearnPreprocessingAlgorithm]) -> None: @@ -30,8 +32,8 @@ class DataPreprocessorChoice(AutoSklearnChoice): @classmethod def get_components(cls) -> OrderedDict: components: OrderedDict = OrderedDict() - components.update(_preprocessors) - components.update(_addons.components) + components.update(_data_preprocessors) + components.update(additional_components.components) return components def get_available_components( diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type.py b/autosklearn/pipeline/components/data_preprocessing/feature_type.py index bd42d8a67a..057099309c 100644 --- a/autosklearn/pipeline/components/data_preprocessing/feature_type.py +++ b/autosklearn/pipeline/components/data_preprocessing/feature_type.py @@ -215,6 +215,7 @@ def get_properties( "handles_multiclass": True, "handles_multilabel": True, "handles_multioutput": True, + "is_deterministic": True, # Assumption for now # TODO find out of this is right! 
"handles_sparse": True, "handles_dense": True, diff --git a/autosklearn/pipeline/components/feature_preprocessing/__init__.py b/autosklearn/pipeline/components/feature_preprocessing/__init__.py index cd52d6ad34..968a8e11ad 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/__init__.py +++ b/autosklearn/pipeline/components/feature_preprocessing/__init__.py @@ -14,9 +14,9 @@ find_components, ) -classifier_directory = os.path.split(__file__)[0] -_preprocessors = find_components( - __package__, classifier_directory, AutoSklearnPreprocessingAlgorithm +feature_preprocessing_directory = os.path.split(__file__)[0] +_feature_preprocessors = find_components( + __package__, feature_preprocessing_directory, AutoSklearnPreprocessingAlgorithm ) additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) _addons["feature_preprocessing"] = additional_components @@ -30,7 +30,7 @@ class FeaturePreprocessorChoice(AutoSklearnChoice): @classmethod def get_components(cls): components = OrderedDict() - components.update(_preprocessors) + components.update(_feature_preprocessors) components.update(additional_components.components) return components diff --git a/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py b/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py index 546c8742ad..1af3dc1d8e 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py +++ b/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py @@ -86,6 +86,7 @@ def get_properties(dataset_properties=None): "handles_multiclass": True, "handles_multilabel": False, "handles_multioutput": False, + "is_deterministic": False, "input": (SPARSE, DENSE, UNSIGNED_DATA), "output": (INPUT,), } diff --git a/autosklearn/pipeline/components/regression/sgd.py b/autosklearn/pipeline/components/regression/sgd.py index 3b3f939fa8..38164e8d3f 100644 --- a/autosklearn/pipeline/components/regression/sgd.py +++ b/autosklearn/pipeline/components/regression/sgd.py @@ -179,7 +179,6 @@ def get_properties(dataset_properties=None): "handles_multilabel": False, "handles_multioutput": False, "is_deterministic": True, - "handles_sparse": True, "input": (DENSE, SPARSE, UNSIGNED_DATA), "output": (PREDICTIONS,), } diff --git a/autosklearn/util/smac_wrap.py b/autosklearn/util/smac_wrap.py new file mode 100644 index 0000000000..bf3202bbb3 --- /dev/null +++ b/autosklearn/util/smac_wrap.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from typing import Callable, Union + +from smac.callbacks import IncorporateRunResultCallback +from smac.optimizer.smbo import SMBO +from smac.runhistory.runhistory import RunInfo, RunValue + +SMACCallback = Callable[[SMBO, RunInfo, RunValue, float], Union[bool, None]] + + +class SmacRunCallback(IncorporateRunResultCallback): + def __init__(self, f: SMACCallback): + self.f = f + + def __call__( + self, + smbo: SMBO, + run_info: RunInfo, + result: RunValue, + time_left: float, + ) -> bool | None: + """ + Parameters + ---------- + smbo: SMBO + The SMAC SMBO object + + run_info: RunInfo + Information about the run completed + + result: RunValue + The results of the run + + time_left: float + How much time is left for the remaining runs + + Returns + ------- + bool | None + If False is returned, the optimization loop will stop + """ + return self.f(smbo, run_info, result, time_left) diff --git a/doc/manual.rst b/doc/manual.rst index 7cdb162881..7ab31ce727 100644 --- a/doc/manual.rst 
+++ b/doc/manual.rst @@ -374,3 +374,8 @@ Other according to its performance on the validation set. Setting the initial configurations found by meta-learning to zero makes *auto-sklearn* use the regular SMAC algorithm for suggesting new hyperparameter configurations. + +.. collapse:: Early stopping and Callbacks + + By using the parameter ``get_trials_callback``, we can get access to the results + of runs as they occur. See this example :ref:`Early Stopping And Callbacks ` for more! diff --git a/examples/40_advanced/example_early_stopping_and_callbacks.py b/examples/40_advanced/example_early_stopping_and_callbacks.py new file mode 100644 index 0000000000..84dae5dced --- /dev/null +++ b/examples/40_advanced/example_early_stopping_and_callbacks.py @@ -0,0 +1,81 @@ +""" +============================ +Early stopping and Callbacks +============================ + +The example below shows how we can use the ``get_trials_callback`` parameter of +auto-sklearn to implement an early-stopping mechanism through a callback. + +These callbacks give access to the result of each model + hyperparameter configuration +optimized by SMAC, the underlying optimizer for autosklearn. By checking the cost of +a result, we can implement a simple yet effective early stopping mechanism! + +Do note however, this does not provide any access to the ensembles that autosklearn +produces, only the individual models. You may wish to perform a more sophisticated +early stopping mechanism such that there are enough good models for autosklearn to build +and ensemble with. This is here to provide a simple example. +""" +from __future__ import annotations + +from pprint import pprint + +import sklearn.datasets +import sklearn.metrics + +import autosklearn.classification + +from smac.optimizer.smbo import SMBO +from smac.runhistory.runhistory import RunInfo, RunValue + + +############################################################################ +# Build and fit a classifier +# ========================== +def callback( + smbo: SMBO, + run_info: RunInfo, + result: RunValue, + time_left: float, +) -> bool | None: + """Stop early if we get a very low cost value for a single run + + The return value indicates to SMAC whether to stop or not. False will + stop the search process while any other value will mean it continues. 
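The new example stops as soon as a single run reaches a very low cost. The same get_trials_callback signature also supports budget-style rules; the sketch below stops after a fixed number of reported results or when little time remains. The thresholds and the factory name make_stop_after are illustrative choices, not part of the patch.

from __future__ import annotations

from typing import Callable, Union

from smac.optimizer.smbo import SMBO
from smac.runhistory.runhistory import RunInfo, RunValue


def make_stop_after(
    n_results: int, min_time_left: float = 10.0
) -> Callable[[SMBO, RunInfo, RunValue, float], Union[bool, None]]:
    """Build a callback that stops SMAC after `n_results` reported runs."""
    seen = 0

    def callback(
        smbo: SMBO, run_info: RunInfo, result: RunValue, time_left: float
    ) -> bool | None:
        nonlocal seen
        seen += 1
        # Returning False stops the search; any other value lets it continue.
        if seen >= n_results or time_left < min_time_left:
            return False
        return None

    return callback


# Used the same way as the example's callback:
# AutoSklearnClassifier(..., get_trials_callback=make_stop_after(20))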
+ """ + # You can find out the parameters in the SMAC documentation + # https://automl.github.io/SMAC3/main/ + if result.cost <= 0.02: + print("Stopping!") + print(run_info) + print(result) + return False + + +X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) + +automl = autosklearn.classification.AutoSklearnClassifier( + time_left_for_this_task=120, per_run_time_limit=30, get_trials_callback=callback +) +automl.fit(X_train, y_train, dataset_name="breast_cancer") + +############################################################################ +# View the models found by auto-sklearn +# ===================================== + +print(automl.leaderboard()) + +############################################################################ +# Print the final ensemble constructed by auto-sklearn +# ==================================================== + +pprint(automl.show_models(), indent=4) + +########################################################################### +# Get the Score of the final ensemble +# =================================== + +predictions = automl.predict(X_test) +print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions)) diff --git a/scripts/03_calculate_metafeatures.py b/scripts/03_calculate_metafeatures.py index 95d857145a..d7e08ffea8 100644 --- a/scripts/03_calculate_metafeatures.py +++ b/scripts/03_calculate_metafeatures.py @@ -5,6 +5,7 @@ import os import sys import unittest.mock +import tempfile import arff import joblib @@ -82,10 +83,7 @@ def calculate_metafeatures(task_id): for task_type in ("classification", "regression"): output_directory = os.path.join(working_directory, "metafeatures", task_type) - try: - os.makedirs(output_directory) - except: - pass + os.makedirs(output_directory, exist_ok=True) all_metafeatures = {} @@ -100,13 +98,10 @@ def calculate_metafeatures(task_id): tasks = copy.deepcopy(tasks) np.random.shuffle(tasks) - def producer(): - for task_id in tasks: - yield task_id - - memory = joblib.Memory(location="/tmp/joblib", verbose=10) + tmpdir = os.path.join(tempfile.gettempdir(), "joblib") + memory = joblib.Memory(location=tmpdir, verbose=10) cached_calculate_metafeatures = memory.cache(calculate_metafeatures) - mfs = [cached_calculate_metafeatures(task_id) for task_id in producer()] + mfs = [cached_calculate_metafeatures(task_id) for task_id in tasks] for mf in mfs: if mf is not None: diff --git a/scripts/2015_nips_paper/run/score_ensemble.py b/scripts/2015_nips_paper/run/score_ensemble.py index 1e873f01fd..9842359225 100644 --- a/scripts/2015_nips_paper/run/score_ensemble.py +++ b/scripts/2015_nips_paper/run/score_ensemble.py @@ -227,14 +227,14 @@ def evaluate(input_directory, validation_files, test_files, ensemble_size=50): ensemble_time = time.time() - start - rval = { + return_value = { "ensemble_time": ensemble_time, "time_function_evaluation": time_function_evaluation, "ensemble_error": ensemble_error, "ensemble_test_error": ensemble_test_error, } - return rval + return return_value if __name__ == "__main__": diff --git a/test/fixtures/ensemble_building.py b/test/fixtures/ensemble_building.py index cca68f76d0..548d1c5d72 100644 --- a/test/fixtures/ensemble_building.py +++ b/test/fixtures/ensemble_building.py @@ -164,6 +164,10 @@ def _make( backend.save_additional_data( datamanager.data["Y_train"], what="targets_ensemble" ) + if "X_train" in datamanager.data: + backend.save_additional_data( + 
datamanager.data["X_train"], what="input_ensemble" + ) builder = EnsembleBuilder( backend=backend, diff --git a/test/fixtures/metrics.py b/test/fixtures/metrics.py new file mode 100644 index 0000000000..8bf754aea5 --- /dev/null +++ b/test/fixtures/metrics.py @@ -0,0 +1,26 @@ +from typing import Any + +import numpy as np + +from autosklearn.metrics import accuracy, make_scorer + + +def _accuracy_requiring_X_data( + y_true: np.ndarray, + y_pred: np.ndarray, + X_data: Any, +) -> float: + """Dummy metric that needs X Data""" + if X_data is None: + raise ValueError() + return accuracy(y_true, y_pred) + + +acc_with_X_data = make_scorer( + name="acc_with_X_data", + score_func=_accuracy_requiring_X_data, + needs_X=True, + optimum=1, + worst_possible_result=0, + greater_is_better=True, +) diff --git a/test/test_automl/__init__.py b/test/test_automl/__init__.py index dae354a675..e69de29bb2 100644 --- a/test/test_automl/__init__.py +++ b/test/test_automl/__init__.py @@ -1 +0,0 @@ -# -*- encoding: utf-8 -*- diff --git a/test/test_automl/cases.py b/test/test_automl/cases.py index c80d3d3b5f..9583c7b31d 100644 --- a/test/test_automl/cases.py +++ b/test/test_automl/cases.py @@ -14,7 +14,6 @@ {fitted} - If the automl case has been fitted {cv, holdout} - Whether explicitly cv or holdout was used {no_ensemble} - Fit with no ensemble size - {cached} - If the resulting case is then cached {multiobjective} - If the automl instance is multiobjective """ from __future__ import annotations @@ -24,10 +23,15 @@ from pathlib import Path import numpy as np +import sklearn.model_selection import autosklearn.metrics from autosklearn.automl import AutoMLClassifier, AutoMLRegressor from autosklearn.automl_common.common.utils.backend import Backend +from autosklearn.evaluation.abstract_evaluator import ( + MyDummyClassifier, + MyDummyRegressor, +) from pytest_cases import case, parametrize @@ -35,6 +39,11 @@ from test.fixtures.caching import Cache +def stop_at_first(smbo, run_info, result, time_left) -> bool: + """Used in some cases to enforce the only valid model is the dummy model""" + return False + + @case(tags=["classifier"]) def case_classifier( tmp_dir: str, @@ -60,7 +69,7 @@ def case_regressor( # ################################### # The following are fitted and cached # ################################### -@case(tags=["classifier", "fitted", "holdout", "cached"]) +@case(tags=["classifier", "fitted", "holdout"]) @parametrize("dataset", ["iris"]) def case_classifier_fitted_holdout_iterative( dataset: str, @@ -97,7 +106,7 @@ def case_classifier_fitted_holdout_iterative( return model -@case(tags=["classifier", "fitted", "cv", "cached"]) +@case(tags=["classifier", "fitted", "cv"]) @parametrize("dataset", ["iris"]) def case_classifier_fitted_cv( make_cache: Callable[[str], Cache], @@ -134,7 +143,7 @@ def case_classifier_fitted_cv( return model -@case(tags=["classifier", "fitted", "holdout", "cached", "multiobjective"]) +@case(tags=["classifier", "fitted", "holdout", "multiobjective"]) @parametrize("dataset", ["iris"]) def case_classifier_fitted_holdout_multiobjective( dataset: str, @@ -177,7 +186,7 @@ def case_classifier_fitted_holdout_multiobjective( return model -@case(tags=["regressor", "fitted", "holdout", "cached"]) +@case(tags=["regressor", "fitted", "holdout"]) @parametrize("dataset", ["boston"]) def case_regressor_fitted_holdout( make_cache: Callable[[str], Cache], @@ -212,7 +221,7 @@ def case_regressor_fitted_holdout( return model -@case(tags=["regressor", "fitted", "cv", "cached"]) +@case(tags=["regressor", 
"fitted", "cv"]) @parametrize("dataset", ["boston"]) def case_regressor_fitted_cv( make_cache: Callable[[str], Cache], @@ -249,7 +258,7 @@ def case_regressor_fitted_cv( return model -@case(tags=["classifier", "fitted", "no_ensemble", "cached"]) +@case(tags=["classifier", "fitted", "no_ensemble"]) @parametrize("dataset", ["iris"]) def case_classifier_fitted_no_ensemble( make_cache: Callable[[str], Cache], @@ -258,8 +267,7 @@ def case_classifier_fitted_no_ensemble( make_automl_classifier: Callable[..., AutoMLClassifier], make_sklearn_dataset: Callable[..., Tuple[np.ndarray, ...]], ) -> AutoMLClassifier: - """Case of a fitted classifier but ensemble was disabled by - not writing models to disk""" + """Case of a fitted classifier but ensemble was disabled""" key = f"case_classifier_fitted_no_ensemble_{dataset}" # This locks the cache for this item while we check, required for pytest-xdist @@ -270,7 +278,6 @@ def case_classifier_fitted_no_ensemble( temporary_directory=cache.path("backend"), delete_tmp_folder_after_terminate=False, ensemble_class=None, - disable_evaluator_output=True, ) X, y, Xt, yt = make_sklearn_dataset(name=dataset) @@ -282,3 +289,85 @@ def case_classifier_fitted_no_ensemble( model._backend = copy_backend(old=model._backend, new=make_backend()) return model + + +@case(tags=["classifier", "fitted"]) +def case_classifier_fitted_only_dummy( + make_cache: Callable[[str], Cache], + make_backend: Callable[..., Backend], + make_automl_classifier: Callable[..., AutoMLClassifier], +) -> AutoMLClassifier: + """Case of a fitted classifier but only dummy was found""" + key = "case_classifier_fitted_only_dummy" + + # This locks the cache for this item while we check, required for pytest-xdist + + with make_cache(key) as cache: + if "model" not in cache: + model = make_automl_classifier( + temporary_directory=cache.path("backend"), + delete_tmp_folder_after_terminate=False, + include={"classifier": ["bernoulli_nb"]}, # Just a meh model + get_trials_callback=stop_at_first, + ) + rand = np.random.RandomState(2) + _X = rand.random((100, 50)) + _y = rand.randint(0, 2, (100,)) + X, Xt, y, yt = sklearn.model_selection.train_test_split( + _X, _y, random_state=1 # Required to ensure dummy is best + ) + model.fit(X, y, dataset_name="random") + + # We now validate that indeed, the only model is the Dummy + members = list(model.models_.values()) + if len(members) != 1 and not isinstance(members[0], MyDummyClassifier): + raise ValueError("Should only have one model, dummy\n", members) + + cache.save(model, "model") + + model = cache.load("model") + model._backend = copy_backend(old=model._backend, new=make_backend()) + + return model + + +@case(tags=["regressor", "fitted"]) +def case_regressor_fitted_only_dummy( + make_cache: Callable[[str], Cache], + make_backend: Callable[..., Backend], + make_automl_regressor: Callable[..., AutoMLRegressor], +) -> AutoMLRegressor: + """Case of a fitted classifier but only dummy was found""" + key = "case_regressor_fitted_only_dummy" + + # This locks the cache for this item while we check, required for pytest-xdist + + with make_cache(key) as cache: + if "model" not in cache: + model = make_automl_regressor( + temporary_directory=cache.path("backend"), + delete_tmp_folder_after_terminate=False, + include={"regressor": ["k_nearest_neighbors"]}, # Just a meh model + get_trials_callback=stop_at_first, + ) + + rand = np.random.RandomState(2) + _X = rand.random((100, 50)) + _y = rand.random((100,)) + + X, Xt, y, yt = sklearn.model_selection.train_test_split( + _X, _y, 
random_state=1 # Required to ensure dummy is best + ) + model.fit(X, y, dataset_name="random") + + # We now validate that indeed, the only model is the Dummy + members = list(model.models_.values()) + if len(members) != 1 and not isinstance(members[0], MyDummyRegressor): + raise ValueError("Should only have one model, dummy\n", members) + + cache.save(model, "model") + + model = cache.load("model") + model._backend = copy_backend(old=model._backend, new=make_backend()) + + return model diff --git a/test/test_automl/test_construction.py b/test/test_automl/test_construction.py index 5b15812acd..5b68d35118 100644 --- a/test/test_automl/test_construction.py +++ b/test/test_automl/test_construction.py @@ -1,9 +1,4 @@ -"""Property based Tests - -These test are for checking properties of already fitted models. Any test that does -tests using cases should not modify the state as these models are cached between tests -to reduce training time. -""" +"""Test things related to only constructing an AutoML instance""" from typing import Any, Dict, Optional, Union from autosklearn.automl import AutoML diff --git a/test/test_automl/test_dataset_compression.py b/test/test_automl/test_dataset_compression.py index d50869ebbf..0a7e5a18bf 100644 --- a/test/test_automl/test_dataset_compression.py +++ b/test/test_automl/test_dataset_compression.py @@ -1,3 +1,4 @@ +"""Test things related to how AutoML compresses the dataset size""" from typing import Any, Callable, Dict import numpy as np diff --git a/test/test_automl/test_dummy_predictions.py b/test/test_automl/test_dummy_predictions.py index c4aa560791..c593e7f4cf 100644 --- a/test/test_automl/test_dummy_predictions.py +++ b/test/test_automl/test_dummy_predictions.py @@ -1,3 +1,7 @@ +"""Test the dummy predictor of AutoML + +Dummy models can serve as an early warning of issues with parameters during fit +""" from __future__ import annotations from typing import Callable, Sequence, Tuple @@ -183,6 +187,11 @@ def test_crash_due_to_memory_exception( def test_raises_if_no_metric_set(make_automl: Callable[..., AutoML]) -> None: + """ + Expects + ------- + * raise if there was no metric set when calling `_do_dummy_prediction()` + """ automl = make_automl() with pytest.raises(ValueError, match="Metric/Metrics was/were not set"): automl._do_dummy_prediction() @@ -193,10 +202,17 @@ def test_raises_invalid_metric( make_automl: Callable[..., AutoML], make_sklearn_dataset: Callable[..., XYDataManager], ) -> None: + """ + Expects + ------- + * Should raise an error if the given metric is not applicable to a given task type + """ + # `precision` is not applicable to MULTICLASS_CLASSIFICATION dataset = "iris" task = MULTICLASS_CLASSIFICATION + metrics = [accuracy, precision] - automl = make_automl(metrics=[accuracy, precision]) + automl = make_automl(metrics=metrics) automl._logger = mock_logger datamanager = make_sklearn_dataset( diff --git a/test/test_automl/test_early_stopping.py b/test/test_automl/test_early_stopping.py new file mode 100644 index 0000000000..4aa7192180 --- /dev/null +++ b/test/test_automl/test_early_stopping.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable + +if TYPE_CHECKING: + import numpy as np + from smac.optimizer.smbo import SMBO + from smac.runhistory.runhistory import RunInfo, RunValue + + from autosklearn.automl import AutoMLClassifier + + +def test_early_stopping( + make_automl_classifier: Callable[..., AutoMLClassifier], + make_sklearn_dataset: Callable[..., tuple[np.ndarray, ...]], +) -> 
None: + """ + Expects + ------- + * Should early after fitting 2 models + """ + + def callback( + smbo: SMBO, + run_info: RunInfo, + result: RunValue, + time_left: float, + ) -> bool | None: + if int(result.additional_info["num_run"]) >= 2: + return False + + automl = make_automl_classifier(get_trials_callback=callback) + + X_train, Y_train, X_test, Y_test = make_sklearn_dataset("iris") + automl.fit(X_train, Y_train) + + assert len(automl.runhistory_.data) == 2 diff --git a/test/test_automl/test_fit.py b/test/test_automl/test_fit.py index 2defa2518b..02992e5e13 100644 --- a/test/test_automl/test_fit.py +++ b/test/test_automl/test_fit.py @@ -1,3 +1,4 @@ +"""Test specific ways of calling `fit` of AutoML""" from typing import Any, Callable, Dict, Optional, Tuple, Union import numpy as np diff --git a/test/test_automl/test_fit_pipeline.py b/test/test_automl/test_fit_pipeline.py index 137b57a5c3..5d1d648fc4 100644 --- a/test/test_automl/test_fit_pipeline.py +++ b/test/test_automl/test_fit_pipeline.py @@ -1 +1 @@ -"""TODO""" +"""Test specific ways of calling `fit_pipeline`""" diff --git a/test/test_automl/test_model_predict.py b/test/test_automl/test_model_predict.py index a301d1a9a5..b43c488220 100644 --- a/test/test_automl/test_model_predict.py +++ b/test/test_automl/test_model_predict.py @@ -1,3 +1,4 @@ +"""Test the _model_predict helper function such that it shapes output correctly""" from typing import Callable, Dict, Tuple import warnings @@ -19,7 +20,10 @@ class WarningModel: + """Simple model that returns incorrect shape and issues warning""" + def predict(self, X: np.ndarray) -> np.ndarray: + """Shout a warning during prediction""" warnings.warn("shout") return X diff --git a/test/test_automl/test_outputs.py b/test/test_automl/test_outputs.py deleted file mode 100644 index 5b31e60331..0000000000 --- a/test/test_automl/test_outputs.py +++ /dev/null @@ -1,118 +0,0 @@ -from pathlib import Path - -from autosklearn.automl import AutoML -from autosklearn.ensemble_building.builder import CANDIDATES_FILENAME - -from pytest import mark -from pytest_cases import parametrize_with_cases -from pytest_cases.filters import has_tag - -import test.test_automl.cases as cases -from test.conftest import DEFAULT_SEED - -# Some filters -has_ensemble = has_tag("fitted") & ~has_tag("no_ensemble") -no_ensemble = has_tag("fitted") & has_tag("no_ensemble") - - -@mark.todo -def test_datamanager_stored_contents() -> None: - ... 
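Several of the reorganized test modules in this patch select fitted automl cases by composing pytest-cases tag filters. A minimal sketch of that pattern follows; the test name and assertion are illustrative and not taken from the patch.

from pytest_cases import parametrize_with_cases
from pytest_cases.filters import has_tag

import test.test_automl.cases as cases

# Fitted cases that built an ensemble, excluding the multiobjective ones.
fitted_single_objective = (
    has_tag("fitted") & ~has_tag("no_ensemble") & ~has_tag("multiobjective")
)


@parametrize_with_cases("automl", cases=cases, filter=fitted_single_objective)
def test_at_least_one_model(automl) -> None:
    # Every matching case should expose at least one fitted model.
    assert len(automl.models_) >= 1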
- - -@parametrize_with_cases("automl", cases=cases, filter=has_ensemble) -def test_paths_created(automl: AutoML) -> None: - """ - Parameters - ---------- - automl : AutoML - A previously fitted automl - - Expects - ------- - * The given paths should exist after the automl has been run and fitted - """ - assert automl._backend is not None - - partial = Path(automl._backend.internals_directory) - expected = [ - partial / fixture - for fixture in ( - "true_targets_ensemble.npy", - f"start_time_{DEFAULT_SEED}", - "datamanager.pkl", - "runs", - ) - ] - - for path in expected: - assert path.exists() - - -@parametrize_with_cases("automl", cases=cases, filter=has_ensemble) -def test_paths_created_with_ensemble(automl: AutoML) -> None: - """ - Parameters - ---------- - automl : AutoML - A previously fitted automl - - Expects - ------- - * The given paths for an automl with an ensemble should include paths - specific to ensemble building - """ - assert automl._backend is not None - - partial = Path(automl._backend.internals_directory) - expected = [ - partial / fixture - for fixture in ( - "ensembles", - "ensemble_history.json", - CANDIDATES_FILENAME, - ) - ] - - for path in expected: - assert path.exists() - - -@parametrize_with_cases("automl", cases=cases, filter=has_ensemble) -def test_at_least_one_model_and_predictions(automl: AutoML) -> None: - """ - Expects - ------- - * There should be at least one models saved - * Each model saved should have predictions for the ensemble - """ - assert automl._backend is not None - runs_dir = Path(automl._backend.get_runs_directory()) - - runs = list(runs_dir.iterdir()) - assert len(runs) > 0 - - at_least_one = False - for run in runs: - prediction_files = run.glob("predictions_ensemble*.npy") - model_files = run.glob("*.*.model") - - if any(prediction_files): - at_least_one = True - assert any(model_files), "Run produced prediction but no model" - - assert at_least_one, "No runs produced predictions" - - -@parametrize_with_cases("automl", cases=cases, filter=has_ensemble) -def test_at_least_one_ensemble(automl: AutoML) -> None: - """ - Expects - ------- - * There should be at least one ensemble generated - """ - assert automl._backend is not None - ens_dir = Path(automl._backend.get_ensemble_dir()) - - # TODO make more generic - assert len(list(ens_dir.glob("*.ensemble"))) > 0 diff --git a/test/test_automl/test_pareto_front.py b/test/test_automl/test_pareto_front.py new file mode 100644 index 0000000000..8ff38a04a6 --- /dev/null +++ b/test/test_automl/test_pareto_front.py @@ -0,0 +1,36 @@ +"""Test the output of loading the pareto set from an automl instance""" +from autosklearn.automl import AutoML + +from pytest_cases import parametrize_with_cases +from pytest_cases.filters import has_tag + +import test.test_automl.cases as cases + +has_ensemble = has_tag("fitted") & ~has_tag("no_ensemble") + +single_objective = has_ensemble & ~has_tag("multiobjective") +multi_objective = has_ensemble & has_tag("multiobjective") + + +@parametrize_with_cases("automl", cases=cases, filter=single_objective) +def test_can_output_pareto_front_singleobjective(automl: AutoML) -> None: + """ + Expects + ------- + * Non-multiobjective instances should have a pareto set of size 1 + """ + pareto_set = automl._load_pareto_set() + + assert len(pareto_set) == 1 + + +@parametrize_with_cases("automl", cases=cases, filter=multi_objective) +def test_can_output_pareto_front_multiobjective(automl: AutoML) -> None: + """ + Expects + ------- + * Multiobjective ensembles should return >= 1, 
#TODO should test it's pareto optimal + """ + pareto_set = automl._load_pareto_set() + + assert len(pareto_set) >= 1 diff --git a/test/test_automl/test_performance.py b/test/test_automl/test_performance.py index e69de29bb2..76c1a0d9d4 100644 --- a/test/test_automl/test_performance.py +++ b/test/test_automl/test_performance.py @@ -0,0 +1,47 @@ +"""Test the performance of automl instances after fitting""" + +import numpy as np +from sklearn.ensemble import VotingClassifier, VotingRegressor + +from autosklearn.automl import AutoML + +from pytest_cases import parametrize_with_cases + +import test.test_automl.cases as cases + + +@parametrize_with_cases("automl", cases.case_classifier_fitted_holdout_multiobjective) +def test_performance_with_multiobjective(automl: AutoML) -> None: + """ + Expects + ------- + * Auto-sklearn can predict/predict_proba and has a model + * Each ensemble in the pareto_set can predict/predict_proba + """ + # TODO: This test is hyperspecific to this one case + # + # Long term we probably want to return additional info about the case so we can + # test things for other than this case + + # Check that the predict function works + X = np.array([[1.0, 1.0, 1.0, 1.0]]) + + assert automl.predict_proba(X).shape == (1, 3) + assert automl.predict(X).shape == (1,) + + pareto_front = automl._load_pareto_set() + for ensemble in pareto_front: + + assert isinstance(ensemble, (VotingClassifier, VotingRegressor)) + + y_pred = ensemble.predict_proba(X) + assert y_pred.shape == (1, 3) + + y_pred = ensemble.predict(X) + assert y_pred in ["setosa", "versicolor", "virginica"] + + statistics = automl.sprint_statistics() + assert "Metrics" in statistics + assert ("Best validation score: 0.9" in statistics) or ( + "Best validation score: 1.0" in statistics + ), statistics diff --git a/test/test_automl/test_performance_over_time.py b/test/test_automl/test_performance_over_time.py index d5cc327a41..f38bdd7ee2 100644 --- a/test/test_automl/test_performance_over_time.py +++ b/test/test_automl/test_performance_over_time.py @@ -1,3 +1,4 @@ +"""Test the performance over time functionality of automl instances""" from autosklearn.automl import AutoML from pytest_cases import parametrize_with_cases diff --git a/test/test_automl/test_post_fit.py b/test/test_automl/test_post_fit.py index ccc5f25b9b..37fcd63eca 100644 --- a/test/test_automl/test_post_fit.py +++ b/test/test_automl/test_post_fit.py @@ -1,11 +1,20 @@ -import numpy as np -from sklearn.ensemble import VotingClassifier, VotingRegressor +"""Check the internal state of the automl instances after it has been fitted""" + +from pathlib import Path from autosklearn.automl import AutoML +from autosklearn.ensemble_building.builder import CANDIDATES_FILENAME +from pytest import mark from pytest_cases import parametrize_with_cases +from pytest_cases.filters import has_tag import test.test_automl.cases as cases +from test.conftest import DEFAULT_SEED + +# Some filters +has_ensemble = has_tag("fitted") & ~has_tag("no_ensemble") +no_ensemble = has_tag("fitted") & has_tag("no_ensemble") @parametrize_with_cases("automl", cases=cases, has_tag=["fitted", "holdout"]) @@ -52,7 +61,7 @@ def test_cv_loaded_models(automl: AutoML) -> None: assert set(automl.cv_models_.keys()) == set(ensemble_identifiers) -@parametrize_with_cases("automl", cases=cases, has_tag=["fitted", "no_ensemble"]) +@parametrize_with_cases("automl", cases=cases, has_tag=no_ensemble) def test_no_ensemble(automl: AutoML) -> None: """ Parameters @@ -71,36 +80,99 @@ def test_no_ensemble(automl: 
AutoML) -> None: assert len(automl.cv_models_) == 0 -@parametrize_with_cases("automl", cases, has_tag=["multiobjective"]) -def test__load_pareto_front(automl: AutoML) -> None: +@mark.todo +def test_datamanager_stored_contents() -> None: """ - Parameters - ---------- - automl : AutoML - An AutoML object fitted with multiple objective metrics + Expects + ------- + * TODO + """ + ... + + +@parametrize_with_cases("automl", cases=cases, filter=has_ensemble) +def test_paths_created(automl: AutoML) -> None: + """ + Expects + ------- + * The given paths should exist after the automl has been run and fitted + """ + assert automl._backend is not None + + partial = Path(automl._backend.internals_directory) + expected = [ + partial / fixture + for fixture in ( + "true_targets_ensemble.npy", + f"start_time_{DEFAULT_SEED}", + "datamanager.pkl", + "runs", + ) + ] + + for path in expected: + assert path.exists() + + +@parametrize_with_cases("automl", cases=cases, filter=has_ensemble) +def test_paths_created_with_ensemble(automl: AutoML) -> None: + """ + Expects + ------- + * The given paths for an automl with an ensemble should include paths + specific to ensemble building + """ + assert automl._backend is not None + + partial = Path(automl._backend.internals_directory) + expected = [ + partial / fixture + for fixture in ( + "ensembles", + "ensemble_history.json", + CANDIDATES_FILENAME, + ) + ] + + for path in expected: + assert path.exists() + +@parametrize_with_cases("automl", cases=cases, filter=has_ensemble) +def test_at_least_one_model_and_predictions(automl: AutoML) -> None: + """ Expects ------- - * Auto-sklearn can predict and has a model - * _load_pareto_front returns one scikit-learn ensemble - """ - # Check that the predict function works - X = np.array([[1.0, 1.0, 1.0, 1.0]]) - - assert automl.predict_proba(X).shape == (1, 3) - assert automl.predict(X).shape == (1,) - - pareto_front = automl._load_pareto_set() - assert len(pareto_front) == 1 - for ensemble in pareto_front: - assert isinstance(ensemble, (VotingClassifier, VotingRegressor)) - y_pred = ensemble.predict_proba(X) - assert y_pred.shape == (1, 3) - y_pred = ensemble.predict(X) - assert y_pred in ["setosa", "versicolor", "virginica"] - - statistics = automl.sprint_statistics() - assert "Metrics" in statistics - assert ("Best validation score: 0.9" in statistics) or ( - "Best validation score: 1.0" in statistics - ), statistics + * There should be at least one models saved + * Each model saved should have predictions for the ensemble + """ + assert automl._backend is not None + runs_dir = Path(automl._backend.get_runs_directory()) + + runs = list(runs_dir.iterdir()) + assert len(runs) > 0 + + at_least_one = False + for run in runs: + prediction_files = run.glob("predictions_ensemble*.npy") + model_files = run.glob("*.*.model") + + if any(prediction_files): + at_least_one = True + assert any(model_files), "Run produced prediction but no model" + + assert at_least_one, "No runs produced predictions" + + +@parametrize_with_cases("automl", cases=cases, filter=has_ensemble) +def test_at_least_one_ensemble(automl: AutoML) -> None: + """ + Expects + ------- + * There should be at least one ensemble generated + """ + assert automl._backend is not None + ens_dir = Path(automl._backend.get_ensemble_dir()) + + # TODO make more generic + assert len(list(ens_dir.glob("*.ensemble"))) > 0 diff --git a/test/test_automl/test_predict.py b/test/test_automl/test_predict.py index 137b57a5c3..4bad9859be 100644 --- a/test/test_automl/test_predict.py +++ 
b/test/test_automl/test_predict.py @@ -1 +1 @@ -"""TODO""" +"""Test predictions of an automl instance""" diff --git a/test/test_automl/test_refit.py b/test/test_automl/test_refit.py index 341486ab13..0f5b42fff0 100644 --- a/test/test_automl/test_refit.py +++ b/test/test_automl/test_refit.py @@ -1,3 +1,4 @@ +"""Test the refitting functionality of an automl instance""" from typing import Callable, Union from itertools import repeat diff --git a/test/test_automl/test_show_models.py b/test/test_automl/test_show_models.py index 72b2e4f8d6..93a4aac651 100644 --- a/test/test_automl/test_show_models.py +++ b/test/test_automl/test_show_models.py @@ -1,3 +1,4 @@ +"""Test the show models functinality of an automl instance""" from autosklearn.automl import AutoML from pytest_cases import parametrize_with_cases diff --git a/test/test_automl/test_sklearn_compliance.py b/test/test_automl/test_sklearn_compliance.py index ce747e1bb8..c96468e0e2 100644 --- a/test/test_automl/test_sklearn_compliance.py +++ b/test/test_automl/test_sklearn_compliance.py @@ -1,7 +1,9 @@ -""" +"""Test that autosklearn is sklearn compliant + Note ---- -This is far from complete at the moment +* This is far from complete at the moment +* This should probably be tested on AutoSklearnEstimators not AutoML """ from typing import List, Union diff --git a/test/test_ensemble_builder/test_ensemble_builder.py b/test/test_ensemble_builder/test_ensemble_builder.py index a46da42ef1..9a7927384d 100644 --- a/test/test_ensemble_builder/test_ensemble_builder.py +++ b/test/test_ensemble_builder/test_ensemble_builder.py @@ -10,14 +10,15 @@ from autosklearn.automl_common.common.utils.backend import Backend from autosklearn.ensemble_building import EnsembleBuilder, Run -from autosklearn.metrics import make_scorer +from autosklearn.metrics import Scorer, accuracy, make_scorer from autosklearn.util.functional import bound, pairs import pytest from pytest_cases import fixture, parametrize -from unittest.mock import patch +from unittest.mock import Mock, patch from test.conftest import DEFAULT_SEED +from test.fixtures.metrics import acc_with_X_data @fixture @@ -673,6 +674,27 @@ def test_delete_runs_does_not_delete_dummy( assert set(loaded.values()) == set(dummy_runs) +def test_fit_ensemble_with_no_targets_raises( + builder: EnsembleBuilder, + make_run: Callable[..., Run], +) -> None: + """ + Expects + ------- + * If no ensemble targets can be found then `fit_ensemble` should fail + """ + # Delete the targets and then try fit ensemble + targets_path = Path(builder.backend._get_targets_ensemble_filename()) + targets_path.unlink() + + candidates = [make_run(backend=builder.backend) for _ in range(5)] + with pytest.raises(ValueError, match="`fit_ensemble` could not find any .*"): + builder.fit_ensemble( + candidates=candidates, + runs=candidates, + ) + + def test_fit_ensemble_produces_ensemble( builder: EnsembleBuilder, make_run: Callable[..., Run], @@ -682,16 +704,13 @@ def test_fit_ensemble_produces_ensemble( ------- * Should produce an ensemble if all runs have predictions """ - X_data = builder.X_data("ensemble") targets = builder.targets("ensemble") assert targets is not None predictions = targets runs = [make_run(predictions={"ensemble": predictions}) for _ in range(10)] - ensemble = builder.fit_ensemble( - candidates=runs, X_data=X_data, targets=targets, runs=runs - ) + ensemble = builder.fit_ensemble(candidates=runs, runs=runs) assert ensemble is not None @@ -823,3 +842,160 @@ def test_deletion_will_not_break_current_ensemble( for run in new_runs: 
assert run in available_runs + + +@parametrize("metrics", [accuracy, acc_with_X_data, [accuracy, acc_with_X_data]]) +def test_will_build_ensemble_with_different_metrics( + make_ensemble_builder: Callable[..., EnsembleBuilder], + make_run: Callable[..., Run], + metrics: Scorer | list[Scorer], +) -> None: + """ + Expects + ------- + * Should be able to build a valid ensemble with different combinations of metrics + * Should produce a validation score for both "ensemble" and "test" scores + """ + if not isinstance(metrics, list): + metrics = [metrics] + + builder = make_ensemble_builder(metrics=metrics) + + # Make some runs and stick them in the same backend as the builder + # Dummy just has a terrible loss for all metrics + make_run( + dummy=True, + losses={m.name: 1000 for m in metrics}, + backend=builder.backend, + ) + + # "Proper" runs will have the correct targets and so be better than dummy + run_predictions = { + "ensemble": builder.targets("ensemble"), + "test": builder.targets("test"), + } + for _ in range(5): + make_run(predictions=run_predictions, backend=builder.backend) + + history, nbest = builder.main() + + # Should only produce one step + assert len(history) == 1 + hist = history[0] + + # Each of these two keys should be present + for key in ["ensemble_optimization_score", "ensemble_test_score"]: + assert key in hist + + # TODO should be updated in next PR + # Each of these scores should contain all the metrics + # for metric in metrics: + # assert metric.name in hist[key] + + +@parametrize("n_least_prioritized", [1, 2, 3, 4]) +@parametrize("metrics", [accuracy, acc_with_X_data, [accuracy, acc_with_X_data]]) +def test_fit_ensemble_kwargs_priorities( + make_ensemble_builder: Callable[..., EnsembleBuilder], + make_run: Callable[..., Run], + metrics: Scorer | list[Scorer], + n_least_prioritized: int, +) -> None: + """ + Expects + ------- + * Should favour 1) function kwargs, 2) function params 3) init_kwargs 4) init_params + """ + if not isinstance(metrics, list): + metrics = [metrics] + + class FakeEnsembleClass: + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + def fit(*args, **kwargs) -> None: + pass + + # We establish the priorty order and give each one of them a custom metric + priority = ["function_kwargs", "function_params", "init_kwargs", "init_params"] + + # We reverse the priority and use the `n_least_prioritized` ones + # with `n_least_prioritized = 3` + # reversed = ["init_params", "init_kwargs", "function_params", "function_kwargs"] + # used = ["init_params", "init_kwargs", "function_params"] + # highest = "function_params" + reversed_priority = list(reversed(priority)) + used = reversed_priority[:n_least_prioritized] + highest_priority = used[-1] + + def S(name: str) -> Scorer: + return make_scorer(name, lambda: None) + + # We now pass in all the places this arguments could be specified + # Naming them specifically to make it more clear in setup below + builder_metric = [S("init_params")] if "init_params" in used else None + fit_ensemble_metric = [S("function_params")] if "function_params" in used else None + + builder_ensemble_kwargs = ( + {"metrics": [S("init_kwargs")]} if "init_kwargs" in used else None + ) + fit_ensemble_kwargs = ( + {"metrics": [S("function_kwargs")]} if "function_kwargs" in used else None + ) + + builder = make_ensemble_builder( + metrics=builder_metric, + ensemble_kwargs=builder_ensemble_kwargs, + ) + + candidates = [make_run() for _ in range(5)] # Just so something can be run + + ensemble = 
builder.fit_ensemble( + metrics=fit_ensemble_metric, + ensemble_class=FakeEnsembleClass, + ensemble_kwargs=fit_ensemble_kwargs, + candidates=candidates, + runs=candidates, + ) + + # These are the final metrics passed to the ensemble builder when constructed + passed_metrics = ensemble.kwargs["metrics"] + metric = passed_metrics[0] + + assert metric.name == highest_priority + + +@parametrize("metric, should_be_loaded", [(accuracy, False), (acc_with_X_data, True)]) +def test_X_data_only_loaded_when_required( + make_ensemble_builder: Callable[..., EnsembleBuilder], + make_run: Callable[..., Run], + metric: Scorer, + should_be_loaded: bool, +) -> None: + """ + Expects + ------- + * Should only load X_train if it's required + * TODO should only load X_test if it's required + """ + metrics = [metric] + builder = make_ensemble_builder(metrics=metrics) + + # Make a dummy which is required for the whole pipeline to run + make_run(dummy=True, losses={metric.name: 1000}, backend=builder.backend) + + # Make a run that has no losses recorded, forcing us to use the metric + make_run( + dummy=False, + predictions={"ensemble": builder.targets("ensemble")}, + losses=None, + backend=builder.backend, + ) + + ret_value = builder.X_data() + builder.X_data = Mock(return_value=ret_value) + + builder.main() + + assert builder.X_data.called == should_be_loaded diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index 62623a50ba..38040f2e4e 100644 --- a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -133,27 +133,14 @@ def get_multiclass_classification_datamanager(): np.random.shuffle(indices) X_train = X_train[indices] Y_train = Y_train[indices] - - X_valid = X_test[ - :25, - ] - Y_valid = Y_test[ - :25, - ] - X_test = X_test[ - 25:, - ] - Y_test = Y_test[ - 25:, - ] + X_test = X_test[25:] + Y_test = Y_test[25:] D = Dummy() D.info = {"task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 3} D.data = { "X_train": X_train, "Y_train": Y_train, - "X_valid": X_valid, - "Y_valid": Y_valid, "X_test": X_test, "Y_test": Y_test, } @@ -196,34 +183,16 @@ def get_multilabel_classification_datamanager(): Y_train = Y_train[indices] Y_train = np.array(convert_to_bin(Y_train, 3)) - # for i in range(Y_train_.shape[0]): - # Y_train_[:, Y_train[i]] = 1 - # Y_train = Y_train_ Y_test = np.array(convert_to_bin(Y_test, 3)) - # for i in range(Y_test_.shape[0]): - # Y_test_[:, Y_test[i]] = 1 - # Y_test = Y_test_ - X_valid = X_test[ - :25, - ] - Y_valid = Y_test[ - :25, - ] - X_test = X_test[ - 25:, - ] - Y_test = Y_test[ - 25:, - ] + X_test = X_test[25:] + Y_test = Y_test[25:] D = Dummy() D.info = {"task": MULTILABEL_CLASSIFICATION, "is_sparse": False, "label_num": 3} D.data = { "X_train": X_train, "Y_train": Y_train, - "X_valid": X_valid, - "Y_valid": Y_valid, "X_test": X_test, "Y_test": Y_test, } @@ -247,26 +216,14 @@ def get_binary_classification_datamanager(): X_test = X_test[eliminate_class_two] Y_test = Y_test[eliminate_class_two] - X_valid = X_test[ - :25, - ] - Y_valid = Y_test[ - :25, - ] - X_test = X_test[ - 25:, - ] - Y_test = Y_test[ - 25:, - ] + X_test = X_test[25:] + Y_test = Y_test[25:] D = Dummy() D.info = {"task": BINARY_CLASSIFICATION, "is_sparse": False, "label_num": 2} D.data = { "X_train": X_train, "Y_train": Y_train.reshape((-1, 1)), - "X_valid": X_valid, - "Y_valid": Y_valid.reshape((-1, 1)), "X_test": X_test, "Y_test": Y_test.reshape((-1, 1)), } @@ -282,26 +239,14 @@ def get_regression_datamanager(): X_train = 
X_train[indices] Y_train = Y_train[indices] - X_valid = X_test[ - :200, - ] - Y_valid = Y_test[ - :200, - ] - X_test = X_test[ - 200:, - ] - Y_test = Y_test[ - 200:, - ] + X_test = X_test[200:] + Y_test = Y_test[200:] D = Dummy() D.info = {"task": REGRESSION, "is_sparse": False, "label_num": 1} D.data = { "X_train": X_train, "Y_train": Y_train.reshape((-1, 1)), - "X_valid": X_valid, - "Y_valid": Y_valid.reshape((-1, 1)), "X_test": X_test, "Y_test": Y_test.reshape((-1, 1)), } @@ -334,8 +279,6 @@ def get_500_classes_datamanager(): D.data = { "X_train": X[:700], "Y_train": Y[:700], - "X_valid": X[700:710], - "Y_valid": Y[700:710], "X_test": X[710:], "Y_test": Y[710:], } diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index 7bd52c0f76..e2473d738b 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -71,7 +71,6 @@ def test_finish_up_model_predicts_NaN(self): ae.Y_optimization = rs.rand(33, 3) predictions_ensemble = rs.rand(33, 3) predictions_test = rs.rand(25, 3) - predictions_valid = rs.rand(25, 3) # NaNs in prediction ensemble predictions_ensemble[5, 2] = np.NaN @@ -79,7 +78,6 @@ def test_finish_up_model_predicts_NaN(self): loss=0.1, train_loss=0.1, opt_pred=predictions_ensemble, - valid_pred=predictions_valid, test_pred=predictions_test, additional_run_info=None, final_call=True, @@ -89,37 +87,15 @@ def test_finish_up_model_predicts_NaN(self): self.assertEqual(loss, 1.0) self.assertEqual( additional_run_info, - {"error": "Model predictions for optimization set " "contains NaNs."}, + {"error": "Model predictions for optimization set contains NaNs."}, ) - # NaNs in prediction validation - predictions_ensemble[5, 2] = 0.5 - predictions_valid[5, 2] = np.NaN - _, loss, _, additional_run_info = ae.finish_up( - loss=0.1, - train_loss=0.1, - opt_pred=predictions_ensemble, - valid_pred=predictions_valid, - test_pred=predictions_test, - additional_run_info=None, - final_call=True, - file_output=True, - status=StatusType.SUCCESS, - ) - self.assertEqual(loss, 1.0) - self.assertEqual( - additional_run_info, - {"error": "Model predictions for validation set " "contains NaNs."}, - ) - - # NaNs in prediction test - predictions_valid[5, 2] = 0.5 + predictions_ensemble = rs.rand(33, 3) predictions_test[5, 2] = np.NaN _, loss, _, additional_run_info = ae.finish_up( loss=0.1, train_loss=0.1, opt_pred=predictions_ensemble, - valid_pred=predictions_valid, test_pred=predictions_test, additional_run_info=None, final_call=True, @@ -129,9 +105,8 @@ def test_finish_up_model_predicts_NaN(self): self.assertEqual(loss, 1.0) self.assertEqual( additional_run_info, - {"error": "Model predictions for test set contains " "NaNs."}, + {"error": "Model predictions for test set contains NaNs."}, ) - self.assertEqual(self.backend_mock.save_predictions_as_npy.call_count, 0) def test_disable_file_output(self): @@ -150,11 +125,9 @@ def test_disable_file_output(self): predictions_ensemble = rs.rand(33, 3) predictions_test = rs.rand(25, 3) - predictions_valid = rs.rand(25, 3) loss_, additional_run_info_ = ae.file_output( predictions_ensemble, - predictions_valid, predictions_test, ) @@ -179,7 +152,6 @@ def test_disable_file_output(self): loss_, additional_run_info_ = ae.file_output( predictions_ensemble, - predictions_valid, predictions_test, ) @@ -211,11 +183,6 @@ def test_disable_file_output(self): "ensemble_predictions" ] ) - self.assertIsNotNone( - 
self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - "valid_predictions" - ] - ) self.assertIsNotNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ "test_predictions" @@ -237,7 +204,6 @@ def test_disable_file_output(self): loss_, additional_run_info_ = ae.file_output( predictions_ensemble, - predictions_valid, predictions_test, ) @@ -249,11 +215,6 @@ def test_disable_file_output(self): "ensemble_predictions" ] ) - self.assertIsNotNone( - self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ - "valid_predictions" - ] - ) self.assertIsNotNone( self.backend_mock.save_numrun_to_dir.call_args_list[-1][1][ "test_predictions" @@ -296,11 +257,9 @@ def test_file_output(self): ae.Y_optimization = rs.rand(33, 3) predictions_ensemble = rs.rand(33, 3) predictions_test = rs.rand(25, 3) - predictions_valid = rs.rand(25, 3) ae.file_output( Y_optimization_pred=predictions_ensemble, - Y_valid_pred=predictions_valid, Y_test_pred=predictions_test, ) diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index 457661df03..02eedcca91 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -80,10 +80,10 @@ def test_datasets(self): ) evaluator.fit_predict_and_loss() - rval = read_queue(evaluator.queue) - self.assertEqual(len(rval), 1) - self.assertEqual(len(rval[0]), 3) - self.assertTrue(np.isfinite(rval[0]["loss"])) + return_value = read_queue(evaluator.queue) + self.assertEqual(len(return_value), 1) + self.assertEqual(len(return_value[0]), 3) + self.assertTrue(np.isfinite(return_value[0]["loss"])) class FunctionsTest(unittest.TestCase): @@ -124,11 +124,11 @@ def test_eval_test(self): port=self.port, additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 1) - self.assertAlmostEqual(rval[0]["loss"], 0.07999999999999996) - self.assertEqual(rval[0]["status"], StatusType.SUCCESS) - self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 1) + self.assertAlmostEqual(return_value[0]["loss"], 0.07999999999999996) + self.assertEqual(return_value[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", return_value[0]["additional_run_info"]) def test_eval_test_multi_objective(self): metrics = { @@ -151,12 +151,12 @@ def test_eval_test_multi_objective(self): port=self.port, additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 1) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 1) for metric, loss in metrics.items(): - self.assertAlmostEqual(rval[0]["loss"][metric.name], loss) - self.assertEqual(rval[0]["status"], StatusType.SUCCESS) - self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) + self.assertAlmostEqual(return_value[0]["loss"][metric.name], loss) + self.assertEqual(return_value[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", return_value[0]["additional_run_info"]) def test_eval_test_all_loss_functions(self): eval_t( @@ -175,8 +175,8 @@ def test_eval_test_all_loss_functions(self): port=self.port, additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 1) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 1) # Note: All metric here should be minimized fixture = { @@ -195,7 +195,7 @@ def test_eval_test_all_loss_functions(self): "num_run": -1, } - additional_run_info = 
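The evaluator tests above collect results by draining a queue with the `read_queue` test helper. A sketch of what such a helper looks like is below, assuming each queue item is a dict carrying at least `loss` and `status`; the real helper lives in the test utilities and may differ in details.

```python
# Drain a result queue until it is empty, returning the items in order.
import queue
from typing import Any, Dict, List


def read_queue(queue_: "queue.Queue") -> List[Dict[str, Any]]:
    stack: List[Dict[str, Any]] = []
    while True:
        try:
            return_value = queue_.get(timeout=1)
        except queue.Empty:
            break
        stack.append(return_value)
    return stack


if __name__ == "__main__":
    q: "queue.Queue" = queue.Queue()
    q.put({"loss": 0.08, "status": "SUCCESS", "additional_run_info": {}})
    results = read_queue(q)
    assert len(results) == 1 and results[0]["loss"] == 0.08
```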
rval[0]["additional_run_info"] + additional_run_info = return_value[0]["additional_run_info"] for key, value in fixture.items(): self.assertAlmostEqual(additional_run_info[key], fixture[key], msg=key) self.assertEqual( @@ -204,5 +204,5 @@ def test_eval_test_all_loss_functions(self): msg=sorted(additional_run_info.items()), ) self.assertIn("duration", additional_run_info) - self.assertAlmostEqual(rval[0]["loss"], 0.040000000000000036) - self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + self.assertAlmostEqual(return_value[0]["loss"], 0.040000000000000036) + self.assertEqual(return_value[0]["status"], StatusType.SUCCESS) diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 9413af5509..c8fe1c5f87 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -7,6 +7,7 @@ import shutil import sys import tempfile +from itertools import chain import numpy as np import sklearn.model_selection @@ -68,6 +69,24 @@ ) +class LossSideEffect(object): + """Some kind of re-used fixture for losses calculated""" + + def __init__(self): + # The 3 below is related to train, test, opt sets + self.losses = [ + {"accuracy": value} + for value in chain.from_iterable( + [i] * 3 for i in [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] + ) + ] + self.iteration = 0 + + def side_effect(self, *args, **kwargs): + self.iteration += 1 + return self.losses[self.iteration - 1] + + class Dummy(object): def __init__(self): self.name = "dummy" @@ -150,24 +169,23 @@ def test_holdout(self, pipeline_mock): evaluator.fit_predict_and_loss() - rval = read_queue(evaluator.queue) - self.assertEqual(len(rval), 1) - result = rval[0]["loss"] - self.assertEqual(len(rval[0]), 3) + return_value = read_queue(evaluator.queue) + self.assertEqual(len(return_value), 1) + result = return_value[0]["loss"] + self.assertEqual(len(return_value[0]), 3) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(result, 0.45833333333333337) self.assertEqual(pipeline_mock.fit.call_count, 1) - # four calls because of train, holdout, validation and test set - self.assertEqual(pipeline_mock.predict_proba.call_count, 4) + # four calls because of train, holdout and test set + self.assertEqual(pipeline_mock.predict_proba.call_count, 3) self.assertEqual(evaluator.file_output.call_count, 1) + self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 24) + self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) self.assertEqual(evaluator.model.fit.call_count, 1) @@ -240,46 +258,12 @@ def configuration_fully_fitted(self): class LossSideEffect(object): def __init__(self): + # The 3 below is related to train, test, opt sets self.losses = [ {"accuracy": value} - for value in [ - 1.0, - 1.0, - 1.0, - 1.0, - 0.9, - 0.9, - 0.9, - 0.9, - 0.8, - 0.8, - 0.8, - 0.8, - 0.7, - 0.7, - 0.7, - 0.7, - 0.6, - 0.6, - 0.6, - 0.6, - 0.5, - 0.5, - 0.5, - 0.5, - 0.4, - 0.4, - 0.4, - 0.4, - 0.3, - 0.3, - 0.3, - 0.3, - 0.2, - 0.2, - 0.2, - 0.2, - ] + for value in chain.from_iterable( + [i] * 3 for i in [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] + ) ] self.iteration = 0 @@ -294,15 +278,15 @@ def side_effect(self, *args, **kwargs): self.assertEqual(evaluator.file_output.call_count, 
9) for i in range(1, 10): - rval = evaluator.queue.get(timeout=1) - result = rval["loss"] + return_value = evaluator.queue.get(timeout=1) + result = return_value["loss"] self.assertAlmostEqual(result, 1.0 - (0.1 * (i - 1))) if i < 9: - self.assertEqual(rval["status"], StatusType.DONOTADVANCE) - self.assertEqual(len(rval), 3) + self.assertEqual(return_value["status"], StatusType.DONOTADVANCE) + self.assertEqual(len(return_value), 3) else: - self.assertEqual(rval["status"], StatusType.SUCCESS) - self.assertEqual(len(rval), 4) + self.assertEqual(return_value["status"], StatusType.SUCCESS) + self.assertEqual(len(return_value), 4) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 9) @@ -310,16 +294,12 @@ def side_effect(self, *args, **kwargs): [cal[1]["n_iter"] for cal in pipeline_mock.iterative_fit.call_args_list], [2, 2, 4, 8, 16, 32, 64, 128, 256], ) - # 20 calls because of train, holdout, validation and test set - # and a total of five calls because of five iterations of fitting - self.assertEqual(evaluator.model.predict_proba.call_count, 36) - # 1/3 of 69 + + # 9 per split type + self.assertEqual(evaluator.model.predict_proba.call_count, 27) self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23) self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) self.assertEqual(evaluator.file_output.call_count, 9) self.assertEqual(evaluator.model.fit.call_count, 0) @@ -438,20 +418,19 @@ def side_effect(self, *args, **kwargs): self.assertEqual(evaluator.file_output.call_count, 2) for i in range(1, 3): - rval = evaluator.queue.get(timeout=1) - self.assertAlmostEqual(rval["loss"], 1.0 - (0.2 * i)) + return_value = evaluator.queue.get(timeout=1) + self.assertAlmostEqual(return_value["loss"], 1.0 - (0.2 * i)) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 2) - # eight calls because of train, holdout, the validation and the test set + + # 6 calls because of train, holdout and test set # and a total of two calls each because of two iterations of fitting - self.assertEqual(evaluator.model.predict_proba.call_count, 8) + self.assertEqual(evaluator.model.predict_proba.call_count, 6) + self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23) self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) self.assertEqual(evaluator.file_output.call_count, 2) self.assertEqual(evaluator.model.fit.call_count, 0) @@ -499,19 +478,18 @@ def test_iterative_holdout_not_iterative(self, pipeline_mock): evaluator.fit_predict_and_loss(iterative=True) self.assertEqual(evaluator.file_output.call_count, 1) - rval = evaluator.queue.get(timeout=1) - self.assertAlmostEqual(rval["loss"], 0.47826086956521741) + return_value = evaluator.queue.get(timeout=1) + self.assertAlmostEqual(return_value["loss"], 0.47826086956521741) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 0) - # four calls for train, opt, valid and test - self.assertEqual(evaluator.model.predict_proba.call_count, 
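The `n_iter` values asserted above (`[2, 2, 4, 8, 16, 32, 64, 128, 256]`) are consistent with per-call increments of a cumulative iteration budget that doubles each step (2, 4, 8, ..., 512). A small sketch of that relationship, under that assumption:

```python
# Derive the per-call increments from a doubling cumulative budget.
from typing import List


def iteration_increments(n_steps: int, start: int = 2) -> List[int]:
    totals = [start * 2**i for i in range(n_steps)]  # 2, 4, 8, ..., 512
    return [totals[0]] + [totals[i] - totals[i - 1] for i in range(1, n_steps)]


assert iteration_increments(9) == [2, 2, 4, 8, 16, 32, 64, 128, 256]
```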
4) + + # 3 calls for train, opt and test + self.assertEqual(evaluator.model.predict_proba.call_count, 3) + self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23) self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(evaluator.model.fit.call_count, 1) @@ -554,26 +532,23 @@ def test_cv(self, pipeline_mock): evaluator.fit_predict_and_loss() - rval = read_queue(evaluator.queue) - self.assertEqual(len(rval), 1) - result = rval[0]["loss"] - self.assertEqual(len(rval[0]), 3) + return_value = read_queue(evaluator.queue) + self.assertEqual(len(return_value), 1) + result = return_value[0]["loss"] + self.assertEqual(len(return_value[0]), 3) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(result, 0.463768115942029) self.assertEqual(pipeline_mock.fit.call_count, 5) - # Fifteen calls because of the training, holdout, validation and - # test set (4 sets x 5 folds = 20) - self.assertEqual(pipeline_mock.predict_proba.call_count, 20) + + # 15 calls because of the training (5), holdout (5) and test set (5) + self.assertEqual(pipeline_mock.predict_proba.call_count, 15) self.assertEqual( evaluator.file_output.call_args[0][0].shape[0], D.data["Y_train"].shape[0] ) self.assertEqual( - evaluator.file_output.call_args[0][1].shape[0], D.data["Y_valid"].shape[0] - ) - self.assertEqual( - evaluator.file_output.call_args[0][2].shape[0], D.data["Y_test"].shape[0] + evaluator.file_output.call_args[0][1].shape[0], D.data["Y_test"].shape[0] ) # The model prior to fitting is saved, this cannot be directly tested # because of the way the mock module is used. Instead, we test whether @@ -623,13 +598,13 @@ def test_partial_cv(self, pipeline_mock): evaluator.partial_fit_predict_and_loss(fold=1) - rval = evaluator.queue.get(timeout=1) + return_value = evaluator.queue.get(timeout=1) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 0) - self.assertEqual(rval["loss"], 0.5) + self.assertEqual(return_value["loss"], 0.5) self.assertEqual(pipeline_mock.fit.call_count, 1) - self.assertEqual(pipeline_mock.predict_proba.call_count, 4) + self.assertEqual(pipeline_mock.predict_proba.call_count, 3) # The model prior to fitting is saved, this cannot be directly tested # because of the way the mock module is used. 
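The CV call counts asserted above follow from one `predict_proba` call per prediction target (train, holdout/opt, test) per fold, now that the validation set is gone. A toy illustration with a plain `Mock`, not the real evaluator:

```python
# 5 folds x 3 prediction sets = 15 predict_proba calls.
from unittest.mock import Mock

pipeline = Mock()
n_folds, prediction_sets = 5, 3  # train, opt and test; validation removed

for _ in range(n_folds):
    for _ in range(prediction_sets):
        pipeline.predict_proba([[0.0, 1.0]])

assert pipeline.predict_proba.call_count == n_folds * prediction_sets == 15
```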
Instead, we test whether # the if block in which model assignment is done is accessed @@ -703,55 +678,6 @@ def configuration_fully_fitted(self): evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) - class LossSideEffect(object): - def __init__(self): - self.losses = [ - {"accuracy": value} - for value in [ - 1.0, - 1.0, - 1.0, - 1.0, - 0.9, - 0.9, - 0.9, - 0.9, - 0.8, - 0.8, - 0.8, - 0.8, - 0.7, - 0.7, - 0.7, - 0.7, - 0.6, - 0.6, - 0.6, - 0.6, - 0.5, - 0.5, - 0.5, - 0.5, - 0.4, - 0.4, - 0.4, - 0.4, - 0.3, - 0.3, - 0.3, - 0.3, - 0.2, - 0.2, - 0.2, - 0.2, - ] - ] - self.iteration = 0 - - def side_effect(self, *args, **kwargs): - self.iteration += 1 - return self.losses[self.iteration - 1] - evaluator._loss = unittest.mock.Mock() evaluator._loss.side_effect = LossSideEffect().side_effect @@ -760,12 +686,12 @@ def side_effect(self, *args, **kwargs): self.assertEqual(evaluator.file_output.call_count, 0) for i in range(1, 10): - rval = evaluator.queue.get(timeout=1) - self.assertAlmostEqual(rval["loss"], 1.0 - (0.1 * (i - 1))) + return_value = evaluator.queue.get(timeout=1) + self.assertAlmostEqual(return_value["loss"], 1.0 - (0.1 * (i - 1))) if i < 9: - self.assertEqual(rval["status"], StatusType.DONOTADVANCE) + self.assertEqual(return_value["status"], StatusType.DONOTADVANCE) else: - self.assertEqual(rval["status"], StatusType.SUCCESS) + self.assertEqual(return_value["status"], StatusType.SUCCESS) self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(pipeline_mock.iterative_fit.call_count, 9) @@ -773,13 +699,9 @@ def side_effect(self, *args, **kwargs): [cal[1]["n_iter"] for cal in pipeline_mock.iterative_fit.call_args_list], [2, 2, 4, 8, 16, 32, 64, 128, 256], ) - # fifteen calls because of the holdout, the validation and the test set - # and a total of five calls because of five iterations of fitting self.assertTrue(hasattr(evaluator, "model")) self.assertEqual(pipeline_mock.iterative_fit.call_count, 9) - # 20 calls because of train, holdout, the validation and the test set - # and a total of five calls because of five iterations of fitting - self.assertEqual(pipeline_mock.predict_proba.call_count, 36) + self.assertEqual(pipeline_mock.predict_proba.call_count, 27) @unittest.mock.patch.object(TrainEvaluator, "_loss") @unittest.mock.patch.object(TrainEvaluator, "_get_model") @@ -809,13 +731,9 @@ def test_file_output(self, loss_mock, model_mock): self.backend_mock.get_model_dir.return_value = True evaluator.model = "model" evaluator.Y_optimization = D.data["Y_train"] - rval = evaluator.file_output( - D.data["Y_train"], - D.data["Y_valid"], - D.data["Y_test"], - ) + return_value = evaluator.file_output(D.data["Y_train"], D.data["Y_test"]) - self.assertEqual(rval, (None, {})) + self.assertEqual(return_value, (None, {})) self.assertEqual(self.backend_mock.save_additional_data.call_count, 2) self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 1) self.assertEqual( @@ -826,8 +744,8 @@ def test_file_output(self, loss_mock, model_mock): "budget", "model", "cv_model", + "valid_predictions", # TODO remove once backend updated "ensemble_predictions", - "valid_predictions", "test_predictions", }, ) @@ -839,12 +757,8 @@ def test_file_output(self, loss_mock, model_mock): ) evaluator.models = ["model2", "model2"] - rval = evaluator.file_output( - D.data["Y_train"], - D.data["Y_valid"], - D.data["Y_test"], - ) - self.assertEqual(rval, (None, {})) + return_value = evaluator.file_output(D.data["Y_train"], 
D.data["Y_test"]) + self.assertEqual(return_value, (None, {})) self.assertEqual(self.backend_mock.save_additional_data.call_count, 4) self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 2) self.assertEqual( @@ -855,8 +769,8 @@ def test_file_output(self, loss_mock, model_mock): "budget", "model", "cv_model", + "valid_predictions", # TODO remove once backend updated "ensemble_predictions", - "valid_predictions", "test_predictions", }, ) @@ -867,29 +781,10 @@ def test_file_output(self, loss_mock, model_mock): self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]["cv_model"] ) - # Check for not containing NaNs - that the models don't predict nonsense - # for unseen data - D.data["Y_valid"][0] = np.NaN - rval = evaluator.file_output( - D.data["Y_train"], - D.data["Y_valid"], - D.data["Y_test"], - ) - self.assertEqual( - rval, - ( - 1.0, - {"error": "Model predictions for validation set contains NaNs."}, - ), - ) D.data["Y_train"][0] = np.NaN - rval = evaluator.file_output( - D.data["Y_train"], - D.data["Y_valid"], - D.data["Y_test"], - ) + return_value = evaluator.file_output(D.data["Y_train"], D.data["Y_test"]) self.assertEqual( - rval, + return_value, ( 1.0, {"error": "Model predictions for optimization set contains NaNs."}, @@ -1086,7 +981,6 @@ def test_fit_predict_and_loss_standard_additional_run_info( _partial_fit_and_predict_mock.return_value = ( np.array([[0.1, 0.9]] * 46), np.array([[0.1, 0.9]] * 23), - np.array([[0.1, 0.9]] * 25), np.array([[0.1, 0.9]] * 6), {"a": 5}, ) @@ -1112,8 +1006,8 @@ def test_fit_predict_and_loss_standard_additional_run_info( evaluator.X_targets[0] = np.array([1, 0] * 23) evaluator.Y_targets[0] = np.array([1] * 23) evaluator.Y_train_targets = np.array([1] * 69) - rval = evaluator.fit_predict_and_loss(iterative=False) - self.assertIsNone(rval) + return_value = evaluator.fit_predict_and_loss(iterative=False) + self.assertIsNone(return_value) element = queue_.get() self.assertEqual(element["status"], StatusType.SUCCESS) self.assertEqual(element["additional_run_info"]["a"], 5) @@ -1129,7 +1023,6 @@ def __call__(self, *args, **kwargs): return ( np.array([[0.1, 0.9]] * 34), np.array([[0.1, 0.9]] * 35), - np.array([[0.1, 0.9]] * 25), np.array([[0.1, 0.9]] * 6), {"a": 5}, ) @@ -1137,7 +1030,6 @@ def __call__(self, *args, **kwargs): return ( np.array([[0.1, 0.9]] * 34), np.array([[0.1, 0.9]] * 34), - np.array([[0.1, 0.9]] * 25), np.array([[0.1, 0.9]] * 6), {"a": 5}, ) @@ -1219,8 +1111,8 @@ def __call__(self): evaluator.file_output.return_value = (None, {}) evaluator.Y_targets[0] = np.array([1] * 23).reshape((-1, 1)) evaluator.Y_train_targets = np.array([1] * 69).reshape((-1, 1)) - rval = evaluator.fit_predict_and_loss(iterative=True) - self.assertIsNone(rval) + return_value = evaluator.fit_predict_and_loss(iterative=True) + self.assertIsNone(return_value) self.assertEqual(finish_up_mock.call_count, 1) self.assertEqual(finish_up_mock.call_args[1]["additional_run_info"], 14678) @@ -1265,8 +1157,8 @@ def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info( evaluator.Y_targets[0] = np.array([1] * 23).reshape((-1, 1)) evaluator.Y_train_targets = np.array([1] * 69).reshape((-1, 1)) - rval = evaluator.fit_predict_and_loss(iterative=True) - self.assertIsNone(rval) + return_value = evaluator.fit_predict_and_loss(iterative=True) + self.assertIsNone(return_value) self.assertEqual(finish_up_mock.call_count, 1) self.assertEqual(finish_up_mock.call_args[1]["additional_run_info"], 14678) @@ -1326,8 +1218,8 @@ def __call__(self): 
evaluator.Y_targets[0] = np.array([1] * 23).reshape((-1, 1)) evaluator.Y_train_targets = np.array([1] * 69).reshape((-1, 1)) - rval = evaluator.fit_predict_and_loss(iterative=False) - self.assertIsNone(rval) + return_value = evaluator.fit_predict_and_loss(iterative=False) + self.assertIsNone(return_value) self.assertEqual(finish_up_mock.call_count, 1) self.assertEqual( finish_up_mock.call_args[1]["additional_run_info"], {"val": 14678} @@ -1373,8 +1265,8 @@ def test_fit_predict_and_loss_budget_2_additional_run_info( evaluator.Y_targets[0] = np.array([1] * 23).reshape((-1, 1)) evaluator.Y_train_targets = np.array([1] * 69).reshape((-1, 1)) - rval = evaluator.fit_predict_and_loss(iterative=False) - self.assertIsNone(rval) + return_value = evaluator.fit_predict_and_loss(iterative=False) + self.assertIsNone(return_value) self.assertEqual(finish_up_mock.call_count, 1) self.assertEqual( finish_up_mock.call_args[1]["additional_run_info"], {"val": 14678} @@ -1422,8 +1314,8 @@ def test_datasets(self): ) evaluator.fit_predict_and_loss() - rval = evaluator.queue.get(timeout=1) - self.assertTrue(np.isfinite(rval["loss"])) + return_value = evaluator.queue.get(timeout=1) + self.assertTrue(np.isfinite(return_value["loss"])) ############################################################################ # Test obtaining a splitter object from scikit-learn @@ -3053,8 +2945,8 @@ def test_eval_holdout_all_loss_functions(self): metrics=[accuracy], additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 1) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 1) fixture = { "accuracy": 0.030303030303030276, @@ -3070,12 +2962,11 @@ def test_eval_holdout_all_loss_functions(self): "recall_micro": 0.030303030303030276, "recall_weighted": 0.030303030303030276, "num_run": 1, - "validation_loss": 0.0, "test_loss": 0.04, "train_loss": 0.0, } - additional_run_info = rval[0]["additional_run_info"] + additional_run_info = return_value[0]["additional_run_info"] for key, value in fixture.items(): self.assertAlmostEqual(additional_run_info[key], fixture[key], msg=key) self.assertIn("duration", additional_run_info) @@ -3085,8 +2976,8 @@ def test_eval_holdout_all_loss_functions(self): msg=sorted(additional_run_info.items()), ) - self.assertAlmostEqual(rval[0]["loss"], 0.030303030303030276, places=3) - self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + self.assertAlmostEqual(return_value[0]["loss"], 0.030303030303030276, places=3) + self.assertEqual(return_value[0]["status"], StatusType.SUCCESS) def test_eval_holdout_iterative_fit_no_timeout(self): eval_iterative_holdout( @@ -3107,11 +2998,11 @@ def test_eval_holdout_iterative_fit_no_timeout(self): metrics=[accuracy], additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 9) - self.assertAlmostEqual(rval[-1]["loss"], 0.030303030303030276) - self.assertEqual(rval[0]["status"], StatusType.DONOTADVANCE) - self.assertEqual(rval[-1]["status"], StatusType.SUCCESS) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 9) + self.assertAlmostEqual(return_value[-1]["loss"], 0.030303030303030276) + self.assertEqual(return_value[0]["status"], StatusType.DONOTADVANCE) + self.assertEqual(return_value[-1]["status"], StatusType.SUCCESS) def test_eval_holdout_iterative_fit_no_timeout_multi_objective(self): metrics = { @@ -3136,12 +3027,12 @@ def test_eval_holdout_iterative_fit_no_timeout_multi_objective(self): metrics=list(metrics.keys()), 
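In the multi-objective variants above, the `loss` entry of each queue item is a dict keyed by metric name rather than a single float, and the tests iterate over the expected metric/loss pairs. A toy illustration of consuming both shapes (the numbers here are made up):

```python
# Single- vs multi-objective result payloads as seen by the tests.
single_objective = {"loss": 0.0303, "status": "SUCCESS"}
multi_objective = {
    "loss": {"accuracy": 0.0303, "balanced_accuracy": 0.0277},
    "status": "SUCCESS",
}

assert isinstance(single_objective["loss"], float)
for metric_name, loss in multi_objective["loss"].items():
    assert 0.0 <= loss <= 1.0, metric_name
```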
additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 9) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 9) for metric, loss in metrics.items(): - self.assertAlmostEqual(rval[-1]["loss"][metric.name], loss) - self.assertEqual(rval[0]["status"], StatusType.DONOTADVANCE) - self.assertEqual(rval[-1]["status"], StatusType.SUCCESS) + self.assertAlmostEqual(return_value[-1]["loss"][metric.name], loss) + self.assertEqual(return_value[0]["status"], StatusType.DONOTADVANCE) + self.assertEqual(return_value[-1]["status"], StatusType.SUCCESS) def test_eval_holdout_budget_iterations(self): eval_holdout( @@ -3405,11 +3296,11 @@ def test_eval_cv(self): metrics=[accuracy], additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 1) - self.assertAlmostEqual(rval[0]["loss"], 0.04999999999999997) - self.assertEqual(rval[0]["status"], StatusType.SUCCESS) - self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 1) + self.assertAlmostEqual(return_value[0]["loss"], 0.04999999999999997) + self.assertEqual(return_value[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", return_value[0]["additional_run_info"]) def test_eval_cv_all_loss_functions(self): eval_cv( @@ -3430,8 +3321,8 @@ def test_eval_cv_all_loss_functions(self): metrics=[accuracy], additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 1) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 1) fixture = { "accuracy": 0.04999999999999997, @@ -3447,12 +3338,11 @@ def test_eval_cv_all_loss_functions(self): "recall_micro": 0.04999999999999997, "recall_weighted": 0.04999999999999997, "num_run": 1, - "validation_loss": 0.04, "test_loss": 0.04, "train_loss": 0.0, } - additional_run_info = rval[0]["additional_run_info"] + additional_run_info = return_value[0]["additional_run_info"] for key, value in fixture.items(): self.assertAlmostEqual(additional_run_info[key], fixture[key], msg=key) self.assertIn("duration", additional_run_info) @@ -3462,8 +3352,8 @@ def test_eval_cv_all_loss_functions(self): msg=sorted(additional_run_info.items()), ) - self.assertAlmostEqual(rval[0]["loss"], 0.04999999999999997) - self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + self.assertAlmostEqual(return_value[0]["loss"], 0.04999999999999997) + self.assertEqual(return_value[0]["status"], StatusType.SUCCESS) # def test_eval_cv_on_subset(self): # backend_api = backend.create(self.tmp_dir, self.tmp_dir) @@ -3504,10 +3394,10 @@ def test_eval_partial_cv(self): metrics=[accuracy], additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 1) - self.assertAlmostEqual(rval[0]["loss"], results[fold]) - self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 1) + self.assertAlmostEqual(return_value[0]["loss"], results[fold]) + self.assertEqual(return_value[0]["status"], StatusType.SUCCESS) def test_eval_partial_cv_multi_objective(self): metrics = { @@ -3547,8 +3437,8 @@ def test_eval_partial_cv_multi_objective(self): metrics=list(metrics.keys()), additional_components=dict(), ) - rval = read_queue(self.queue) - self.assertEqual(len(rval), 1) + return_value = read_queue(self.queue) + self.assertEqual(len(return_value), 1) for metric, loss in metrics.items(): - 
self.assertAlmostEqual(rval[0]["loss"][metric.name], loss[fold]) - self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + self.assertAlmostEqual(return_value[0]["loss"][metric.name], loss[fold]) + self.assertEqual(return_value[0]["status"], StatusType.SUCCESS) diff --git a/test/test_pipeline/components/regression/test_mlp.py b/test/test_pipeline/components/regression/test_mlp.py index 9e2a92acac..941e30bf32 100644 --- a/test/test_pipeline/components/regression/test_mlp.py +++ b/test/test_pipeline/components/regression/test_mlp.py @@ -29,7 +29,7 @@ class MLPComponentTest(BaseRegressionComponentTest): # # Seems there is a consistently different values for boston so: # * include two valuess for n_iter in 'boston_iterative_n_iter' - # known-values = [236, 331] + # known-values = [236, 331, 327] # # * decreased places from 6 -> 5 in 'default_boston_{sparse,_iterative_sparse}' # to check for for iterations and expanded the default places for checking @@ -47,7 +47,7 @@ class MLPComponentTest(BaseRegressionComponentTest): res["default_boston"] = 0.2750079862455884 res["default_boston_places"] = 1 res["boston_n_calls"] = [8, 9] - res["boston_iterative_n_iter"] = [236, 331] + res["boston_iterative_n_iter"] = [236, 331, 327] res["default_boston_iterative"] = res["default_boston"] res["default_boston_iterative_places"] = 1 res["default_boston_sparse"] = -0.10972947168054104 diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py index 7be8038119..88f091772a 100644 --- a/test/test_pipeline/test_classification.py +++ b/test/test_pipeline/test_classification.py @@ -172,7 +172,7 @@ def test_find_preprocessors(self): * At least 1 preprocessor component can be found * The inherit from AutoSklearnPreprocessingAlgorithm """ - preprocessors = preprocessing_components._preprocessors + preprocessors = preprocessing_components._feature_preprocessors self.assertGreaterEqual(len(preprocessors), 1) for key in preprocessors: if hasattr(preprocessors[key], "get_components"): diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py index 3a50decb8c..788e347b1e 100644 --- a/test/test_pipeline/test_regression.py +++ b/test/test_pipeline/test_regression.py @@ -76,7 +76,7 @@ def test_find_regressors(self): self.assertIn(AutoSklearnRegressionAlgorithm, regressors[key].__bases__) def test_find_preprocessors(self): - preprocessors = preprocessing_components._preprocessors + preprocessors = preprocessing_components._feature_preprocessors self.assertGreaterEqual(len(preprocessors), 1) for key in preprocessors: if hasattr(preprocessors[key], "get_components"): diff --git a/test/test_scripts/test_metadata_generation.py b/test/test_scripts/test_metadata_generation.py index 929b90e029..25c4855b08 100644 --- a/test/test_scripts/test_metadata_generation.py +++ b/test/test_scripts/test_metadata_generation.py @@ -4,6 +4,7 @@ import shutil import socket import subprocess +import tempfile import arff import numpy as np @@ -15,10 +16,12 @@ class TestMetadataGeneration(unittest.TestCase): def setUp(self): - self.working_directory = "/tmp/autosklearn-unittest-tmp-dir-%s-%d-%d" % ( - socket.gethostname(), - os.getpid(), - random.randint(0, 1000000), + host = socket.gethostname() + pid = os.getpid() + rint = random.randint(0, 1000000) + + self.working_directory = os.path.join( + tempfile.gettempdir(), f"autosklearn-unittest-tmp-dir-{host}-{pid}-{rint}" ) def print_files(self): @@ -27,7 +30,6 @@ def print_files(self): print(dirpath, dirnames, filenames) def 
test_metadata_generation(self): - regression_task_id = 360029 regression_dataset_name = "SWD".lower() classification_task_id = 245 @@ -52,10 +54,15 @@ def test_metadata_generation(self): script_filename, self.working_directory, ) - rval = subprocess.run( - cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + + return_value = subprocess.run( + cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=30, ) - self.assertEqual(rval.returncode, 0, msg=str(rval)) + self.assertEqual(return_value.returncode, 0, msg=f"{cmd}\n{str(return_value)}") # 4. run one of the commands to get some data commands_output_file = os.path.join( @@ -99,11 +106,16 @@ def test_metadata_generation(self): # for training. In production, it would use twice as much! cmd = cmd.replace("-s 1", "-s 1 --unittest") print("COMMAND: %s" % cmd) - rval = subprocess.run( - cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + + return_value = subprocess.run( + cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=180, ) - print("STDOUT: %s" % repr(rval.stdout), flush=True) - print("STDERR: %s" % repr(rval.stderr), flush=True) + print("STDOUT: %s" % repr(return_value.stdout), flush=True) + print("STDERR: %s" % repr(return_value.stderr), flush=True) self.print_files() @@ -123,7 +135,11 @@ def test_metadata_generation(self): ) with open(smac_log) as fh: smac_output = fh.read() - self.assertEqual(rval.returncode, 0, msg=str(rval) + "\n" + smac_output) + self.assertEqual( + return_value.returncode, + 0, + msg=f"{cmd}\n{str(return_value)}" + "\n" + smac_output, + ) expected_validation_output = os.path.join( expected_output_directory, "..", "validation_trajectory_1.json" ) @@ -172,12 +188,17 @@ def test_metadata_generation(self): self.working_directory, ) print("COMMAND: %s" % cmd) - rval = subprocess.run( - cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + + return_value = subprocess.run( + cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=60, ) - print("STDOUT: %s" % repr(rval.stdout), flush=True) - print("STDERR: %s" % repr(rval.stderr), flush=True) - self.assertEqual(rval.returncode, 0, msg=str(rval)) + print("STDOUT: %s" % repr(return_value.stdout), flush=True) + print("STDERR: %s" % repr(return_value.stderr), flush=True) + self.assertEqual(return_value.returncode, 0, msg=f"{cmd}\n{str(return_value)}") for file in [ "algorithm_runs.arff", @@ -215,10 +236,14 @@ def test_metadata_generation(self): script_filename, self.working_directory, ) - rval = subprocess.run( - cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + return_value = subprocess.run( + cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=90, ) - self.assertEqual(rval.returncode, 0, msg=str(rval)) + self.assertEqual(return_value.returncode, 0, msg=f"{cmd}\n{str(return_value)}") for task_type in ("classification", "regression"): for file in [ "calculation_times.csv", @@ -271,10 +296,15 @@ def test_metadata_generation(self): script_filename, self.working_directory, ) - rval = subprocess.run( - cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + + return_value = subprocess.run( + cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=45, ) - self.assertEqual(rval.returncode, 0, msg=str(rval)) + self.assertEqual(return_value.returncode, 0, msg=f"{cmd}\n{str(return_value)}") for metric_, combination in ( (metric, "%s_binary.classification_dense" % metric),
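The metadata-generation test now builds its scratch directory under `tempfile.gettempdir()` and passes explicit timeouts to `subprocess.run`, so a hung script fails the test instead of stalling CI. A sketch of that pattern follows; the command invoked is illustrative only.

```python
# Unique scratch directory plus a timed, output-capturing subprocess call.
import os
import random
import socket
import subprocess
import tempfile

working_directory = os.path.join(
    tempfile.gettempdir(),
    f"autosklearn-unittest-tmp-dir-{socket.gethostname()}-{os.getpid()}-"
    f"{random.randint(0, 1000000)}",
)
os.makedirs(working_directory, exist_ok=True)

return_value = subprocess.run(
    f"python -c \"print('hello from {working_directory}')\"",
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    timeout=30,  # seconds; raises subprocess.TimeoutExpired if exceeded
)
assert return_value.returncode == 0, f"{return_value.stdout!r}\n{return_value.stderr!r}"
```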