automl · eddiebergman · Jun 9, 2022 · Jun 14, 2022 · Jun 14, 2022 · Jun 14, 2022
diff --git a/autosklearn/info.py b/autosklearn/info.py
@@ -0,0 +1,205 @@
+"""
+This module serves as an introspection point for things users might
+want to programmatically query about autosklearn.
+"""
+from __future__ import annotations
+
+from typing import Any, Generic, Type, TypeVar
+
+from dataclasses import dataclass
+
+from typing_extensions import Literal
+
+from autosklearn.pipeline.components.base import (
+    AutoSklearnClassificationAlgorithm,
+    AutoSklearnComponent,
+    AutoSklearnPreprocessingAlgorithm,
+    AutoSklearnRegressionAlgorithm,
+)
+from autosklearn.pipeline.components.classification import ClassifierChoice
+from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice
+from autosklearn.pipeline.components.feature_preprocessing import (
+    FeaturePreprocessorChoice,
+)
+from autosklearn.pipeline.components.regression import RegressorChoice
+from autosklearn.pipeline.constants import DATASET_PROPERTIES_TO_STRING
+
+# Something that is a type that inherits from AutoSklearnComponent
+T = TypeVar("T", bound=Type[AutoSklearnComponent])
+
+
+def _translate_properties(
+    props: dict[str, Any],
+    kind: Literal["classifier", "regressor", "f_preprocessor", "d_preprocessor"],
+) -> dict[str, Any]:
+    """Converts supported inputs and outputs to strings"""
+    # This information is conveyed implicitly by being a regressor/classifier ...
+    delwords = ["handles_regression", "handles_classification"]
+
+    # Covered by input type, duplicated info
+    delwords += ["handles_sparse", "handles_dense"]
+
+    # Words we rename (from, to)
+    popwords: list[tuple[str, str]] = [
+        ("input", "supported_inputs"),
+        ("output", "output_kind"),
+        ("is_deterministic", "deterministic"),
+    ]
+
+    if kind in ["classifier", "f_preprocessor", "d_preprocessor"]:
+        delwords += ["handles_multioutput"]
+
+    if kind in ["regressor", "f_preprocessor", "d_preprocessor"]:
+        delwords += ["handles_multiclass", "handles_multilabel"]
+
+    for word in delwords:
+        if word in props:
+            del props[word]
+
+    for frm, to in popwords:
+        props[to] = props.pop(frm)
+
+    props["supported_inputs"] = [
+        DATASET_PROPERTIES_TO_STRING[k] for k in props["supported_inputs"]
+    ]
+    props["output_kind"] = DATASET_PROPERTIES_TO_STRING[props["output_kind"][0]]
+
+    return props
+
+
+@dataclass
+class _ComponentInfo(Generic[T]):
+    type: T  # cls is not possible due to @dataclass conversion
+    name: str
+    shortname: str
+    output_kind: str
+    supported_inputs: list[str]
+    deterministic: bool = False
+
+
+@dataclass
+class RegressorInfo(_ComponentInfo[Type[AutoSklearnRegressionAlgorithm]]):
+    handles_multioutput: bool = False
+    prefers_data_normalized: bool = False
+
+
+@dataclass
+class ClassifierInfo(_ComponentInfo[Type[AutoSklearnClassificationAlgorithm]]):
+    handles_binary: bool = True  # We assume all components support this
+    handles_multiclass: bool = False
+    handles_multilabel: bool = False
+    handles_multilabel_multiclass: bool = False
+
+
+@dataclass
+class FeaturePreprocessorInfo(_ComponentInfo[Type[AutoSklearnPreprocessingAlgorithm]]):
+    pass
+
+
+@dataclass
+class DataPreprocessorInfo(_ComponentInfo[Type[AutoSklearnPreprocessingAlgorithm]]):
+    # There should be more here but our DataPreprocessing part of the pipeline doesn't
+    # pick up on it because there's only FeatTypeSplit available which further has
+    # subcomponents with extra properties
+    pass
+
+
+@dataclass
+class ComponentsInfo:
+    classifiers: dict[str, ClassifierInfo]
+    regressors: dict[str, RegressorInfo]
+    feature_preprocessors: dict[str, FeaturePreprocessorInfo]
+    data_preprocessors: dict[str, DataPreprocessorInfo]
+
+
+def classifiers() -> dict[str, ClassifierInfo]:
+    """Get information about the classifiers available to auto-sklearn
+
+    Returns
+    -------
+    dict[str, ClassifierInfo]
+        The dict of classifiers and some info about them
+    """
+    return {
+        name: ClassifierInfo(
+            **{
+                "type": cls,
+                **_translate_properties(cls.get_properties(), "classifier"),
+            }
+        )
+        for name, cls in ClassifierChoice.get_components().items()
+    }
+
+
+def regressors() -> dict[str, RegressorInfo]:
+    """Get information about the regressors available to auto-sklearn
+
+    Returns
+    -------
+    dict[str, RegressorInfo]
+        The dict of regressors and some info about them
+    """
+    return {
+        name: RegressorInfo(
+            **{"type": cls, **_translate_properties(cls.get_properties(), "regressor")},
+        )
+        for name, cls in RegressorChoice.get_components().items()
+    }
+
+
+def feature_preprocessors() -> dict[str, FeaturePreprocessorInfo]:
+    """Get information about the feature preprocessors available to auto-sklearn
+
+    Returns
+    -------
+    dict[str, FeaturePreprocessorInfo]
+        The dict of feature preprocessors and some info about them
+    """
+    return {
+        name: FeaturePreprocessorInfo(
+            **{
+                "type": cls,
+                **_translate_properties(cls.get_properties(), "f_preprocessor"),
+            }
+        )
+        for name, cls in FeaturePreprocessorChoice.get_components().items()
+    }
+
+
+def data_preprocessors() -> dict[str, DataPreprocessorInfo]:
+    """Get information about the data preprocessors available to auto-sklearn
+
+    Returns
+    -------
+    dict[str, DataPreprocessorInfo]
+        The dict of data preprocessors and some info about them
+    """
+    return {
+        name: DataPreprocessorInfo(
+            **{
+                "type": cls,
+                **_translate_properties(cls.get_properties(), "d_preprocessor"),
+            }
+        )
+        for name, cls in DataPreprocessorChoice.get_components().items()
+    }
+
+
+def components() -> ComponentsInfo:
+    """Get information about all of the components available to auto-sklearn
+
+    Returns
+    -------
+    ComponentsInfo
+        A dataclass with the items
+        * classifiers
+        * regressors
+        * feature_preprocessors
+        * data_preprocessors
+    """
+    return ComponentsInfo(
+        classifiers=classifiers(),
+        regressors=regressors(),
+        feature_preprocessors=feature_preprocessors(),
+        data_preprocessors=data_preprocessors(),
+    )
diff --git a/autosklearn/pipeline/components/base.py b/autosklearn/pipeline/components/base.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from __future__ import annotations
 
 import importlib
 import inspect
@@ -10,7 +10,7 @@
 
 from autosklearn.pipeline.constants import SPARSE
 
-_addons = dict()  # type: Dict[str, 'ThirdPartyComponents']
+_addons: dict[str, ThirdPartyComponents] = {}
 
 
 def find_components(package, directory, base_class):

diff --git a/autosklearn/pipeline/components/data_preprocessing/__init__.py b/autosklearn/pipeline/components/data_preprocessing/__init__.py
@@ -12,14 +12,16 @@
     AutoSklearnChoice,
     AutoSklearnPreprocessingAlgorithm,
     ThirdPartyComponents,
+    _addons,
     find_components,
 )
 
-classifier_directory = os.path.split(__file__)[0]
-_preprocessors = find_components(
-    __package__, classifier_directory, AutoSklearnPreprocessingAlgorithm
+data_preprocessing_directory = os.path.split(__file__)[0]
+_data_preprocessors = find_components(
+    __package__, data_preprocessing_directory, AutoSklearnPreprocessingAlgorithm
 )
-_addons = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm)
+additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm)
+_addons["data_preprocessing"] = additional_components
 
 
 def add_preprocessor(preprocessor: Type[AutoSklearnPreprocessingAlgorithm]) -> None:
@@ -30,8 +32,8 @@ class DataPreprocessorChoice(AutoSklearnChoice):
     @classmethod
     def get_components(cls) -> OrderedDict:
         components: OrderedDict = OrderedDict()
-        components.update(_preprocessors)
-        components.update(_addons.components)
+        components.update(_data_preprocessors)
+        components.update(additional_components.components)
         return components
 
     def get_available_components(

diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type.py b/autosklearn/pipeline/components/data_preprocessing/feature_type.py
@@ -215,6 +215,7 @@ def get_properties(
             "handles_multiclass": True,
             "handles_multilabel": True,
             "handles_multioutput": True,
+            "is_deterministic": True,  # Assumption for now
             # TODO find out of this is right!
             "handles_sparse": True,
             "handles_dense": True,

diff --git a/autosklearn/pipeline/components/feature_preprocessing/__init__.py b/autosklearn/pipeline/components/feature_preprocessing/__init__.py
@@ -14,9 +14,9 @@
     find_components,
 )
 
-classifier_directory = os.path.split(__file__)[0]
-_preprocessors = find_components(
-    __package__, classifier_directory, AutoSklearnPreprocessingAlgorithm
+feature_preprocessing_directory = os.path.split(__file__)[0]
+_feature_preprocessors = find_components(
+    __package__, feature_preprocessing_directory, AutoSklearnPreprocessingAlgorithm
 )
 additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm)
 _addons["feature_preprocessing"] = additional_components
@@ -30,7 +30,7 @@ class FeaturePreprocessorChoice(AutoSklearnChoice):
     @classmethod
     def get_components(cls):
         components = OrderedDict()
-        components.update(_preprocessors)
+        components.update(_feature_preprocessors)
         components.update(additional_components.components)
         return components
 

diff --git a/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py b/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py
@@ -86,6 +86,7 @@ def get_properties(dataset_properties=None):
             "handles_multiclass": True,
             "handles_multilabel": False,
             "handles_multioutput": False,
+            "is_deterministic": False,
             "input": (SPARSE, DENSE, UNSIGNED_DATA),
             "output": (INPUT,),
         }

diff --git a/autosklearn/pipeline/components/regression/sgd.py b/autosklearn/pipeline/components/regression/sgd.py
@@ -179,7 +179,6 @@ def get_properties(dataset_properties=None):
             "handles_multilabel": False,
             "handles_multioutput": False,
             "is_deterministic": True,
-            "handles_sparse": True,
             "input": (DENSE, SPARSE, UNSIGNED_DATA),
             "output": (PREDICTIONS,),
         }