|
1 | 1 | from typing import Any, Dict, List, Optional, Union
|
2 | 2 |
|
3 | 3 | from ConfigSpace.configuration_space import ConfigurationSpace
|
4 |
| -from ConfigSpace.hyperparameters import ( |
5 |
| - CategoricalHyperparameter |
6 |
| -) |
| 4 | +from ConfigSpace.hyperparameters import CategoricalHyperparameter |
7 | 5 |
|
8 | 6 | import numpy as np
|
9 | 7 |
|
|
15 | 13 |
|
16 | 14 |
|
17 | 15 | class SimpleImputer(BaseImputer):
|
18 |
| - """ |
19 |
| - Impute missing values for categorical columns with '!missing!' |
20 |
| - (In case of numpy data, the constant value is set to -1, under |
21 |
| - the assumption that categorical data is fit with an Ordinal Scaler) |
| 16 | + """An imputer for categorical and numerical columns |
| 17 | +
|
| 18 | + Impute missing values for categorical columns with 'constant_!missing!' |
| 19 | +
|
| 20 | + Note: |
| 21 | + In case of numpy data, the constant value is set to -1, under the assumption |
| 22 | + that categorical data is fit with an Ordinal Scaler. |
| 23 | +
|
| 24 | + Attributes: |
| 25 | + random_state (Optional[np.random.RandomState]): |
| 26 | + The random state to use for the imputer. |
| 27 | + numerical_strategy (str: default='mean'): |
| 28 | + The strategy to use for imputing numerical columns. |
| 29 | + Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] |
| 30 | + categorical_strategy (str: default='most_frequent'): |
| 31 | + The strategy to use for imputing categorical columns. |
| 32 | + Can be one of ['most_frequent', 'constant_!missing!'] |
22 | 33 | """
|
23 | 34 |
|
24 |
| - def __init__(self, |
25 |
| - random_state: Optional[Union[np.random.RandomState, int]] = None, |
26 |
| - numerical_strategy: str = 'mean', |
27 |
| - categorical_strategy: str = 'most_frequent'): |
| 35 | + def __init__( |
| 36 | + self, |
| 37 | + random_state: Optional[np.random.RandomState] = None, |
| 38 | + numerical_strategy: str = 'mean', |
| 39 | + categorical_strategy: str = 'most_frequent' |
| 40 | + ): |
| 41 | + """ |
| 42 | + Note: |
| 43 | + 'constant' as numerical_strategy uses 0 as the default fill_value while |
| 44 | + 'constant_!missing!' uses a fill_value of -1. |
| 45 | + This behaviour should probably be fixed. |
| 46 | + """ |
28 | 47 | super().__init__()
|
29 | 48 | self.random_state = random_state
|
30 | 49 | self.numerical_strategy = numerical_strategy
|
31 | 50 | self.categorical_strategy = categorical_strategy
|
32 | 51 |
|
33 |
| - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer: |
34 |
| - """ |
35 |
| - The fit function calls the fit function of the underlying model |
36 |
| - and returns the transformed array. |
| 52 | + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: |
| 53 | + """ Chooses the imputers for the categorical and numerical columns and returns self. |
| 54 | +
|
37 | 55 | Args:
|
38 |
| - X (np.ndarray): input features |
39 |
| - y (Optional[np.ndarray]): input labels |
| 56 | + X (Dict[str, Any]): |
| 57 | + The fit dictionary; its 'dataset_properties' entry must provide 'categorical_columns' and 'numerical_columns' |
| 58 | + y (Optional[np.ndarray]): |
| 59 | + The labels for the input features `X` |
40 | 60 |
|
41 | 61 | Returns:
|
42 |
| - instance of self |
| 62 | + SimpleImputer: |
| 63 | + returns self |
43 | 64 | """
|
44 | 65 | self.check_requirements(X, y)
|
45 |
| - categorical_columns = X['dataset_properties']['categorical_columns'] \ |
46 |
| - if isinstance(X['dataset_properties']['categorical_columns'], List) else [] |
47 |
| - if len(categorical_columns) != 0: |
| 66 | + |
| 67 | + # Choose an imputer for any categorical columns |
| 68 | + categorical_columns = X['dataset_properties']['categorical_columns'] |
| 69 | + |
| 70 | + if isinstance(categorical_columns, List) and len(categorical_columns) != 0: |
48 | 71 | if self.categorical_strategy == 'constant_!missing!':
|
49 |
| - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant', |
50 |
| - # Train data is numpy |
51 |
| - # as of this point, where |
52 |
| - # Ordinal Encoding is using |
53 |
| - # for categorical. Only |
54 |
| - # Numbers are allowed |
55 |
| - # fill_value='!missing!', |
56 |
| - fill_value=-1, |
57 |
| - copy=False) |
| 72 | + # Train data is numpy as of this point, where an Ordinal Encoding is used |
| 73 | + # for categoricals. Only Numbers are allowed for `fill_value` |
| 74 | + imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False) |
| 75 | + self.preprocessor['categorical'] = imputer |
58 | 76 | else:
|
59 |
| - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy, |
60 |
| - copy=False) |
61 |
| - numerical_columns = X['dataset_properties']['numerical_columns'] \ |
62 |
| - if isinstance(X['dataset_properties']['numerical_columns'], List) else [] |
63 |
| - if len(numerical_columns) != 0: |
| 77 | + imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False) |
| 78 | + self.preprocessor['categorical'] = imputer |
| 79 | + |
| 80 | + # Choose an imputer for any numerical columns |
| 81 | + numerical_columns = X['dataset_properties']['numerical_columns'] |
| 82 | + |
| 83 | + if isinstance(numerical_columns, List) and len(numerical_columns) > 0: |
64 | 84 | if self.numerical_strategy == 'constant_zero':
|
65 |
| - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant', |
66 |
| - fill_value=0, |
67 |
| - copy=False) |
| 85 | + imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False) |
| 86 | + self.preprocessor['numerical'] = imputer |
68 | 87 | else:
|
69 |
| - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) |
| 88 | + imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) |
| 89 | + self.preprocessor['numerical'] = imputer |
70 | 90 |
|
71 | 91 | return self
|
72 | 92 |
|
73 | 93 | @staticmethod
|
74 | 94 | def get_hyperparameter_search_space(
|
75 | 95 | dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
|
76 |
| - numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy', |
77 |
| - value_range=("mean", "median", |
78 |
| - "most_frequent", |
79 |
| - "constant_zero"), |
80 |
| - default_value="mean", |
81 |
| - ), |
| 96 | + numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( |
| 97 | + hyperparameter='numerical_strategy', |
| 98 | + value_range=("mean", "median", "most_frequent", "constant_zero"), |
| 99 | + default_value="mean", |
| 100 | + ), |
82 | 101 | categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
|
83 | 102 | hyperparameter='categorical_strategy',
|
84 |
| - value_range=("most_frequent", |
85 |
| - "constant_!missing!"), |
86 |
| - default_value="most_frequent") |
| 103 | + value_range=("most_frequent", "constant_!missing!"), |
| 104 | + default_value="most_frequent" |
| 105 | + ) |
87 | 106 | ) -> ConfigurationSpace:
|
| 107 | + """Get the hyperparameter search space for the SimpleImputer |
| 108 | +
|
| 109 | + Args: |
| 110 | + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) |
| 111 | + Properties that describe the dataset |
| 112 | + Note: Not actually Optional, just adhering to its supertype |
| 113 | + numerical_strategy (HyperparameterSearchSpace: default = ...) |
| 114 | + The strategy to use for numerical imputation |
| 115 | + categorical_strategy (HyperparameterSearchSpace: default = ...) |
| 116 | + The strategy to use for categorical imputation |
| 117 | +
|
| 118 | + Returns: |
| 119 | + ConfigurationSpace |
| 120 | + The space of possible configurations for a SimpleImputer with the given |
| 121 | + `dataset_properties` |
| 122 | + """ |
88 | 123 | cs = ConfigurationSpace()
|
89 |
| - assert dataset_properties is not None, "To create hyperparameter search space" \ |
90 |
| - ", dataset_properties should not be None" |
91 |
| - if len(dataset_properties['numerical_columns']) \ |
92 |
| - if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0: |
| 124 | + |
| 125 | + if dataset_properties is None: |
| 126 | + raise ValueError("SimpleImputer requires `dataset_properties` for generating" |
| 127 | + " a search space.") |
| 128 | + |
| 129 | + if ( |
| 130 | + isinstance(dataset_properties['numerical_columns'], List) |
| 131 | + and len(dataset_properties['numerical_columns']) != 0 |
| 132 | + ): |
93 | 133 | add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)
|
94 | 134 |
|
95 |
| - if len(dataset_properties['categorical_columns']) \ |
96 |
| - if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0: |
| 135 | + if ( |
| 136 | + isinstance(dataset_properties['categorical_columns'], List) |
| 137 | + and len(dataset_properties['categorical_columns']) |
| 138 | + ): |
97 | 139 | add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)
|
98 | 140 |
|
99 | 141 | return cs
|
100 | 142 |
|
101 | 143 | @staticmethod
|
102 |
| - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None |
103 |
| - ) -> Dict[str, Union[str, bool]]: |
| 144 | + def get_properties( |
| 145 | + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None |
| 146 | + ) -> Dict[str, Union[str, bool]]: |
| 147 | + """Get the properties of the SimpleImputer class and what it can handle |
| 148 | +
|
| 149 | + Returns: |
| 150 | + Dict[str, Union[str, bool]]: |
| 151 | + A dict from property names to values |
| 152 | + """ |
104 | 153 | return {
|
105 | 154 | 'shortname': 'SimpleImputer',
|
106 | 155 | 'name': 'Simple Imputer',
|
|
0 commit comments