Skip to content

Commit 0ae9cbf

Browse files
eddiebergman authored and ravinkohli committed
Cleanup of simple_imputer (#346)
* cleanup of simple_imputer
* Fixed doc and typo
* Fixed docs
* Made changes, added test
* Fixed init statement
* Fixed docs
* Flake'd
1 parent 6992609 commit 0ae9cbf

File tree

2 files changed

+117
-56
lines changed

2 files changed

+117
-56
lines changed

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py

Lines changed: 105 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
from typing import Any, Dict, List, Optional, Union
22

33
from ConfigSpace.configuration_space import ConfigurationSpace
4-
from ConfigSpace.hyperparameters import (
5-
CategoricalHyperparameter
6-
)
4+
from ConfigSpace.hyperparameters import CategoricalHyperparameter
75

86
import numpy as np
97

@@ -15,92 +13,143 @@
1513

1614

1715
class SimpleImputer(BaseImputer):
18-
"""
19-
Impute missing values for categorical columns with '!missing!'
20-
(In case of numpy data, the constant value is set to -1, under
21-
the assumption that categorical data is fit with an Ordinal Scaler)
16+
"""An imputer for categorical and numerical columns
17+
18+
Impute missing values for categorical columns with 'constant_!missing!'
19+
20+
Note:
21+
In case of numpy data, the constant value is set to -1, under the assumption
22+
that categorical data is fit with an Ordinal Encoder.
23+
24+
Attributes:
25+
random_state (Optional[np.random.RandomState]):
26+
The random state to use for the imputer.
27+
numerical_strategy (str: default='mean'):
28+
The strategy to use for imputing numerical columns.
29+
Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
30+
categorical_strategy (str: default='most_frequent'):
31+
The strategy to use for imputing categorical columns.
32+
Can be one of ['most_frequent', 'constant_!missing!']
2233
"""
2334

24-
def __init__(self,
25-
random_state: Optional[Union[np.random.RandomState, int]] = None,
26-
numerical_strategy: str = 'mean',
27-
categorical_strategy: str = 'most_frequent'):
35+
def __init__(
36+
self,
37+
random_state: Optional[np.random.RandomState] = None,
38+
numerical_strategy: str = 'mean',
39+
categorical_strategy: str = 'most_frequent'
40+
):
41+
"""
42+
Note:
43+
'constant' as numerical_strategy uses 0 as the default fill_value while
44+
'constant_!missing!' uses a fill_value of -1.
45+
This behaviour should probably be fixed.
46+
"""
2847
super().__init__()
2948
self.random_state = random_state
3049
self.numerical_strategy = numerical_strategy
3150
self.categorical_strategy = categorical_strategy
3251

33-
def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer:
34-
"""
35-
The fit function calls the fit function of the underlying model
36-
and returns the transformed array.
52+
def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
53+
""" Fits the underlying imputers and returns self.
54+
3755
Args:
38-
X (np.ndarray): input features
39-
y (Optional[np.ndarray]): input labels
56+
X (Dict[str, Any]):
57+
The fit dictionary, which must contain 'dataset_properties'
58+
y (Optional[np.ndarray]):
59+
The labels for the input features `X`
4060
4161
Returns:
42-
instance of self
62+
SimpleImputer:
63+
returns self
4364
"""
4465
self.check_requirements(X, y)
45-
categorical_columns = X['dataset_properties']['categorical_columns'] \
46-
if isinstance(X['dataset_properties']['categorical_columns'], List) else []
47-
if len(categorical_columns) != 0:
66+
67+
# Choose an imputer for any categorical columns
68+
categorical_columns = X['dataset_properties']['categorical_columns']
69+
70+
if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
4871
if self.categorical_strategy == 'constant_!missing!':
49-
self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant',
50-
# Train data is numpy
51-
# as of this point, where
52-
# Ordinal Encoding is using
53-
# for categorical. Only
54-
# Numbers are allowed
55-
# fill_value='!missing!',
56-
fill_value=-1,
57-
copy=False)
72+
# Train data is numpy as of this point, where an Ordinal Encoding is used
73+
# for categoricals. Only Numbers are allowed for `fill_value`
74+
imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
75+
self.preprocessor['categorical'] = imputer
5876
else:
59-
self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy,
60-
copy=False)
61-
numerical_columns = X['dataset_properties']['numerical_columns'] \
62-
if isinstance(X['dataset_properties']['numerical_columns'], List) else []
63-
if len(numerical_columns) != 0:
77+
imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
78+
self.preprocessor['categorical'] = imputer
79+
80+
# Choose an imputer for any numerical columns
81+
numerical_columns = X['dataset_properties']['numerical_columns']
82+
83+
if isinstance(numerical_columns, List) and len(numerical_columns) > 0:
6484
if self.numerical_strategy == 'constant_zero':
65-
self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant',
66-
fill_value=0,
67-
copy=False)
85+
imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False)
86+
self.preprocessor['numerical'] = imputer
6887
else:
69-
self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False)
88+
imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False)
89+
self.preprocessor['numerical'] = imputer
7090

7191
return self
7292

7393
@staticmethod
7494
def get_hyperparameter_search_space(
7595
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
76-
numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy',
77-
value_range=("mean", "median",
78-
"most_frequent",
79-
"constant_zero"),
80-
default_value="mean",
81-
),
96+
numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
97+
hyperparameter='numerical_strategy',
98+
value_range=("mean", "median", "most_frequent", "constant_zero"),
99+
default_value="mean",
100+
),
82101
categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
83102
hyperparameter='categorical_strategy',
84-
value_range=("most_frequent",
85-
"constant_!missing!"),
86-
default_value="most_frequent")
103+
value_range=("most_frequent", "constant_!missing!"),
104+
default_value="most_frequent"
105+
)
87106
) -> ConfigurationSpace:
107+
"""Get the hyperparameter search space for the SimpleImputer
108+
109+
Args:
110+
dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]])
111+
Properties that describe the dataset
112+
Note: Not actually Optional, just adhering to its supertype
113+
numerical_strategy (HyperparameterSearchSpace: default = ...)
114+
The strategy to use for numerical imputation
115+
categorical_strategy (HyperparameterSearchSpace: default = ...)
116+
The strategy to use for categorical imputation
117+
118+
Returns:
119+
ConfigurationSpace
120+
The space of possible configurations for a SimpleImputer with the given
121+
`dataset_properties`
122+
"""
88123
cs = ConfigurationSpace()
89-
assert dataset_properties is not None, "To create hyperparameter search space" \
90-
", dataset_properties should not be None"
91-
if len(dataset_properties['numerical_columns']) \
92-
if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0:
124+
125+
if dataset_properties is None:
126+
raise ValueError("SimpleImputer requires `dataset_properties` for generating"
127+
" a search space.")
128+
129+
if (
130+
isinstance(dataset_properties['numerical_columns'], List)
131+
and len(dataset_properties['numerical_columns']) != 0
132+
):
93133
add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)
94134

95-
if len(dataset_properties['categorical_columns']) \
96-
if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0:
135+
if (
136+
isinstance(dataset_properties['categorical_columns'], List)
137+
and len(dataset_properties['categorical_columns'])
138+
):
97139
add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)
98140

99141
return cs
100142

101143
@staticmethod
102-
def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
103-
) -> Dict[str, Union[str, bool]]:
144+
def get_properties(
145+
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
146+
) -> Dict[str, Union[str, bool]]:
147+
"""Get the properties of the SimpleImputer class and what it can handle
148+
149+
Returns:
150+
Dict[str, Union[str, bool]]:
151+
A dict from property names to values
152+
"""
104153
return {
105154
'shortname': 'SimpleImputer',
106155
'name': 'Simple Imputer',

test/test_pipeline/components/preprocessing/test_imputers.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
from numpy.testing import assert_array_equal
55

6+
import pytest
7+
68
from sklearn.base import BaseEstimator, clone
79
from sklearn.compose import make_column_transformer
810

@@ -213,6 +215,16 @@ def test_constant_imputation(self):
213215
[7.0, '0', 9],
214216
[4.0, '0', '0']], dtype=str))
215217

218+
def test_imputation_without_dataset_properties_raises_error(self):
219+
"""Tests SimpleImputer checks for dataset properties when querying for
220+
HyperparameterSearchSpace, even though the arg is marked `Optional`.
221+
222+
Expects:
223+
* Should raise a ValueError that no dataset_properties were passed
224+
"""
225+
with pytest.raises(ValueError):
226+
SimpleImputer.get_hyperparameter_search_space()
227+
216228

217229
if __name__ == '__main__':
218230
unittest.main()

0 commit comments

Comments
 (0)