-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path__init__.py
75 lines (65 loc) · 2.26 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
""" ML missing values
Replace the values that are missing from the initial dataset with substitutes such as the most frequent value
"""
import numpy as np
from sklearn.compose import ColumnTransformer
# Algorithm to substitute missing values of dataset
from sklearn.impute import SimpleImputer
# IterativeImputer is a more complex algorithm that substitutes missing values with estimates
# Toy dataset: one row per sample, columns are (age, gender).
# np.nan marks a missing entry in either column.
X = [
    [20, np.nan],      # gender missing
    [np.nan, 'm'],     # age missing
    [30, 'f'],
    [35, 'f'],
    [np.nan, np.nan],  # both values missing
]
"""
# Generic transformer applicable to both columns
[
# Generic imputer used to fill both A & B columns
'imputer',
# Apply SimpleImputer algorithm to dataset transformer
SimpleImputer(
# Use the most frequent occurrences to fill missing values because the dataset also contains string datatypes.
# Numeric values strategies: "mean" => average and "median" => middle value
# String values strategies: "most_frequent" => most frequent value
# "constant" => fill with constant "fill_value" parameter
strategy='most_frequent',
# Define the format of expected missing values
missing_values=np.nan
),
# Apply the algorithm to both columns A & B
[0, 1]
],
"""
# Per-column imputation specs; each entry is [name, imputer, column indices].
transfomers = [
    # Age (column 0): missing entries are filled using the 'mean' strategy.
    ['age_imputer', SimpleImputer(strategy='mean'), [0]],
    # Gender (column 1): missing entries are filled with the constant
    # placeholder 'n.d.'.
    [
        'gender_inputer',
        SimpleImputer(fill_value='n.d.', strategy='constant'),
        [1],
    ],
]
# Combine the per-column imputers into a single column-wise transformer.
ct = ColumnTransformer(transfomers)
# Fit on the raw dataset and rebind X to the imputed result.
X = ct.fit_transform(X)
print(X)
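"""
Expected output when the generic 'most_frequent' imputer (the disabled snippet
earlier in this file) is applied to both columns.
The most frequent value is computed per column over the whole dataset, not
incrementally; when several values are tied, the smallest one is used.
[[20 'f']
[20 'm'] # Substituted np.nan with 20: every age occurs once, so the smallest value is used
[30 'f']
[35 'f']
[20 'f']] # Substituted both with the most frequent values 20 & f
Note: the active configuration (mean for age, constant 'n.d.' for gender) gives a
different result: missing ages become the column mean (about 28.33) and missing
genders become 'n.d.'.
"""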