@@ -21,7 +21,14 @@ class ProblemType(Enum):
21
21
FeatureMultilabel = 4
22
22
23
23
class DataManager (object ):
24
+ """ Load data from multiple sources and formats"""
25
+
24
26
def __init__ (self , verbose = 0 ):
27
+ """Construct the DataManager
28
+
29
+ Keyword Arguments:
30
+ verbose {bool} -- Whether to print stuff. (default: {0})
31
+ """
25
32
self .verbose = verbose
26
33
self .X_train , self .Y_train = None , None
27
34
self .X_test , self .Y_test = None , None
@@ -33,6 +40,16 @@ def __init__(self, verbose=0):
33
40
self .categorical_features = None
34
41
35
42
def read_data (self , file_name , test_split = 0.0 , is_classification = None , random_seed = 0 , ** kwargs ):
43
+ """Read the data.
44
+
45
+ Arguments:
46
+ file_name {str} -- The name of the file to load. Different Readers are associated with different filenames.
47
+
48
+ Keyword Arguments:
49
+ test_split {float} -- Amount of data to use as test split (default: {0.0})
50
+ is_classification {bool} -- Whether the data is a classification task (default: {None})
51
+ random_seed {int} -- Random seed forwarded to the data split (default: {0})
52
+ """
36
53
print ("Read:" + file_name )
37
54
reader = self ._get_reader (file_name , is_classification )
38
55
reader .read ()
@@ -53,6 +70,18 @@ def read_data(self, file_name, test_split=0.0, is_classification=None, random_se
53
70
self ._split_data (test_split , random_seed )
54
71
55
72
def _get_reader (self , file_name , is_classification ):
73
+ """Get the reader associated with the filename.
74
+
75
+ Arguments:
76
+ file_name {str} -- The file to load
77
+ is_classification {bool} -- Whether the data is a classification task or not
78
+
79
+ Raises:
80
+ ValueError: The given file type is not supported
81
+
82
+ Returns:
83
+ DataReader -- A reader that is able to read the data type
84
+ """
56
85
if file_name .endswith (".csv" ):
57
86
reader = CSVReader (file_name , is_classification = is_classification )
58
87
elif file_name .startswith ("openml:" ):
@@ -65,6 +94,17 @@ def _get_reader(self, file_name, is_classification):
65
94
return reader
66
95
67
96
def generate_classification (self , num_classes , num_features , num_samples , test_split = 0.1 , seed = 0 ):
97
+ """Generate a classification task
98
+
99
+ Arguments:
100
+ num_classes {int} -- Number of classes
101
+ num_features {int} -- Number of features
102
+ num_samples {int} -- Number of samples
103
+
104
+ Keyword Arguments:
105
+ test_split {float} -- Size of test split (default: {0.1})
106
+ seed {int} -- A random seed (default: {0})
107
+ """
68
108
#X, Y = make_classification(n_samples=800, n_features=num_feats, n_classes=num_classes, n_informative=4)
69
109
X , y = make_multilabel_classification (
70
110
n_samples = num_samples , n_features = num_features , n_classes = num_classes , n_labels = 0.01 ,
@@ -78,13 +118,29 @@ def generate_classification(self, num_classes, num_features, num_samples, test_s
78
118
self ._split_data (test_split , seed )
79
119
80
120
def generate_regression (self , num_features , num_samples , test_split = 0.1 , seed = 0 ):
121
+ """Generate a regression task
122
+
123
+ Arguments:
124
+ num_features {int} -- Number of features
125
+ num_samples {int} -- Number of samples
126
+
127
+ Keyword Arguments:
128
+ test_split {float} -- Size of test split (default: {0.1})
129
+ seed {int} -- Random seed used for both the data generation and the data split (default: {0})
130
+ """
81
131
X , Y = make_regression (n_samples = num_samples , n_features = num_features , random_state = seed )
82
132
self .categorical_features = [False ] * num_features
83
133
self .problem_type = ProblemType .FeatureRegression
84
134
self .X , self .Y = X , Y
85
135
self ._split_data (test_split , seed )
86
136
87
137
def _split_data (self , test_split , seed ):
138
+ """Split the data into a training set and a test set (and, optionally, a validation set).
139
+
140
+ Arguments:
141
+ test_split {float} -- Amount of data to use as the test split
142
+ seed {int} -- A random seed for the split
143
+ """
88
144
valid_specified = self .X_valid is not None and self .Y_valid is not None
89
145
test_specified = self .X_test is not None and self .Y_test is not None
90
146
@@ -101,6 +157,17 @@ def _split_data(self, test_split, seed):
101
157
self .Y_train = self .Y
102
158
103
159
def deterministic_shuffle_and_split (X , Y , split , seed ):
160
+ """Split the data deterministically given the seed
161
+
162
+ Arguments:
163
+ X {array} -- The feature data
164
+ Y {array} -- The targets
165
+ split {float} -- Fraction of the samples to place in the second split
166
+ seed {int} -- A random seed
167
+
168
+ Returns:
169
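+ tuple -- Six-element tuple: the full X and Y, then X and Y of the first split, then X and Y of the second split (None, None when no second split is produced)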
170
+ """
104
171
rng = np .random .RandomState (seed )
105
172
p = rng .permutation (X .shape [0 ])
106
173
@@ -110,4 +177,4 @@ def deterministic_shuffle_and_split(X, Y, split, seed):
110
177
split = int (split * X .shape [0 ])
111
178
return X , Y , X [0 :- split ], Y [0 :- split ], X [- split :], Y [- split :]
112
179
else :
113
- return X , Y , X , Y , None , None
180
+ return X , Y , X , Y , None , None
0 commit comments