-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhelpers.py
271 lines (225 loc) · 10.2 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures , LabelEncoder , MinMaxScaler , StandardScaler, scale
import pandas as pd
import numpy as np
class Model:
    '''
    Convenience wrappers around scikit-learn for fitting, evaluating and
    cross-validating regression / classification models.
    '''

    # Class-level defaults for the "last fitted model" cache. The fit methods
    # assign to these names through ``self``, so each instance keeps its own
    # most recently fitted model.
    lastLinearModel = None
    lastClassificationModel = None

    def fitLinearModel(self, features, label, evaluationFunction=None,
                       validationFeatures=None, validationLabel=None):
        '''
        Fits an ordinary least squares LinearRegression on the given data.

        Parameters :
            features : data
            label : target
            -optional- evaluationFunction :
                an Sk-Learn style evaluation function called as
                evaluationFunction(y_true, y_pred); when given, its value on the
                training data is printed after fitting
            -optional- validationFeatures, validationLabel :
                held-out data; only reported when evaluationFunction is given
        Returns :
            sklearn LinearRegression Model fit on data (also stored in
            self.lastLinearModel)
        '''
        linearRegressionModel = linear_model.LinearRegression()
        linearRegressionModel.fit(features, label)

        if evaluationFunction is not None:
            metricName = evaluationFunction.__name__
            trainPrediction = linearRegressionModel.predict(features)
            print(f"Final model {metricName} on train: {evaluationFunction(label, trainPrediction)}")
            # Validation reporting needs the metric too, hence the nesting.
            if validationFeatures is not None:
                validationPrediction = linearRegressionModel.predict(validationFeatures)
                print(f"Final model {metricName} on validation: {evaluationFunction(validationLabel, validationPrediction)}")

        self.lastLinearModel = linearRegressionModel
        return self.lastLinearModel

    def fitPolyModel(self, features, label, polynomialDegree=1,
                     evaluationFunction=None,
                     validationFeatures=None, validationLabel=None):
        '''
        Transforms data to a polynomial degree then fits a linear model on it

        Parameters :
            features : data
            label : target
            polynomialDegree : degree passed to changeDegreeOf (default 1)
            -optional- evaluationFunction :
                an Sk-Learn evaluation function to display the final
                performance of the model after fitting
            -optional- validationFeatures, validationLabel :
                held-out data; the features get the same polynomial expansion
        Returns :
            sklearn LinearRegression Model fit on data after polynomial transformation
        '''
        features = self.changeDegreeOf(features, polynomialDegree)
        if validationFeatures is not None:
            validationFeatures = self.changeDegreeOf(validationFeatures, polynomialDegree)
        return self.fitLinearModel(features, label, evaluationFunction,
                                   validationFeatures,
                                   validationLabel)

    def fitClassificationModel(self, modelClass, params: dict, features=None, label=None,
                               evaluationFunction=metrics.accuracy_score,
                               validationFeatures=None, validationLabel=None, dataDictionary=None):
        '''
        fit any given classification model on data and print the accuracy + other metric provided

        Params:
        :param modelClass : any class that implements .fit(), .score() and .predict()
        :param params : dictionary containing the parameters of modelClass
        :param features : data features (X)
        :param label : data label (Y)
        :param evaluationFunction : function to assess the model against after fitting,
            called with y_true / y_pred keywords (f1_score additionally gets average='micro')
        :param -optional- validationFeatures, validationLabel : validation data to test model after fitting
        :param -optional- dataDictionary : a dictionary with the keys 'trainFeatures', 'trainLabel',
            'testFeatures' and 'testLabel' (the shape returned by splitData). If provided,
            features, label, validationFeatures and validationLabel are ignored and data
            is fetched from the dict
        Returns :
            modelClass instance fit on data (also stored in self.lastClassificationModel)
        Raises :
            ValueError when no training features are supplied by either route
        '''
        if dataDictionary is not None:
            features = dataDictionary['trainFeatures']
            label = dataDictionary['trainLabel']
            validationFeatures = dataDictionary['testFeatures']
            validationLabel = dataDictionary['testLabel']
        if features is None:
            raise ValueError("fitClassificationModel needs either features/label or dataDictionary")

        model = modelClass(**params)
        model.fit(features, label)

        metricName = evaluationFunction.__name__
        evaluationParams = {'y_true': label, 'y_pred': model.predict(features)}
        if evaluationFunction is metrics.f1_score:
            evaluationParams['average'] = 'micro'
        print(f"Training Accuracy : {model.score(features, label)}")
        print(f"Training {metricName} : {evaluationFunction(**evaluationParams)}")

        if validationFeatures is not None:
            evaluationParams['y_true'] = validationLabel
            evaluationParams['y_pred'] = model.predict(validationFeatures)
            # Score on the resolved validation data, not on dataDictionary:
            # the dictionary is None when validation data came in as arguments.
            print(f"\nTest Accuracy : {model.score(validationFeatures, validationLabel)}")
            print(f"Test {metricName} : {evaluationFunction(**evaluationParams)}")

        self.lastClassificationModel = model
        return model

    def changeDegreeOf(self, features, degree=1):
        '''
        Expands features with sklearn PolynomialFeatures of the given degree

        Returns :
            the transformed feature matrix
        '''
        return PolynomialFeatures(degree=degree).fit_transform(features)

    def crossValidateOn(self, model, features, label, polynomialDegree=1,
                        k=5, metric=metrics.mean_squared_error):
        '''
        Performs K-Fold cross Validation on data using given model and prints
        the average score

        Parameters :
            model : an Sk-Learn model or any models that implements .fit() function
            features : data
            label : target
            polynomialDegree : change features to polynomial, default is 1
            k : passed to cross_val_score as cv (default 5)
            metric :
                an Sk-Learn evaluation function to assess the model on (default is MSE)
        Returns :
            the per-fold scores produced by cross_val_score
        '''
        features = self.changeDegreeOf(features, polynomialDegree)
        scores = cross_val_score(model, features, label, cv=k,
                                 scoring=metrics.make_scorer(metric))
        # Average over the folds actually produced rather than dividing by k,
        # so k does not have to be a plain integer.
        print(f"Average Score : {scores.mean()}")
        return scores

    def splitData(self, dataFeatures, dataLabel, test_size=0.2):
        '''
        Splits data into train test set (shuffled, fixed random_state=10)

        return :
            dictionary containing split data
        Example:
            data = splitData(features, label)
            data.keys() -> view names used to access data
            data["trainFeatures"] -> training data
            data["testLabel"] -> test labels
        '''
        trainFeatures, testFeatures, trainLabel, testLabel = train_test_split(
            dataFeatures, dataLabel, shuffle=True, random_state=10, test_size=test_size)
        return {"trainFeatures": trainFeatures, "testFeatures": testFeatures,
                "trainLabel": trainLabel, "testLabel": testLabel}
class PreProcessing:
    '''
    Feature preprocessing helpers: scaling, label encoding, PCA reduction,
    manual ordinal encoding and column clean-up.

    Most methods modify the object they are given in place and return None.
    '''

    # NOTE: these caches are class attributes, so they are shared by every
    # PreProcessing instance.
    # feature name -> LabelEncoder fitted by encode()
    encoders = {}
    # feature name -> scaler fitted by scale()
    scalingCache = {}
    # feature name -> values seen by encode(), plus the "Unknown" placeholder
    featuresUniqueValues = {}
    # ride service name -> price tier
    # (0: unknown, 1: cheap, 2: moderate, 3: expensive, 4: very expensive)
    nameFeatureMap = {'Taxi': 0,
                      'Shared': 1, 'UberPool': 1, 'Lyft': 1, 'WAV': 1, 'UberX': 1,
                      'UberXL': 2, 'Lyft XL': 2, 'Lux': 2,
                      'Black': 3, 'Lux Black': 3,
                      'Black SUV': 4, 'Lux Black XL': 4}
    labelFeatureMap = {'unknown': 0, 'cheap': 1, 'moderate': 2, 'expensive': 3, 'very expensive': 4}

    def scale(self, dataFeatures, type="minMax", validationFeatures=None):
        '''
        Scale every column of dataFeatures in place, one scaler per column,
        and cache the fitted scalers in scalingCache for scaleCached()

        dataFeatures : data (modified in place)
        type: method of scaling to use, "minMax" or "standardization"
        validationFeatures : -optional- data scaled in place with the scalers
            fitted on dataFeatures
        return: None
        raises: ValueError for an unsupported type
        '''
        if type == "minMax":
            scalerType = MinMaxScaler
        elif type == "standardization":
            scalerType = StandardScaler
        else:
            raise ValueError(f"Unsupported scaling type: {type!r}")

        for feature in dataFeatures:
            scaler = scalerType()
            scaler.clip = False
            trainColumn = np.reshape(dataFeatures[feature].to_numpy(), (-1, 1))
            # One fit is enough: fit_transform leaves the scaler fitted.
            dataFeatures[feature] = scaler.fit_transform(trainColumn)
            self.scalingCache[feature] = scaler
            if validationFeatures is not None:
                validationColumn = np.reshape(validationFeatures[feature].to_numpy(), (-1, 1))
                validationFeatures[feature] = scaler.transform(validationColumn)

    def scaleCached(self, dataFeatures):
        '''
        scale features in place with previously fit scalers

        dataFeatures: features to scale
        raises: KeyError when a column has no scaler in scalingCache
        '''
        for feature in dataFeatures:
            scaler = self.scalingCache[feature]
            scaler.clip = False
            column = np.reshape(dataFeatures[feature].to_numpy(), (-1, 1))
            dataFeatures[feature] = scaler.transform(column)

    def encode(self, data, features):
        '''
        change data features values to numeric values with a LabelEncoder,
        in place

        The encoder of each feature is fitted on the values present in the
        column plus an extra "Unknown" class, and kept in encoders /
        featuresUniqueValues for encode_cached()

        data: dataframe (modified in place)
        features: features to encode
        return: None
        '''
        for feature in features:
            knownValues = list(data[feature].unique())
            knownValues.append("Unknown")
            self.featuresUniqueValues[feature] = knownValues
            encoder = LabelEncoder()
            self.encoders[feature] = encoder.fit(knownValues)
            data[feature] = encoder.transform(data[feature])

    def encode_cached(self, data, features):
        '''
        Encode categorical features in place with previously fit encoders

        Values that were not seen by encode() are mapped to 'Unknown' first.

        data: Dataframe (modified in place)
        features: features to be encoded
        return: None
        '''
        for feature in features:
            # Build the lookup once per feature instead of scanning the list
            # for every value.
            knownValues = set(self.featuresUniqueValues[feature])
            data[feature] = [value if value in knownValues else 'Unknown' for value in data[feature]]
            data[feature] = self.encoders[feature].transform(data[feature])

    def reduceDimentionsOf(self, dataFeatures, reduceTo=1):
        '''
        reduce features of data to a lower dimension using PCA algorithm

        dataFeatures: features to reduce
        reduceTo: number of components to keep
        return reduced data in lower dimensions
        '''
        pca = PCA(n_components=reduceTo)
        return pca.fit_transform(dataFeatures)

    def encode_name(self, names):
        """
        DEPRECATED
        encoding the name col to numbers that represent the price for each class
        (0: unknown, 1: cheap, 2: moderate, 3: expensive, 4: very expensive);
        any other value becomes 0. Works in place.

        :param names: the column to be encoded
        :return: None
        """
        # nameFeatureMap holds exactly the service -> tier table this method
        # used to spell out, so share the implementation with encodeManually.
        self.encodeManually(names, self.nameFeatureMap)

    def encodeManually(self, dataColumn: pd.Series, labelMap: dict):
        '''
        Replace the values of dataColumn in place according to labelMap;
        anything left that is not one of labelMap's values becomes 0

        dataColumn: column to encode (modified in place)
        labelMap: original value -> encoded value
        return: None
        '''
        dataColumn.replace(labelMap, inplace=True)
        knownLabels = list(labelMap.values())
        # Work on the distinct values so the column is not iterated while it
        # is being modified, and do a single replace instead of one per row.
        unknownValues = [value for value in dataColumn.unique() if value not in knownLabels]
        if unknownValues:
            dataColumn.replace(unknownValues, 0, inplace=True)

    def drop_adjust(self, data):
        """
        dropping unnecessary columns ('date', 'id', 'product_id', 'location')
        and renaming 'name' to 'ride_class', in place

        :param data: whole dataframe (modified in place)
        :return: None
        """
        data.drop(['date', 'id', 'product_id', 'location'], axis=1, inplace=True)
        data.rename(columns={'name': 'ride_class'}, inplace=True)