@@ -24,6 +24,8 @@ def __init__(self, origdst, synthdst):

    def to_cat(dtr, dts):

        target_cols = list(dtr.columns[11:-3])
+        target_cols.insert(0, dtr.columns[1])  # channel
+        target_cols.insert(0, dtr.columns[2])  # program_title
        target_cols.insert(0, dtr.columns[3])  # genre

        # flag_same_demographic_column_values = True
@@ -118,17 +120,28 @@ def jensen_shannon(self):
        real_cat, synth_cat = self.to_cat(self.origdst, self.synthdst)

        target_columns = list(self.origdst.columns[11:-3])
+        target_columns.append(self.origdst.columns[1])  # channel
+        target_columns.append(self.origdst.columns[2])  # program_title
        target_columns.append(self.origdst.columns[3])  # genre

        js_dict = {}

        for col in target_columns:
-            col_counts_orig = real_cat[col].value_counts(normalize=True).sort_index(ascending=True)
-            col_counts_synth = synth_cat[col].value_counts(normalize=True).sort_index(ascending=True)

-            js = distance.jensenshannon(asarray(col_counts_orig.tolist()), asarray(col_counts_synth.tolist()), base=2)
+            try:
+                col_counts_orig = real_cat[col].value_counts(normalize=True).sort_index(ascending=True)
+                col_counts_synth = synth_cat[col].value_counts(normalize=True).sort_index(ascending=True)

-            js_dict[col] = js
+                js = distance.jensenshannon(asarray(col_counts_orig.tolist()), asarray(col_counts_synth.tolist()),
+                                            base=2)
+
+                js_dict[col] = js
+
+            except:
+
+                print('For the column', col, 'you must generate the same unique values as the real dataset.')
+                print('The number of unique values that you should generate for column', col, 'is',
+                      len(self.origdst[col].unique()))

        return js_dict
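
For reference, a minimal sketch of the per-column comparison the jensen_shannon method performs, using made-up category data rather than the project's datasets. The call relies on the real and synthetic column exposing the same set of unique values; otherwise the two frequency vectors differ in length and scipy raises, which is what the try/except above guards against.

import pandas as pd
from numpy import asarray
from scipy.spatial import distance

# Hypothetical real and synthetic columns that share the same categories.
real_col = pd.Series(['drama', 'news', 'news', 'sports'])
synth_col = pd.Series(['drama', 'drama', 'news', 'sports'])

# Normalized category frequencies, sorted so both vectors align index by index.
p = real_col.value_counts(normalize=True).sort_index(ascending=True)
q = synth_col.value_counts(normalize=True).sort_index(ascending=True)

# Jensen-Shannon distance with base 2 is bounded between 0 (identical) and 1.
js = distance.jensenshannon(asarray(p.tolist()), asarray(q.tolist()), base=2)
print(js)
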
@@ -139,17 +152,28 @@ def kl_divergence(self):
        The threshold limit for this metric is a value below 2"""

        target_columns = list(self.origdst.columns[11:-3])
-        target_columns.append(self.origdst.columns[4])  # content_id
+        target_columns.append(self.origdst.columns[1])  # channel
+        target_columns.append(self.origdst.columns[2])  # program_title
+        target_columns.append(self.origdst.columns[3])  # genre

        kl_dict = {}

        for col in target_columns:
-            col_counts_orig = self.origdst[col].value_counts(normalize=True).sort_index(ascending=True)
-            col_counts_synth = self.synthdst[col].value_counts(normalize=True).sort_index(ascending=True)

-            kl = sum(rel_entr(col_counts_orig.tolist(), col_counts_synth.tolist()))
+            try:
+
+                col_counts_orig = self.origdst[col].value_counts(normalize=True).sort_index(ascending=True)
+                col_counts_synth = self.synthdst[col].value_counts(normalize=True).sort_index(ascending=True)
+
+                kl = sum(rel_entr(col_counts_orig.tolist(), col_counts_synth.tolist()))

-            kl_dict[col] = kl
+                kl_dict[col] = kl
+
+            except:
+
+                print('For the column', col, 'you must generate the same unique values as the real dataset.')
+                print('The number of unique values that you should generate for column', col, 'is',
+                      len(self.origdst[col].unique()))

        return kl_dict
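
Similarly, a small sketch of the KL computation with scipy.special.rel_entr, again on hand-made frequencies rather than the real columns: rel_entr returns the elementwise terms p*log(p/q), and their sum is the Kullback-Leibler divergence that gets stored per column.

from scipy.special import rel_entr

# Hypothetical aligned category frequencies for one column.
p = [0.25, 0.50, 0.25]  # real data
q = [0.50, 0.25, 0.25]  # synthetic data

kl = sum(rel_entr(p, q))
print(kl)  # 0.0 only when the two distributions match exactly
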
@@ -176,123 +200,127 @@ def pairwise_correlation_difference(self):

        return prwcrdst, substract_m

-if __name__ == "__main__":
-
-    logging.basicConfig(filename='evaluation.log',
-                        format='%(asctime)s %(message)s',
-                        filemode='w')
-
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-
-    ob = eval_metrics(r, ra)
-
-    # euclidean distance
-    flag_eucl = False
-    eucl, eumatr = ob.euclidean_dist()
-    logger.info('Euclidean distance was calculated')
-    print('The calculated euclidean distance is: ', eucl)
-    print('The calculated euclidean distance matrix is:', eumatr)
-    if eucl > 14:
-        logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \
-        less than 14. The current value is {eucl}')
-        logger.info(f'The Euclidean distance matrix is \n {eumatr}')
-    else:
-        logger.info('The dataset satisfies the criteria for the euclidean distance.')
-        logger.info(f'The calculated Euclidean distance value is \n {eucl}')
-        logger.info(f'The Euclidean distance matrix is \n {eumatr}')
-        flag_eucl = True
-    logger.info('---------------------------------------------------------')
-
-    # 2 sample Kolmogorov-Smirnov test
-    kst = ob.kolmogorov()
-
-    p_value = 0.05
-    flag_klg = False
-    logger.info('Kolmogorov-Smirnov test was performed')
-    print('The results of the Kolmogorov-Smirnov test is:', kst)
-    rejected = {}
-    for col in kst:
-        if kst[col]['p-value'] < p_value:
-            rejected[col] = kst[col]
-    if rejected:
-        logger.info('The dataset did not pass the Kolmogorov-Smirnov test')
-        logger.info(f'The columns that did not pass the test are \n {rejected}')
-        logger.info(f'The overall performance for the test is \n {kst}')
-    else:
-        logger.info('The dataset passed the Kolmogorov-Smirnov test')
-        logger.info(f'The overall performance for the test is \n {kst}')
-        flag_klg = True
-    logger.info('---------------------------------------------------------')
-
-    # Jensen-Shannon Divergence
-    dict_js = ob.jensen_shannon()
-    logger.info('Jensen-Shannon Divergence was calculated')
-    print('The result of the Jensen-Shannon Divergence is:', dict_js)
-    flag_js = False
-
-    jsd = deepcopy(dict_js)
-
-    for key in list(dict_js):
-        if (dict_js[key] < 0.50) & (key != 'CONTENT_ID'):
+
+if __name__ == "__main__":
+
+    logging.basicConfig(filename='evaluation.log',
+                        format='%(asctime)s %(message)s',
+                        filemode='w')
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    ob = eval_metrics(r, ra)
+
+    # euclidean distance
+    flag_eucl = False
+    eucl, eumatr = ob.euclidean_dist()
+    logger.info('Euclidean distance was calculated')
+    print('The calculated euclidean distance is: ', eucl)
+    print('The calculated euclidean distance matrix is:', eumatr)
+    if eucl > 14:
+        logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high; it should be \
+        less than 14. The current value is {eucl}')
+        logger.info(f'The Euclidean distance matrix is \n {eumatr}')
+    else:
+        logger.info('The dataset satisfies the criteria for the euclidean distance.')
+        logger.info(f'The calculated Euclidean distance value is \n {eucl}')
+        logger.info(f'The Euclidean distance matrix is \n {eumatr}')
+        flag_eucl = True
+    logger.info('---------------------------------------------------------')
+
+    # 2 sample Kolmogorov-Smirnov test
+    kst = ob.kolmogorov()
+
+    p_value = 0.05
+    flag_klg = False
+    logger.info('Kolmogorov-Smirnov test was performed')
+    print('The results of the Kolmogorov-Smirnov test are:', kst)
+    rejected = {}
+    for col in kst:
+        if kst[col]['p-value'] < p_value:
+            rejected[col] = kst[col]
+    if rejected:
+        logger.info('The dataset did not pass the Kolmogorov-Smirnov test')
+        logger.info(f'The columns that did not pass the test are \n {rejected}')
+        logger.info(f'The overall performance for the test is \n {kst}')
+    else:
+        logger.info('The dataset passed the Kolmogorov-Smirnov test')
+        logger.info(f'The overall performance for the test is \n {kst}')
+        flag_klg = True
+    logger.info('---------------------------------------------------------')
+
+    # Jensen-Shannon Divergence
+    dict_js = ob.jensen_shannon()
+    logger.info('Jensen-Shannon Divergence was calculated')
+    print('The result of the Jensen-Shannon Divergence is:', dict_js)
+    flag_js = False
+
+    jsd = deepcopy(dict_js)
+
+    for key in list(dict_js):
+        if (dict_js[key] < 0.50) & (key not in ['GENRE', 'PROGRAM_TITLE']):
+            del dict_js[key]
+        if key == 'GENRE':
+            if (dict_js[key] < 0.59):
                del dict_js[key]
-        if key == 'CONTENT_ID':
-            if (dict_js[key] < 0.75):
-                del dict_js[key]
-
-    if dict_js:
-        logger.info('The dataset did not pass the Jensen-Shannon Divergence test')
-        for key in dict_js.keys():
-            logger.info(f'The Jensen-Shannon Divergence value for the column {key} was {dict_js[key]}')
-        logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
-    else:
-        logger.info('The dataset passed the Jensen-Shannon Divergence test')
-        logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
-        flag_js = True
-    logger.info('---------------------------------------------------------')
-
-    # KL divergence
-    dict_kl = ob.kl_divergence()
-    logger.info('KL divergence was calculated')
-    print('The result of the KL divergence is', dict_kl)
-    flag_kl = False
-
-    kl = deepcopy(dict_kl)
-
-    for key in list(dict_kl):
-        if dict_kl[key] < 2.20:
-            del dict_kl[key]
-
-    if dict_kl:
-        logger.info('The dataset did not pass the KL divergence evaluation test')
-        for key in dict_kl.keys():
-            logger.info(f'The KL divergence value for the column {key} was {dict_kl[key]}')
-        logger.info(f'The overall for the KL divergence performance for each column is summarized below: \n {kl}')
-    else:
-        logger.info('The dataset passed the KL divergence evaluation test')
-        logger.info(f'The overall performance for the KL divergence for each column is summarized below: \n {kl}')
-        flag_kl = True
-    logger.info('---------------------------------------------------------')
-
-    # pairwise correlation difference
-    pair_corr_diff, pcd_matr = ob.pairwise_correlation_difference()
-    logger.info('Pairwise correlation difference was calculated')
-    print('The calculated Pairwise correlation difference was', pair_corr_diff)
-    print('The calculated Pairwise correlation difference matrix was', pcd_matr)
-
-    flag_pcd = False
-    if pair_corr_diff > 2.4:
-        logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \
-        less than 14. The current value is {pair_corr_diff}')
-        logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}')
-    else:
-        logger.info('The dataaset satisfies the criteria for the Pairwise Correlation Difference.')
-        logger.info(f'The Pairwise distance distance value is \n {pair_corr_diff}')
-        logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}')
-        flag_pcd = True
-
-    if (flag_eucl & flag_js & flag_klg & flag_kl & flag_pcd):
-        logger.info('The dataaset satisfies the minimum evaluation criteria.')
-    else:
-        logger.info('The dataaset does not satisfy the minimum evaluation criteria.')
-        logger.info('Plese check the previous log messages.')
+        if key == 'PROGRAM_TITLE':
+            if (dict_js[key] < 0.69):
+                del dict_js[key]
+
+    if dict_js:
+        logger.info('The dataset did not pass the Jensen-Shannon Divergence test')
+        for key in dict_js.keys():
+            logger.info(f'The Jensen-Shannon Divergence value for the column {key} was {dict_js[key]}')
+        logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
+    else:
+        logger.info('The dataset passed the Jensen-Shannon Divergence test')
+        logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
+        flag_js = True
+    logger.info('---------------------------------------------------------')
+
+    # KL divergence
+    dict_kl = ob.kl_divergence()
+    logger.info('KL divergence was calculated')
+    print('The result of the KL divergence is', dict_kl)
+    flag_kl = False
+
+    kl = deepcopy(dict_kl)
+
+    for key in list(dict_kl):
+        if dict_kl[key] < 2.20:
+            del dict_kl[key]
+
+    if dict_kl:
+        logger.info('The dataset did not pass the KL divergence evaluation test')
+        for key in dict_kl.keys():
+            logger.info(f'The KL divergence value for the column {key} was {dict_kl[key]}')
+        logger.info(f'The overall performance for the KL divergence for each column is summarized below: \n {kl}')
+    else:
+        logger.info('The dataset passed the KL divergence evaluation test')
+        logger.info(f'The overall performance for the KL divergence for each column is summarized below: \n {kl}')
+        flag_kl = True
+    logger.info('---------------------------------------------------------')
+
+    # pairwise correlation difference
+    pair_corr_diff, pcd_matr = ob.pairwise_correlation_difference()
+    logger.info('Pairwise correlation difference was calculated')
+    print('The calculated Pairwise correlation difference was', pair_corr_diff)
+    print('The calculated Pairwise correlation difference matrix was', pcd_matr)
+
+    flag_pcd = False
+    if pair_corr_diff > 2.4:
+        logger.error(f'The calculated Pairwise Correlation Difference between the two correlation matrices is too high; it should be \
+        less than 2.4. The current value is {pair_corr_diff}')
+        logger.info(f'The Pairwise Correlation Difference matrix is \n {pcd_matr}')
+    else:
+        logger.info('The dataset satisfies the criteria for the Pairwise Correlation Difference.')
+        logger.info(f'The Pairwise Correlation Difference value is \n {pair_corr_diff}')
+        logger.info(f'The Pairwise Correlation Difference matrix is \n {pcd_matr}')
+        flag_pcd = True
+
+    if (flag_eucl & flag_js & flag_klg & flag_kl & flag_pcd):
+        logger.info('The dataset satisfies the minimum evaluation criteria.')
+    else:
+        logger.info('The dataset does not satisfy the minimum evaluation criteria.')
+        logger.info('Please check the previous log messages.')
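
The __main__ block builds eval_metrics(r, ra) from two pandas DataFrames named r (real) and ra (synthetic). If they are not created elsewhere in the script, they might be loaded along the following lines, a minimal sketch in which the CSV paths are placeholders rather than the project's actual file names.

import pandas as pd

# Hypothetical input files; the real project may load these differently.
r = pd.read_csv('real_dataset.csv')        # real dataset
ra = pd.read_csv('synthetic_dataset.csv')  # synthetic dataset

# The script then constructs the evaluator as in the block above:
# ob = eval_metrics(r, ra)
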