Skip to content

Commit 5ac547c

Browse files
authored
Merge pull request #4 from kpnDataScienceLab/master
Added channel and program title to the columns to be encoded as categ…
2 parents cc32716 + 6469898 commit 5ac547c

File tree

1 file changed

+156
-128
lines changed

1 file changed

+156
-128
lines changed

evaluation.py

Lines changed: 156 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ def __init__(self, origdst, synthdst):
2424
def to_cat(dtr, dts):
2525

2626
target_cols = list(dtr.columns[11:-3])
27+
target_cols.insert(0, dtr.columns[1]) # channel
28+
target_cols.insert(0, dtr.columns[2]) # program_title
2729
target_cols.insert(0, dtr.columns[3]) # genre
2830

2931
# flag_same_demographic_column_values = True
@@ -118,17 +120,28 @@ def jensen_shannon(self):
118120
real_cat, synth_cat = self.to_cat(self.origdst, self.synthdst)
119121

120122
target_columns = list(self.origdst.columns[11:-3])
123+
target_columns.append(self.origdst.columns[1]) # channel
124+
target_columns.append(self.origdst.columns[2]) # program_title
121125
target_columns.append(self.origdst.columns[3]) # genre
122126

123127
js_dict = {}
124128

125129
for col in target_columns:
126-
col_counts_orig = real_cat[col].value_counts(normalize=True).sort_index(ascending=True)
127-
col_counts_synth = synth_cat[col].value_counts(normalize=True).sort_index(ascending=True)
128130

129-
js = distance.jensenshannon(asarray(col_counts_orig.tolist()), asarray(col_counts_synth.tolist()), base=2)
131+
try:
132+
col_counts_orig = real_cat[col].value_counts(normalize=True).sort_index(ascending=True)
133+
col_counts_synth = synth_cat[col].value_counts(normalize=True).sort_index(ascending=True)
130134

131-
js_dict[col] = js
135+
js = distance.jensenshannon(asarray(col_counts_orig.tolist()), asarray(col_counts_synth.tolist()),
136+
base=2)
137+
138+
js_dict[col] = js
139+
140+
except:
141+
142+
print('For the column ', col, ' you must generate the same unique values as the real dataset.')
143+
print('The number of unique values that you should generate for column ', col, 'is ',
144+
len(self.origdst[col].unique()))
132145

133146
return js_dict
134147

@@ -139,17 +152,28 @@ def kl_divergence(self):
139152
The threshold limit for this metric is a value below 2"""
140153

141154
target_columns = list(self.origdst.columns[11:-3])
142-
target_columns.append(self.origdst.columns[4]) # content_id
155+
target_columns.append(self.origdst.columns[1]) # channel
156+
target_columns.append(self.origdst.columns[2]) # program_title
157+
target_columns.append(self.origdst.columns[3]) # genre
143158

144159
kl_dict = {}
145160

146161
for col in target_columns:
147-
col_counts_orig = self.origdst[col].value_counts(normalize=True).sort_index(ascending=True)
148-
col_counts_synth = self.synthdst[col].value_counts(normalize=True).sort_index(ascending=True)
149162

150-
kl = sum(rel_entr(col_counts_orig.tolist(), col_counts_synth.tolist()))
163+
try:
164+
165+
col_counts_orig = self.origdst[col].value_counts(normalize=True).sort_index(ascending=True)
166+
col_counts_synth = self.synthdst[col].value_counts(normalize=True).sort_index(ascending=True)
167+
168+
kl = sum(rel_entr(col_counts_orig.tolist(), col_counts_synth.tolist()))
151169

152-
kl_dict[col] = kl
170+
kl_dict[col] = kl
171+
172+
except:
173+
174+
print('For the column ', col, ' you must generate the same unique values as the real dataset.')
175+
print('The number of unique values that you should generate for column ', col, 'is ',
176+
len(self.origdst[col].unique()))
153177

154178
return kl_dict
155179

@@ -176,123 +200,127 @@ def pairwise_correlation_difference(self):
176200

177201
return prwcrdst, substract_m
178202

179-
if __name__ == "__main__":
180-
181-
logging.basicConfig(filename='evaluation.log',
182-
format='%(asctime)s %(message)s',
183-
filemode='w')
184-
185-
logger = logging.getLogger()
186-
logger.setLevel(logging.INFO)
187-
188-
ob = eval_metrics(r, ra)
189-
190-
# euclidean distance
191-
flag_eucl = False
192-
eucl, eumatr = ob.euclidean_dist()
193-
logger.info('Euclidean distance was calculated')
194-
print('The calculated euclidean distance is: ', eucl)
195-
print('The calculated euclidean distance matrix is:', eumatr)
196-
if eucl > 14:
197-
logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \
198-
less than 14. The current value is {eucl}')
199-
logger.info(f'The Euclidean distance matrix is \n {eumatr}')
200-
else:
201-
logger.info('The dataset satisfies the criteria for the euclidean distance.')
202-
logger.info(f'The calculated Euclidean distance value is \n {eucl}')
203-
logger.info(f'The Euclidean distance matrix is \n {eumatr}')
204-
flag_eucl = True
205-
logger.info('---------------------------------------------------------')
206-
207-
# 2 sample Kolmogorov-Smirnov test
208-
kst = ob.kolmogorov()
209-
210-
p_value = 0.05
211-
flag_klg = False
212-
logger.info('Kolmogorov-Smirnov test was performed')
213-
print('The results of the Kolmogorov-Smirnov test is:', kst)
214-
rejected = {}
215-
for col in kst:
216-
if kst[col]['p-value'] < p_value:
217-
rejected[col] = kst[col]
218-
if rejected:
219-
logger.info('The dataset did not pass the Kolmogorov-Smirnov test')
220-
logger.info(f'The columns that did not pass the test are \n {rejected}')
221-
logger.info(f'The overall performance for the test is \n {kst}')
222-
else:
223-
logger.info('The dataset passed the Kolmogorov-Smirnov test')
224-
logger.info(f'The overall performance for the test is \n {kst}')
225-
flag_klg = True
226-
logger.info('---------------------------------------------------------')
227-
228-
# Jensen-Shannon Divergence
229-
dict_js = ob.jensen_shannon()
230-
logger.info('Jensen-Shannon Divergence was calculated')
231-
print('The result of the Jensen-Shannon Divergence is:', dict_js)
232-
flag_js = False
233-
234-
jsd = deepcopy(dict_js)
235-
236-
for key in list(dict_js):
237-
if (dict_js[key] < 0.50) & (key != 'CONTENT_ID'):
203+
204+
if __name__ == "__main__":
205+
206+
logging.basicConfig(filename='evaluation.log',
207+
format='%(asctime)s %(message)s',
208+
filemode='w')
209+
210+
logger = logging.getLogger()
211+
logger.setLevel(logging.INFO)
212+
213+
ob = eval_metrics(r, ra)
214+
215+
# euclidean distance
216+
flag_eucl = False
217+
eucl, eumatr = ob.euclidean_dist()
218+
logger.info('Euclidean distance was calculated')
219+
print('The calculated euclidean distance is: ', eucl)
220+
print('The calculated euclidean distance matrix is:', eumatr)
221+
if eucl > 14:
222+
logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \
223+
less than 14. The current value is {eucl}')
224+
logger.info(f'The Euclidean distance matrix is \n {eumatr}')
225+
else:
226+
logger.info('The dataset satisfies the criteria for the euclidean distance.')
227+
logger.info(f'The calculated Euclidean distance value is \n {eucl}')
228+
logger.info(f'The Euclidean distance matrix is \n {eumatr}')
229+
flag_eucl = True
230+
logger.info('---------------------------------------------------------')
231+
232+
# 2 sample Kolmogorov-Smirnov test
233+
kst = ob.kolmogorov()
234+
235+
p_value = 0.05
236+
flag_klg = False
237+
logger.info('Kolmogorov-Smirnov test was performed')
238+
print('The results of the Kolmogorov-Smirnov test is:', kst)
239+
rejected = {}
240+
for col in kst:
241+
if kst[col]['p-value'] < p_value:
242+
rejected[col] = kst[col]
243+
if rejected:
244+
logger.info('The dataset did not pass the Kolmogorov-Smirnov test')
245+
logger.info(f'The columns that did not pass the test are \n {rejected}')
246+
logger.info(f'The overall performance for the test is \n {kst}')
247+
else:
248+
logger.info('The dataset passed the Kolmogorov-Smirnov test')
249+
logger.info(f'The overall performance for the test is \n {kst}')
250+
flag_klg = True
251+
logger.info('---------------------------------------------------------')
252+
253+
# Jensen-Shannon Divergence
254+
dict_js = ob.jensen_shannon()
255+
logger.info('Jensen-Shannon Divergence was calculated')
256+
print('The result of the Jensen-Shannon Divergence is:', dict_js)
257+
flag_js = False
258+
259+
jsd = deepcopy(dict_js)
260+
261+
for key in list(dict_js):
262+
if (dict_js[key] < 0.50) & (key not in ['GENRE', 'PROGRAM_TITLE']):
263+
del dict_js[key]
264+
if key == 'GENRE':
265+
if (dict_js[key] < 0.59):
238266
del dict_js[key]
239-
if key == 'CONTENT_ID':
240-
if (dict_js[key] < 0.75):
241-
del dict_js[key]
242-
243-
if dict_js:
244-
logger.info('The dataset did not pass the Jensen-Shannon Divergence test')
245-
for key in dict_js.keys():
246-
logger.info(f'The Jensen-Shannon Divergence value for the column {key} was {dict_js[key]}')
247-
logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
248-
else:
249-
logger.info('The dataset passed the Jensen-Shannon Divergence test')
250-
logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
251-
flag_js = True
252-
logger.info('---------------------------------------------------------')
253-
254-
# KL divergence
255-
dict_kl = ob.kl_divergence()
256-
logger.info('KL divergence was calculated')
257-
print('The result of the KL divergence is', dict_kl)
258-
flag_kl = False
259-
260-
kl = deepcopy(dict_kl)
261-
262-
for key in list(dict_kl):
263-
if dict_kl[key] < 2.20:
264-
del dict_kl[key]
265-
266-
if dict_kl:
267-
logger.info('The dataset did not pass the KL divergence evaluation test')
268-
for key in dict_kl.keys():
269-
logger.info(f'The KL divergence value for the column {key} was {dict_kl[key]}')
270-
logger.info(f'The overall for the KL divergence performance for each column is summarized below: \n {kl}')
271-
else:
272-
logger.info('The dataset passed the KL divergence evaluation test')
273-
logger.info(f'The overall performance for the KL divergence for each column is summarized below: \n {kl}')
274-
flag_kl = True
275-
logger.info('---------------------------------------------------------')
276-
277-
# pairwise correlation difference
278-
pair_corr_diff, pcd_matr = ob.pairwise_correlation_difference()
279-
logger.info('Pairwise correlation difference was calculated')
280-
print('The calculated Pairwise correlation difference was', pair_corr_diff)
281-
print('The calculated Pairwise correlation difference matrix was', pcd_matr)
282-
283-
flag_pcd = False
284-
if pair_corr_diff > 2.4:
285-
logger.error(f'The calculated Euclidean distance value between the two correlation matrices is too high it should be \
286-
less than 14. The current value is {pair_corr_diff}')
287-
logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}')
288-
else:
289-
logger.info('The dataaset satisfies the criteria for the Pairwise Correlation Difference.')
290-
logger.info(f'The Pairwise distance distance value is \n {pair_corr_diff}')
291-
logger.info(f'The Pairwise distance distance matrix is \n {pcd_matr}')
292-
flag_pcd = True
293-
294-
if (flag_eucl & flag_js & flag_klg & flag_kl & flag_pcd):
295-
logger.info('The dataaset satisfies the minimum evaluation criteria.')
296-
else:
297-
logger.info('The dataaset does not satisfy the minimum evaluation criteria.')
298-
logger.info('Plese check the previous log messages.')
267+
if key == 'PROGRAM_TITLE':
268+
if (dict_js[key] < 0.69):
269+
del dict_js[key]
270+
271+
if dict_js:
272+
logger.info('The dataset did not pass the Jensen-Shannon Divergence test')
273+
for key in dict_js.keys():
274+
logger.info(f'The Jensen-Shannon Divergence value for the column {key} was {dict_js[key]}')
275+
logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
276+
else:
277+
logger.info('The dataset passed the Jensen-Shannon Divergence test')
278+
logger.info(f'The overall performance for each column is summarized below: \n {jsd}')
279+
flag_js = True
280+
logger.info('---------------------------------------------------------')
281+
282+
# KL divergence
283+
dict_kl = ob.kl_divergence()
284+
logger.info('KL divergence was calculated')
285+
print('The result of the KL divergence is', dict_kl)
286+
flag_kl = False
287+
288+
kl = deepcopy(dict_kl)
289+
290+
for key in list(dict_kl):
291+
if dict_kl[key] < 2.20:
292+
del dict_kl[key]
293+
294+
if dict_kl:
295+
logger.info('The dataset did not pass the KL divergence evaluation test')
296+
for key in dict_kl.keys():
297+
logger.info(f'The KL divergence value for the column {key} was {dict_kl[key]}')
298+
logger.info(f'The overall for the KL divergence performance for each column is summarized below: \n {kl}')
299+
else:
300+
logger.info('The dataset passed the KL divergence evaluation test')
301+
logger.info(f'The overall performance for the KL divergence for each column is summarized below: \n {kl}')
302+
flag_kl = True
303+
logger.info('---------------------------------------------------------')
304+
305+
# pairwise correlation difference
306+
pair_corr_diff, pcd_matr = ob.pairwise_correlation_difference()
307+
logger.info('Pairwise correlation difference was calculated')
308+
print('The calculated Pairwise correlation difference was', pair_corr_diff)
309+
print('The calculated Pairwise correlation difference matrix was', pcd_matr)
310+
311+
flag_pcd = False
312+
if pair_corr_diff > 2.4:
313+
logger.error(f'The calculated Pairwise Correlation Difference between the two correlation matrices is too high; it should be \
314+
less than 2.4. The current value is {pair_corr_diff}')
315+
logger.info(f'The Pairwise distance matrix is \n {pcd_matr}')
316+
else:
317+
logger.info('The dataset satisfies the criteria for the Pairwise Correlation Difference.')
318+
logger.info(f'The Pairwise distance value is \n {pair_corr_diff}')
319+
logger.info(f'The Pairwise distance matrix is \n {pcd_matr}')
320+
flag_pcd = True
321+
322+
if (flag_eucl & flag_js & flag_klg & flag_kl & flag_pcd):
323+
logger.info('The dataset satisfies the minimum evaluation criteria.')
324+
else:
325+
logger.info('The dataset does not satisfy the minimum evaluation criteria.')
326+
logger.info('Please check the previous log messages.')

0 commit comments

Comments
 (0)