dataunitylab
diff --git a/‎Code/Ops_added_to_Foofah.py
+225 b/‎Code/Ops_added_to_Foofah.py
+225
diff --git a/‎Code/Table.py
+77 b/‎Code/Table.py
+77
diff --git a/‎Code/Table_Pair.py
+43 b/‎Code/Table_Pair.py
+43
@@ -0,0 +1,225 @@
+import re
+import csv
+from collections import OrderedDict
+import numpy as np
+from .prune_rules import contains_empty_col, add_empty_col
+import datetime
+import random
+import string
+from textblob import TextBlob
+import featuretools as ft
+import pandas as pd
+import nltk
+
+# Make sure the NLTK corpora this module relies on are installed locally:
+# WordNet backs the lemmatizer and 'stopwords' backs stopwords_list. A corpus
+# is downloaded only when nltk.data.find() cannot locate it, so importing the
+# module never opens the interactive downloader and does not hit the network
+# once the data is present. NLTK resource paths live under 'corpora/'.
+for _resource, _package in (('corpora/wordnet', 'wordnet'),
+                            ('corpora/stopwords', 'stopwords')):
+    try:
+        nltk.data.find(_resource)
+    except LookupError:
+        nltk.download(_package)
+from nltk.corpus import stopwords
+from nltk import stem
+
+# Shared, module-level NLP helpers used by the transformations below.
+lemmatizer = stem.WordNetLemmatizer()
+stopwords_list = set(stopwords.words('english'))
+
+
+### Text to numeric ###
+
+
+def f_count_s(table, col, char):
+    """Insert, right after column `col`, how often `char` occurs in that cell.
+
+    `char` is either a single literal character or a regular expression.
+    A one-character argument (space, full stop, question mark, newline, ...)
+    is counted literally so regex metacharacters keep their plain meaning;
+    anything longer is matched against the cell text with re.findall.
+
+    :param table: list of rows, each row a list of strings
+    :param col: index of the column to inspect
+    :param char: literal character or regular expression to count
+    :return: a new table; each row gains the count (as a string) after `col`
+    """
+    result_table = []
+    for row in table:
+        text = row[col]
+        if len(char) == 1:
+            occurrences = text.count(char)
+        else:
+            try:
+                # Pattern first, searched text second.
+                occurrences = len(re.findall(char, text))
+            except re.error:
+                # Not a valid regular expression: fall back to a literal count.
+                occurrences = text.count(char)
+        result_table.append(row[:col + 1] + [str(occurrences)] + row[col + 1:])
+    return result_table
+
+
+def f_number_of_words(table, col):
+    """Word-count feature: delegate to f_count_s with a single space as the counted token."""
+    separator = ' '
+    return f_count_s(table, col, separator)
+
+
+def f_number_of_sentences(table, col):
+    """Sentence-count feature: delegate to f_count_s with a full stop as the counted token."""
+    terminator = '.'
+    return f_count_s(table, col, terminator)
+
+
+def f_number_of_rows(table, col):
+    """Line-count feature: delegate to f_count_s with a newline as the counted token."""
+    line_break = '\n'
+    return f_count_s(table, col, line_break)
+
+
+def f_number_of_questions(table, col):
+    """Question-count feature: delegate to f_count_s with a question mark as the counted token."""
+    question_mark = '?'
+    return f_count_s(table, col, question_mark)
+
+
+def f_number_of_emails(table, col):
+    """E-mail-count feature: delegate to f_count_s with an e-mail address pattern.
+
+    The top-level-domain class is [A-Za-z]; a '|' inside a character class is
+    a literal pipe, not an alternation, so it must not appear there.
+    """
+    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+    return f_count_s(table, col, email_pattern)
+
+
+def f_number_of_urls(table, col):
+    """URL-count feature: delegate to f_count_s with an http/https URL pattern.
+
+    The pattern is a raw string so the regex escapes are not read as
+    (invalid) Python string escapes; the pattern text itself is unchanged.
+    """
+    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+    return f_count_s(table, col, url_pattern)
+
+
+def f_number_of_ips(table, col):
+    """IPv4-count feature: delegate to f_count_s with a dotted-quad pattern.
+
+    The pattern is bounded by word boundaries instead of ^...$ so addresses
+    embedded in longer text, and more than one address per cell, are counted.
+    Octet values are not range-checked.
+    """
+    ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
+    return f_count_s(table, col, ip_pattern)
+
+
+def f_number_of_phone_numbers(table, col):
+    """Phone-number-count feature: delegate to f_count_s with a phone number pattern."""
+    phone_pattern = r'[\+\d]?(\d{2,3}[-\.\s]??\d{2,3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'
+    return f_count_s(table, col, phone_pattern)
+
+
+def f_number_of_punctuations(table, col):
+    """Punctuation-count feature: delegate to f_count_s with a class of all string.punctuation characters."""
+    punctuation_class = "[" + re.escape(string.punctuation) + "]"
+    return f_count_s(table, col, punctuation_class)
+
+
+def f_number_of_stopwords(table, col):
+    """Insert, after column `col`, the number of whitespace-separated tokens found in stopwords_list.
+
+    Tokens are compared as-is (case-sensitive). The count is stored as a
+    string and the input rows are left untouched.
+    """
+    def stopword_total(text):
+        return len([token for token in text.split() if token in stopwords_list])
+
+    return [
+        record[:col + 1] + [str(stopword_total(record[col]))] + record[col + 1:]
+        for record in table
+    ]
+
+
+def f_len(table, col):
+    """Insert, after column `col`, the length of that cell as a string; returns a new table."""
+    return [
+        record[:col + 1] + [str(len(record[col]))] + record[col + 1:]
+        for record in table
+    ]
+
+
+### Text to class ###
+
+def f_exists_s(table, col, char):
+    """Insert, right after column `col`, a '1'/'0' flag telling whether `char` occurs in that cell.
+
+    `char` is either a single literal character or a regular expression.
+    A one-character argument is looked up literally so regex metacharacters
+    keep their plain meaning; anything longer is searched as a pattern.
+
+    :param table: list of rows, each row a list of strings
+    :param col: index of the column to inspect
+    :param char: literal character or regular expression to look for
+    :return: a new table; each row gains '1' or '0' after `col`
+    """
+    result_table = []
+    for row in table:
+        text = row[col]
+        if len(char) == 1:
+            found = char in text
+        else:
+            try:
+                # Pattern first, searched text second.
+                found = re.search(char, text) is not None
+            except re.error:
+                # Not a valid regular expression: fall back to a literal lookup.
+                found = char in text
+        result_table.append(row[:col + 1] + [str(int(found))] + row[col + 1:])
+    return result_table
+
+
+def f_contains_multiple_words(table, col):
+    """Multi-word flag: delegate to f_exists_s with a single space as the token to look for."""
+    separator = ' '
+    return f_exists_s(table, col, separator)
+
+
+def f_contains_multiple_sentences(table, col):
+    """Multi-sentence flag: delegate to f_exists_s with a full stop as the token to look for."""
+    terminator = '.'
+    return f_exists_s(table, col, terminator)
+
+
+def f_contains_multiple_rows(table, col):
+    """Multi-line flag: delegate to f_exists_s with a newline as the token to look for."""
+    line_break = '\n'
+    return f_exists_s(table, col, line_break)
+
+
+def f_contains_a_questions(table, col):
+    """Question flag: delegate to f_exists_s with a question mark as the token to look for."""
+    question_mark = '?'
+    return f_exists_s(table, col, question_mark)
+
+
+def f_contains_an_email(table, col):
+    """E-mail flag: delegate to f_exists_s with an e-mail address pattern.
+
+    The top-level-domain class is [A-Za-z]; a '|' inside a character class is
+    a literal pipe, not an alternation, so it must not appear there.
+    """
+    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+    return f_exists_s(table, col, email_pattern)
+
+
+def f_contains_an_url(table, col):
+    """URL flag: delegate to f_exists_s with an http/https URL pattern.
+
+    The pattern is a raw string so the regex escapes are not read as
+    (invalid) Python string escapes; the pattern text itself is unchanged.
+    """
+    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+    return f_exists_s(table, col, url_pattern)
+
+
+def f_contains_an_ip(table, col):
+    """IPv4 flag: delegate to f_exists_s with a dotted-quad pattern.
+
+    The pattern is bounded by word boundaries instead of ^...$ so an address
+    embedded in longer text is detected too. Octet values are not
+    range-checked.
+    """
+    ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
+    return f_exists_s(table, col, ip_pattern)
+
+
+def f_contains_a_phone_number(table, col):
+    """Phone-number flag: delegate to f_exists_s with a phone number pattern."""
+    phone_pattern = r'[\+\d]?(\d{2,3}[-\.\s]??\d{2,3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'
+    return f_exists_s(table, col, phone_pattern)
+
+
+def f_contains_a_punctuation(table, col):
+    """Punctuation flag: delegate to f_exists_s with a class of all string.punctuation characters.
+
+    Uses f_exists_s (a presence flag) rather than f_count_s (a count), in
+    line with the other f_contains_* transformations.
+    """
+    punctuation_class = "[" + re.escape(string.punctuation) + "]"
+    return f_exists_s(table, col, punctuation_class)
+
+
+def f_contains_a_stopword(table, col):
+    """Insert, after column `col`, a '1'/'0' flag telling whether the cell holds a stopword.
+
+    A cell is flagged when any of its whitespace-separated tokens is in
+    stopwords_list (compared as-is, case-sensitive). The flag uses the same
+    '1'/'0' encoding as f_exists_s so all f_contains_* features agree.
+    """
+    result_table = []
+    for row in table:
+        has_stopword = any(word in stopwords_list for word in row[col].split())
+        result_table.append(row[:col + 1] + [str(int(has_stopword))] + row[col + 1:])
+    return result_table
+
+
+# New Transformations (NLP Data Cleaning)
+
+def f_remove_stopwords(table, col):
+    """Insert, after column `col`, the cell text with every token found in stopwords_list dropped.
+
+    The text is split on whitespace and the surviving tokens are re-joined
+    with single spaces; tokens are compared as-is (case-sensitive).
+    """
+    cleaned_table = []
+    for record in table:
+        kept_tokens = [token for token in record[col].split() if token not in stopwords_list]
+        cleaned_table.append(record[:col + 1] + [' '.join(kept_tokens)] + record[col + 1:])
+    return cleaned_table
+
+
+def f_remove_numeric(table, col):
+    """Insert, after column `col`, the cell text with all-digit tokens dropped.
+
+    The text is split on whitespace, tokens for which str.isdigit() is true
+    are discarded, and the rest are re-joined with single spaces.
+    """
+    def strip_numbers(text):
+        return ' '.join(token for token in text.split() if not token.isdigit())
+
+    return [
+        record[:col + 1] + [strip_numbers(record[col])] + record[col + 1:]
+        for record in table
+    ]
+
+
+def f_remove_punctuation(table, col):
+    """Insert, after column `col`, the cell text with every string.punctuation character removed."""
+    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
+    return [
+        record[:col + 1] + [punctuation.sub('', record[col])] + record[col + 1:]
+        for record in table
+    ]
+
+
+def f_remove_url(table, col):
+    """Insert, after column `col`, the cell text with http(s):// and www. URLs removed.
+
+    A URL runs up to the next whitespace; the surrounding whitespace is kept.
+    """
+    url = re.compile(r'https?://\S+|www\.\S+')
+    return [
+        record[:col + 1] + [url.sub('', record[col])] + record[col + 1:]
+        for record in table
+    ]
+
+
+def f_remove_html_tags(table, col):
+    """Insert, after column `col`, the cell text with every <...> tag removed (non-greedy match)."""
+    tag = re.compile(r'<.*?>')
+    return [
+        record[:col + 1] + [tag.sub('', record[col])] + record[col + 1:]
+        for record in table
+    ]
+
+
+def f_spell_correction(table, col):
+    """Insert, after column `col`, the spell-corrected version of that cell.
+
+    The whole cell is corrected once with TextBlob.correct() and the result
+    is stored as a plain string; the input rows are left untouched.
+    """
+    result_table = []
+    for row in table:
+        # correct() returns a TextBlob; convert back to str for the table.
+        new_row = str(TextBlob(row[col]).correct())
+        result_table.append(row[:col + 1] + [new_row, ] + row[col + 1:])
+    return result_table
+
+
+def f_lemmatization(table, col):
+    """Insert, after column `col`, the cell text with every word lemmatized.
+
+    WordNetLemmatizer.lemmatize() works on a single word, so the text is
+    split on whitespace, each token is lemmatized, and the tokens are
+    re-joined with single spaces.
+    """
+    result_table = []
+    for row in table:
+        new_row = ' '.join(lemmatizer.lemmatize(word) for word in row[col].split())
+        result_table.append(row[:col + 1] + [new_row, ] + row[col + 1:])
+    return result_table
+
+
+def f_lower(table, col):
+    """Insert, after column `col`, the lower-cased cell text; returns a new table."""
+    return [
+        record[:col + 1] + [record[col].lower()] + record[col + 1:]
+        for record in table
+    ]
@@ -0,0 +1,77 @@
+import pandas as pd
+import config
+
+class Table:
+    """Wrapper around a pandas DataFrame that tracks its attributes and records.
+
+    Attributes are identified by column position and records by index value.
+    Instance state set in __init__:
+      table / headers      -- the DataFrame (not copied) and its column names
+      A / A_vecs / A_types -- column positions, unique values per column,
+                              and {position: dtype}
+      projected_table / projected_A -- current column projection (all columns
+                              until update_projected_table is called)
+      r / r_vecs           -- index values and the projected rows as an array
+    """
+    # Columns with at most this many distinct values are treated as categorical.
+    CATEGORICAL_UPPER_BOUND = config.CATEGORICAL_UPPER_BOUND
+
+    def __init__(self, df):
+        self.table = df
+        self.headers = list(df.columns)
+        self.A = list(range(0, len(df.columns)))
+        self.A_vecs = list()  # temp
+        self.set_A_vec_naive()
+        self.A_types = self.set_attribute_types(False)
+        self.fix_types()
+        self.projected_table = self.table
+        self.projected_A = self.A
+        self.r = self.table.index.to_numpy()
+        self.r_vecs = list()  # temp
+        self.set_r_vec_naive()
+
+    def set_A_vec_naive(self):
+        """Represent every attribute by the unique values of its column."""
+        self.A_vecs = [self.table[col].unique() for col in self.headers]
+
+    def set_r_vec_naive(self):
+        """Represent every record by its row of the projected table."""
+        self.r_vecs = self.projected_table.to_numpy()
+
+    def update_A_vec(self, vecs):
+        """Replace the attribute vectors with externally computed ones."""
+        self.A_vecs = vecs
+
+    def update_projected_table(self, attributes):
+        """Project the table onto the given column positions."""
+        self.projected_table = self.table.iloc[:, attributes]
+        self.projected_A = attributes
+
+    def set_attribute_types(self, use_profiler=False):
+        """Return a {column position: type} mapping for the table.
+
+        Despite its name this method only returns the mapping; callers assign
+        it to A_types. With use_profiler=False the pandas dtypes are used.
+        """
+        if use_profiler:
+            # NOTE(review): ProfileReport is not imported in this module
+            # (presumably pandas_profiling.ProfileReport); it must be brought
+            # into scope before this branch can be enabled.
+            profiler = ProfileReport(self.table)
+            var = profiler.get_description()['variables']
+            return {i: var[v]['type'] for i, v in enumerate(self.table.columns)}
+        return dict(zip(self.A, self.table.dtypes))
+
+    def _replace_column(self, position, values):
+        """Replace the column at `position` so that the dtype of `values` is kept."""
+        if hasattr(self.table, 'isetitem'):
+            # On recent pandas `.iloc[:, i] = values` writes into the existing
+            # column in place and keeps its old dtype; isetitem swaps the column.
+            self.table.isetitem(position, values)
+        else:
+            self.table.iloc[:, position] = values
+
+    def fix_types(self):
+        """Convert low-cardinality columns to 'category' and string-like object columns to str.
+
+        A_types is refreshed when at least one column was converted. The
+        wrapped DataFrame itself is modified.
+        """
+        is_change = False
+        for a in self.A_types:
+            col = self.table.iloc[:, a]
+            cardinality = len(col.unique())
+            if cardinality <= self.CATEGORICAL_UPPER_BOUND:
+                self._replace_column(a, col.astype('category'))
+                is_change = True
+            elif self.A_types[a] == 'object' and pd.api.types.is_string_dtype(col):
+                self._replace_column(a, col.astype(str))
+                is_change = True
+        if is_change:
+            self.A_types = self.set_attribute_types(False)
+
+    def get_attributes_types(self, attribute_set):
+        """Return the types of the given attribute positions, in order."""
+        return [self.A_types[a] for a in attribute_set]
+
+    def get_attributes_names(self, attribute_set):
+        """Return the column names of the given attribute positions, in order."""
+        return [self.headers[i] for i in attribute_set]
+
+    def get_feature_like_attributes(self):
+        """Return the attribute positions whose type is not 'object'."""
+        return [a for a in self.A_types if self.A_types[a] not in ['object']]
+
+    def get_row_by_id(self, row_id):
+        """Return the row at integer position `row_id`."""
+        return self.table.iloc[row_id, :]
+
+    def get_rows_by_ids(self, row_ids):
+        """Return the rows at the integer positions `row_ids`."""
+        return self.table.iloc[row_ids, :]
@@ -0,0 +1,43 @@
+from Table import Table
+import pandas as pd
+
+
+class Table_Pair:
+    """Bookkeeping for the attribute and record matches between two tables.
+
+    A match is a sequence of pairs whose first component refers to T and
+    whose second component refers to T_prime. For attributes (suffix A) and
+    records (suffix r) the pair keeps the matched components per side
+    (LHC*/RHC*) and the components of each table not covered by the match
+    (LHD*/RHD*).
+    """
+
+    def __init__(self, T: Table, T_prime: Table):
+        self.T = T
+        self.T_prime = T_prime
+        # Attribute-level matching state.
+        self.sigma_A = []
+        self.sigma_A_exact = []
+        self.LHCA = []
+        self.LHDA = []
+        self.RHCA = []
+        self.RHDA = []
+        # Record-level matching state.
+        self.sigma_r = []
+        self.sigma_r_exact = []
+        self.LHCr = []
+        self.LHDr = []
+        self.RHCr = []
+        self.RHDr = []
+
+    def update_attribute_match(self, sigma_A):
+        """Store an attribute match and project both tables onto their matched attributes."""
+        self.sigma_A = sigma_A
+        left_matched = [pair[0] for pair in sigma_A]
+        self.LHCA = left_matched
+        self.LHDA = list(set(self.T.A).difference(left_matched))
+        right_matched = [pair[1] for pair in sigma_A]
+        self.RHCA = right_matched
+        self.RHDA = list(set(self.T_prime.A).difference(right_matched))
+        self.T.update_projected_table(left_matched)
+        self.T_prime.update_projected_table(right_matched)
+
+    def update_exact_attribute_match(self, sigma_A_exact):
+        """Store the exact attribute match as given."""
+        self.sigma_A_exact = sigma_A_exact
+
+    def update_record_match(self, sigma_r):
+        """Store a record match and derive the matched / unmatched records of both tables."""
+        self.sigma_r = sigma_r
+        left_matched = [pair[0] for pair in sigma_r]
+        self.LHCr = left_matched
+        self.LHDr = list(set(self.T.r).difference(left_matched))
+        right_matched = [pair[1] for pair in sigma_r]
+        self.RHCr = right_matched
+        self.RHDr = list(set(self.T_prime.r).difference(right_matched))
+
+    def update_exact_record_match(self, sigma_r_exact):
+        """Store the exact record match as given."""
+        self.sigma_r_exact = sigma_r_exact