Skip to content

Commit 28de7cd

Browse files
authored
Add files via upload
0 parents  commit 28de7cd

18 files changed

+8117
-0
lines changed

Code/Ops_added_to_Foofah.py

+225
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
import re
2+
import csv
3+
from collections import OrderedDict
4+
import numpy as np
5+
from .prune_rules import contains_empty_col, add_empty_col
6+
import datetime
7+
import random
8+
import string
9+
from textblob import TextBlob
10+
import featuretools as ft
11+
import pandas as pd
12+
import nltk
13+
14+
# Import-time NLTK bootstrap: make sure the data this module needs is present.
# NOTE(review): find('') presumably only probes for an existing NLTK data
# directory, and a bare nltk.download() opens NLTK's interactive downloader
# -- confirm this is wanted at import time.
try:
    nltk.data.find('')
except LookupError:
    nltk.download()
from nltk.corpus import stopwords

# NLTK corpora are addressed as 'corpora/<name>'. The previous id
# 'corpus/stopwords' never resolved, so the download ran on every import.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
from nltk import stem

# NOTE(review): WordNetLemmatizer needs the 'wordnet' corpus when it is first
# used; nothing here fetches it explicitly -- verify it is installed.
lemmatizer = stem.WordNetLemmatizer()
stopwords_list = set(stopwords.words('english'))
28+
29+
30+
### Text to numeric ###
31+
32+
33+
def f_count_s(table, col, char):
    """Insert, right after column ``col``, how often ``char`` occurs in it.

    Args:
        table: list of rows, each row a list of strings.
        col: index of the column to inspect.
        char: the needle. A single character is always counted literally
            (so '.' and '?' mean the punctuation marks, not regex
            metacharacters). Anything longer is treated as a regular
            expression; if it does not compile it is counted literally.

    Returns:
        A new table whose rows carry the count (as a string) at position
        ``col + 1``. The input rows are not modified.
    """
    # The original call had pattern and subject swapped
    # (re.findall(row[col], char)); the needle is the pattern, the cell
    # is the text being searched.
    pattern = None
    if len(char) != 1:
        try:
            pattern = re.compile(char)
        except re.error:
            pattern = None  # not a valid regex: fall back to literal count

    result_table = []
    for row in table:
        cell = row[col]
        if pattern is None:
            count = cell.count(char)
        else:
            count = len(pattern.findall(cell))
        result_table.append(row[:col + 1] + [str(count)] + row[col + 1:])
    return result_table
42+
43+
44+
def f_number_of_words(table, col):
    """Delegate to ``f_count_s`` with a single space as the ``char`` argument.

    Returns whatever ``f_count_s`` returns for column ``col`` of ``table``.
    """
    separator = ' '
    return f_count_s(table, col, separator)
46+
47+
48+
def f_number_of_sentences(table, col):
    """Delegate to ``f_count_s`` with a full stop as the ``char`` argument.

    Returns whatever ``f_count_s`` returns for column ``col`` of ``table``.
    """
    terminator = '.'
    return f_count_s(table, col, terminator)
50+
51+
52+
def f_number_of_rows(table, col):
    """Delegate to ``f_count_s`` with a newline character as ``char``.

    Returns whatever ``f_count_s`` returns for column ``col`` of ``table``.
    """
    line_break = '\n'
    return f_count_s(table, col, line_break)
54+
55+
56+
def f_number_of_questions(table, col):
    """Delegate to ``f_count_s`` with a question mark as ``char``.

    Returns whatever ``f_count_s`` returns for column ``col`` of ``table``.
    """
    question_mark = '?'
    return f_count_s(table, col, question_mark)
58+
59+
60+
def f_number_of_emails(table, col):
    """Delegate to ``f_count_s`` with an e-mail address regular expression.

    Returns whatever ``f_count_s`` returns for column ``col`` of ``table``.
    """
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return f_count_s(table, col, email_pattern)
62+
63+
64+
def f_number_of_urls(table, col):
    """Delegate to ``f_count_s`` with an http/https URL regular expression.

    Returns whatever ``f_count_s`` returns for column ``col`` of ``table``.
    """
    # Raw string: the pattern holds regex escapes for the parentheses, which
    # are invalid escape sequences in a normal literal. The string value is
    # unchanged.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return f_count_s(table, col, url_pattern)
66+
67+
68+
def f_number_of_ips(table, col):
    """Delegate to ``f_count_s`` with a dotted-quad regular expression.

    Returns whatever ``f_count_s`` returns for column ``col`` of ``table``.
    """
    # NOTE(review): the pattern is anchored with ^ and $, so it can only
    # match when the whole cell is a dotted quad -- confirm this is intended
    # for a "number of" feature.
    ip_pattern = r'^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$'
    return f_count_s(table, col, ip_pattern)
70+
71+
72+
def f_number_of_phone_numbers(table, col):
    """Delegate to ``f_count_s`` with a phone-number regular expression.

    Returns whatever ``f_count_s`` returns for column ``col`` of ``table``.
    """
    phone_pattern = r'[\+\d]?(\d{2,3}[-\.\s]??\d{2,3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'
    return f_count_s(table, col, phone_pattern)
75+
76+
77+
def f_number_of_punctuations(table, col):
    """Delegate to ``f_count_s`` with a character class of ``string.punctuation``.

    Returns whatever ``f_count_s`` returns for column ``col`` of ``table``.
    """
    escaped_marks = re.escape(string.punctuation)
    punctuation_class = "[" + escaped_marks + "]"
    return f_count_s(table, col, punctuation_class)
79+
80+
81+
def f_number_of_stopwords(table, col):
    """Insert, after column ``col``, the number of stopwords found in it.

    The cell is split on whitespace and every token that is a member of the
    module-level ``stopwords_list`` is counted. The count is stored as a
    string at position ``col + 1`` of each new row.
    """
    augmented = []
    for record in table:
        hits = sum(1 for token in record[col].split() if token in stopwords_list)
        augmented.append(record[:col + 1] + [str(hits)] + record[col + 1:])
    return augmented
88+
89+
90+
def f_len(table, col):
    """Insert, after column ``col``, the length of that cell as a string.

    Returns a new table; each new row has one extra entry at ``col + 1``.
    """
    return [
        record[:col + 1] + [str(len(record[col]))] + record[col + 1:]
        for record in table
    ]
96+
97+
98+
### Text to class ###
99+
100+
def f_exists_s(table, col, char):
    """Insert, right after column ``col``, a '1'/'0' flag for ``char``.

    Args:
        table: list of rows, each row a list of strings.
        col: index of the column to inspect.
        char: the needle. A single character is always matched literally
            (so '.' and '?' mean the punctuation marks). Anything longer is
            treated as a regular expression; if it does not compile it is
            matched literally.

    Returns:
        A new table whose rows carry '1' (found) or '0' (not found) at
        position ``col + 1``. The input rows are not modified.
    """
    # The original call had pattern and subject swapped
    # (re.findall(row[col], char)); the needle is the pattern, the cell
    # is the text being searched.
    pattern = None
    if len(char) != 1:
        try:
            pattern = re.compile(char)
        except re.error:
            pattern = None  # not a valid regex: fall back to substring test

    result_table = []
    for row in table:
        cell = row[col]
        if pattern is None:
            found = char in cell
        else:
            found = pattern.search(cell) is not None
        result_table.append(row[:col + 1] + [str(int(found))] + row[col + 1:])
    return result_table
106+
107+
108+
def f_contains_multiple_words(table, col):
    """Delegate to ``f_exists_s`` with a single space as the ``char`` argument.

    Returns whatever ``f_exists_s`` returns for column ``col`` of ``table``.
    """
    separator = ' '
    return f_exists_s(table, col, separator)
110+
111+
112+
def f_contains_multiple_sentences(table, col):
    """Delegate to ``f_exists_s`` with a full stop as the ``char`` argument.

    Returns whatever ``f_exists_s`` returns for column ``col`` of ``table``.
    """
    terminator = '.'
    return f_exists_s(table, col, terminator)
114+
115+
116+
def f_contains_multiple_rows(table, col):
    """Delegate to ``f_exists_s`` with a newline character as ``char``.

    Returns whatever ``f_exists_s`` returns for column ``col`` of ``table``.
    """
    line_break = '\n'
    return f_exists_s(table, col, line_break)
118+
119+
120+
def f_contains_a_questions(table, col):
    """Delegate to ``f_exists_s`` with a question mark as ``char``.

    Returns whatever ``f_exists_s`` returns for column ``col`` of ``table``.
    """
    question_mark = '?'
    return f_exists_s(table, col, question_mark)
122+
123+
124+
def f_contains_an_email(table, col):
    """Delegate to ``f_exists_s`` with an e-mail address regular expression.

    Returns whatever ``f_exists_s`` returns for column ``col`` of ``table``.
    """
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return f_exists_s(table, col, email_pattern)
126+
127+
128+
def f_contains_an_url(table, col):
    """Delegate to ``f_exists_s`` with an http/https URL regular expression.

    Returns whatever ``f_exists_s`` returns for column ``col`` of ``table``.
    """
    # Raw string: the pattern holds regex escapes for the parentheses, which
    # are invalid escape sequences in a normal literal. The string value is
    # unchanged.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return f_exists_s(table, col, url_pattern)
130+
131+
132+
def f_contains_an_ip(table, col):
    """Delegate to ``f_exists_s`` with a dotted-quad regular expression.

    Returns whatever ``f_exists_s`` returns for column ``col`` of ``table``.
    """
    # NOTE(review): the pattern is anchored with ^ and $, so it can only
    # match when the whole cell is a dotted quad -- confirm that "contains"
    # is meant this strictly.
    ip_pattern = r'^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$'
    return f_exists_s(table, col, ip_pattern)
134+
135+
136+
def f_contains_a_phone_number(table, col):
    """Delegate to ``f_exists_s`` with a phone-number regular expression.

    Returns whatever ``f_exists_s`` returns for column ``col`` of ``table``.
    """
    phone_pattern = r'[\+\d]?(\d{2,3}[-\.\s]??\d{2,3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'
    return f_exists_s(table, col, phone_pattern)
139+
140+
141+
def f_contains_a_punctuation(table, col):
    """Insert, after column ``col``, '1' if the cell has punctuation, else '0'.

    Punctuation means any character of ``string.punctuation``. The previous
    body called the counting helper, so this "contains" feature returned a
    count instead of a 0/1 class flag.

    Returns a new table; the input rows are not modified.
    """
    punctuation = re.compile("[" + re.escape(string.punctuation) + "]")
    result_table = []
    for row in table:
        flag = str(int(punctuation.search(row[col]) is not None))
        result_table.append(row[:col + 1] + [flag] + row[col + 1:])
    return result_table
143+
144+
145+
def f_contains_a_stopword(table, col):
    """Insert, after column ``col``, '1' if the cell holds a stopword, else '0'.

    The cell is split on whitespace and each token is looked up in the
    module-level ``stopwords_list``. The flag uses the same '1'/'0' encoding
    as ``f_exists_s`` (the previous body emitted 'True'/'False').

    Returns a new table; the input rows are not modified.
    """
    result_table = []
    for row in table:
        has_stopword = any(word in stopwords_list for word in row[col].split())
        flag = str(int(has_stopword))
        result_table.append(row[:col + 1] + [flag] + row[col + 1:])
    return result_table
151+
152+
153+
# New Transformations (NLP Data Cleaning)
154+
155+
def f_remove_stopwords(table, col):
    """Insert, after column ``col``, the cell text with stopwords dropped.

    The cell is split on whitespace, tokens that are members of the
    module-level ``stopwords_list`` are discarded, and the rest are joined
    with single spaces.
    """
    augmented = []
    for record in table:
        kept = (token for token in record[col].split() if token not in stopwords_list)
        cleaned = ' '.join(kept)
        augmented.append(record[:col + 1] + [cleaned] + record[col + 1:])
    return augmented
162+
163+
164+
def f_remove_numeric(table, col):
    """Insert, after column ``col``, the cell text without all-digit tokens.

    The cell is split on whitespace, tokens for which ``str.isdigit`` is true
    are discarded, and the rest are joined with single spaces.
    """
    def strip_numbers(text):
        # Keep only the tokens that are not made up entirely of digits.
        return ' '.join(token for token in text.split() if not token.isdigit())

    return [
        record[:col + 1] + [strip_numbers(record[col])] + record[col + 1:]
        for record in table
    ]
171+
172+
173+
def f_remove_punctuation(table, col):
    """Insert, after column ``col``, the cell text without punctuation.

    Every character of ``string.punctuation`` is deleted from the cell; all
    other characters, including whitespace, are kept as they are.
    """
    drop_punctuation = re.compile('[%s]' % re.escape(string.punctuation)).sub
    return [
        record[:col + 1] + [drop_punctuation('', record[col])] + record[col + 1:]
        for record in table
    ]
183+
184+
185+
def f_remove_url(table, col):
    """Insert, after column ``col``, the cell text with URLs deleted.

    A URL is any run of non-whitespace starting with http://, https:// or
    www. -- the surrounding whitespace is left in place.
    """
    drop_urls = re.compile(r'https?://\S+|www\.\S+').sub
    return [
        record[:col + 1] + [drop_urls('', record[col])] + record[col + 1:]
        for record in table
    ]
193+
194+
195+
def f_remove_html_tags(table, col):
    """Insert, after column ``col``, the cell text with ``<...>`` spans deleted.

    Each shortest run from '<' to the next '>' on the same line is removed.
    """
    drop_tags = re.compile(r'<.*?>').sub
    return [
        record[:col + 1] + [drop_tags('', record[col])] + record[col + 1:]
        for record in table
    ]
202+
203+
204+
def f_spell_correction(table, col):
    """Insert, after column ``col``, the spell-corrected text of that cell.

    The correction is ``TextBlob(cell).correct()`` converted back to ``str``.
    The previous body only used the corrected blob as a filter condition
    (``if not ...correct()``), which discarded every word and never emitted
    the corrected text.

    Returns a new table; the input rows are not modified.
    """
    result_table = []
    for row in table:
        corrected = str(TextBlob(row[col]).correct())
        result_table.append(row[:col + 1] + [corrected] + row[col + 1:])
    return result_table
210+
211+
212+
def f_lemmatization(table, col):
    """Insert, after column ``col``, the cell text with each word lemmatized.

    The cell is split on whitespace, every token is passed to the
    module-level ``lemmatizer`` and the results are joined with single
    spaces. The lemmatizer works on one word at a time, so handing it the
    whole cell (as before) left multi-word text untouched.

    Returns a new table; the input rows are not modified.
    """
    result_table = []
    for row in table:
        new_row = ' '.join(lemmatizer.lemmatize(word) for word in row[col].split())
        result_table.append(row[:col + 1] + [new_row] + row[col + 1:])
    return result_table
218+
219+
220+
def f_lower(table, col):
    """Insert, after column ``col``, the lower-cased text of that cell.

    Returns a new table; each new row has one extra entry at ``col + 1``.
    """
    return [
        record[:col + 1] + [record[col].lower()] + record[col + 1:]
        for record in table
    ]

Code/Table.py

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import pandas as pd
2+
import config
3+
4+
class Table:
    """Wrap a pandas DataFrame with positional attribute ids and dtype info.

    Attributes are addressed by their column position (``self.A``); the
    class keeps the per-column unique values, the per-column dtypes and a
    "projected" view restricted to a chosen set of columns.
    """

    # Columns with at most this many distinct values are cast to 'category'.
    CATEGORICAL_UPPER_BOUND = config.CATEGORICAL_UPPER_BOUND

    def __init__(self, df):
        """Store ``df`` and derive headers, attribute ids, dtypes and vectors.

        Note that ``fix_types`` rewrites columns of ``df`` itself.
        """
        self.table = df
        self.headers = list(df.columns)
        self.A = list(range(0, len(df.columns)))
        self.A_vecs = list()  # temp
        self.set_A_vec_naive()
        self.A_types = self.set_attribute_types(False)
        self.fix_types()
        self.projected_table = self.table
        self.projected_A = self.A
        self.r = self.table.index.to_numpy()
        self.r_vecs = list()  # temp
        self.set_r_vec_naive()

    def set_A_vec_naive(self):
        """Set ``A_vecs`` to the unique values of every column, by header."""
        self.A_vecs = [self.table[col].unique() for col in self.headers]

    def set_r_vec_naive(self):
        """Set ``r_vecs`` to the projected table as a NumPy array."""
        self.r_vecs = self.projected_table.to_numpy()

    def update_A_vec(self, vecs):
        """Replace ``A_vecs`` with ``vecs``."""
        self.A_vecs = vecs

    def update_projected_table(self, attributes):
        """Project the table onto the column positions in ``attributes``."""
        self.projected_table = self.table.iloc[:, attributes]
        self.projected_A = attributes

    def set_attribute_types(self, use_profiler=False):
        """Return a mapping from column position to column type.

        With ``use_profiler`` false (the default) the types are the pandas
        dtypes. With it true they come from a ``ProfileReport`` description.
        """
        if use_profiler:
            # The original referenced the undefined names ``T_prime`` and
            # ``prof`` here; the report is built for this table and queried
            # through the local ``profiler``.
            # NOTE(review): ``ProfileReport`` is not imported in the visible
            # file -- it must be brought into scope before this branch runs.
            profiler = ProfileReport(self.table)
            var = profiler.get_description()['variables']
            return {i: var[v]['type'] for i, v in enumerate(self.table.columns)}
        return dict(zip(self.A, self.table.dtypes))

    def _set_column(self, position, values):
        """Replace the column at ``position`` with ``values``, dtype included."""
        # ``df.iloc[:, i] = values`` writes into the existing array on recent
        # pandas and so keeps the old dtype; ``isetitem`` swaps the column.
        if hasattr(self.table, 'isetitem'):
            self.table.isetitem(position, values)
        else:
            self.table.iloc[:, position] = values

    def fix_types(self):
        """Cast low-cardinality columns to 'category' and string objects to str.

        ``A_types`` is recomputed when at least one column was rewritten.
        """
        is_change = False
        for a in self.A_types:
            col = self.table.iloc[:, a]
            cardinality = len(col.unique())
            if cardinality <= self.CATEGORICAL_UPPER_BOUND:
                self._set_column(a, col.astype('category'))
                is_change = True
            elif self.A_types[a] == 'object' and pd.api.types.is_string_dtype(col):
                self._set_column(a, col.astype(str))
                is_change = True
        if is_change:
            self.A_types = self.set_attribute_types(False)

    def get_attributes_types(self, attribute_set):
        """Return the types of the given attribute positions, in order."""
        return [self.A_types[a] for a in attribute_set]

    def get_attributes_names(self, attribute_set):
        """Return the headers of the given attribute positions, in order."""
        return [self.headers[i] for i in attribute_set]

    def get_feature_like_attributes(self):
        """Return the attribute positions whose type is not 'object'."""
        return [a for a in self.A_types if self.A_types[a] not in ['object']]

    def get_row_by_id(self, row_id):
        """Return the row at integer position ``row_id``."""
        return self.table.iloc[row_id, :]

    def get_rows_by_ids(self, row_ids):
        """Return the rows at the integer positions ``row_ids``."""
        return self.table.iloc[row_ids, :]

Code/Table_Pair.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from Table import Table
2+
import pandas as pd
3+
4+
5+
class Table_Pair:
    """Hold two ``Table`` objects together with their attribute and record matches.

    For both attributes (suffix ``A``) and records (suffix ``r``) the class
    keeps the match list plus four derived lists: the matched ids on the
    left (``LHC*``) and right (``RHC*``) side, and the ids of each table
    that do not take part in the match (``LHD*`` / ``RHD*``).
    """

    def __init__(self, T: Table, T_prime: Table):
        """Store both tables and start with empty match bookkeeping."""
        self.T = T
        self.T_prime = T_prime
        # Attribute-level matches.
        self.sigma_A = []
        self.sigma_A_exact = []
        self.LHCA = []
        self.LHDA = []
        self.RHCA = []
        self.RHDA = []
        # Record-level matches.
        self.sigma_r = []
        self.sigma_r_exact = []
        self.LHCr = []
        self.LHDr = []
        self.RHCr = []
        self.RHDr = []

    def update_attribute_match(self, sigma_A):
        """Store the attribute match and re-project both tables onto it.

        ``sigma_A`` is a sequence of pairs: element 0 is an attribute of
        ``T``, element 1 an attribute of ``T_prime``.
        """
        self.sigma_A = sigma_A
        matched_left = self.LHCA = [pair[0] for pair in sigma_A]
        self.LHDA = list(set(self.T.A).difference(matched_left))
        matched_right = self.RHCA = [pair[1] for pair in sigma_A]
        self.RHDA = list(set(self.T_prime.A).difference(matched_right))
        self.T.update_projected_table(matched_left)
        self.T_prime.update_projected_table(matched_right)

    def update_exact_attribute_match(self, sigma_A_exact):
        """Store the exact attribute match."""
        self.sigma_A_exact = sigma_A_exact

    def update_record_match(self, sigma_r):
        """Store the record match and derive the matched/unmatched id lists.

        ``sigma_r`` is a sequence of pairs: element 0 is a record id of
        ``T``, element 1 a record id of ``T_prime``.
        """
        self.sigma_r = sigma_r
        matched_left = self.LHCr = [pair[0] for pair in sigma_r]
        self.LHDr = list(set(self.T.r).difference(matched_left))
        matched_right = self.RHCr = [pair[1] for pair in sigma_r]
        self.RHDr = list(set(self.T_prime.r).difference(matched_right))

    def update_exact_record_match(self, sigma_r_exact):
        """Store the exact record match."""
        self.sigma_r_exact = sigma_r_exact

0 commit comments

Comments
 (0)