-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_cleanup.py
More file actions
143 lines (115 loc) · 4.34 KB
/
data_cleanup.py
File metadata and controls
143 lines (115 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
import string
import unicodedata
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import nltk
import scipy as scs
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from nltk import pos_tag
from nltk import RegexpParser
import pickle
"""
Input: pd dataframe of raw features with no 'acct_type'.
Can be applied to training AND testing data.
"""
def delivery_method_categorize(data):
    """
    Adds one boolean indicator column per tracked 'delivery_method' code.
    INPUT:
    - data: pandas dataframe with a 'delivery_method' column
    OUTPUT:
    - data: the same dataframe (modified in place) with the columns
    'delivery_method_0', 'delivery_method_1' and 'delivery_method_3' added.
    """
    method = data['delivery_method']
    # (new column name, code it flags); codes other than 0, 1 and 3 get no
    # indicator of their own and are False in all three columns.
    indicator_columns = (
        ('delivery_method_0', 0),
        ('delivery_method_1', 1),
        ('delivery_method_3', 3),
    )
    for column, code in indicator_columns:
        data[column] = method == code
    return data
def payout_type_categorize(data):
    """
    Adds boolean indicator columns for the 'CHECK' and 'ACH' payout types.
    INPUT:
    - data: pandas dataframe with a 'payout_type' column
    OUTPUT:
    - data: the same dataframe (modified in place) with the columns
    'payout_type_check' and 'payout_type_ach' added.
    """
    payout = data['payout_type']
    is_check = payout == 'CHECK'
    is_ach = payout == 'ACH'
    # Any other payout value is False in both indicator columns.
    data['payout_type_check'] = is_check
    data['payout_type_ach'] = is_ach
    return data
def currency_categorize(data):
    """
    Adds one boolean indicator column per tracked currency code.
    INPUT:
    - data: pandas dataframe with a 'currency' column
    OUTPUT:
    - data: the same dataframe (modified in place) with the columns
    'usd', 'gbp', 'cad', 'aud', 'eur' and 'nzd' added.
    """
    currency = data['currency']
    # (new column name, currency code it flags), in insertion order.
    indicator_columns = (
        ('usd', 'USD'),
        ('gbp', 'GBP'),
        ('cad', 'CAD'),
        ('aud', 'AUD'),
        ('eur', 'EUR'),
        ('nzd', 'NZD'),
    )
    for column, code in indicator_columns:
        data[column] = currency == code
    return data
def user_type_categorize(data):
    """
    Adds one boolean indicator column for each 'user_type' code from 1 to 5.
    INPUT:
    - data: pandas dataframe with a 'user_type' column
    OUTPUT:
    - data: the same dataframe (modified in place) with the columns
    'user_type_1' through 'user_type_5' added.
    """
    user_type = data['user_type']
    for code in range(1, 6):
        data['user_type_' + str(code)] = user_type == code
    return data
def email_categorize(data):
    """
    Flags rows whose email domain is rare within the dataframe passed in.
    A domain is "rare" when it occurs at most once in data['email_domain'].
    Missing domains are never counted, so they are flagged as rare as well.
    NOTE: the counts are computed from the dataframe given to this function,
    whichever data set (training or testing) that happens to be.
    INPUT:
    - data: pandas dataframe with an 'email_domain' column
    OUTPUT:
    - data: the same dataframe (modified in place) with a boolean
    'rare_email' column added.
    """
    domain_counts = data['email_domain'].value_counts()
    # Work on the counts Series directly instead of looking the counts up by
    # column name: the name of the value_counts() result differs between
    # pandas versions ('email_domain' before 2.0, 'count' from 2.0 on).
    common_domains = domain_counts.index[domain_counts > 1]
    data['rare_email'] = ~data['email_domain'].isin(common_domains)
    return data
def event_data(data):
    """
    Adds the event duration, computed as 'event_end' minus 'event_start'.
    INPUT:
    - data: pandas dataframe with 'event_start' and 'event_end' columns
    OUTPUT:
    - data: the same dataframe (modified in place) with an 'event_duration'
    column added, in the same units as the two input columns.
    """
    ends = data['event_end']
    starts = data['event_start']
    data['event_duration'] = ends - starts
    return data
def listed_categorize(data):
    """
    Converts the 'listed' column to booleans.
    INPUT:
    - data: pandas dataframe whose 'listed' column holds 'y' or 'n'
    OUTPUT:
    - data: the same dataframe (modified in place); 'listed' is True where
    the original value was 'y' and False for every other value.
    """
    is_listed = data['listed'].eq('y')
    data['listed'] = is_listed
    return data
def country_data(data):
    """
    Adds country-based boolean features.
    INPUT:
    - data: pandas dataframe with 'country' and 'venue_country' columns
    OUTPUT:
    - data: the same dataframe (modified in place) with these columns added:
    'venue_country_change' (True where 'venue_country' differs from
    'country') and 'is_us', 'is_gb', 'is_ca' (True where 'country' is
    'US', 'GB' or 'CA' respectively).
    """
    country = data['country']
    data['venue_country_change'] = data['venue_country'] != country
    # (new column name, country code it flags), in insertion order.
    for column, code in (('is_us', 'US'), ('is_gb', 'GB'), ('is_ca', 'CA')):
        data[column] = country == code
    return data
def final_columns(data):
    """
    Selects the columns kept in the cleaned data set, in a fixed order.
    INPUT:
    - data: pandas dataframe that contains every wanted column (a missing
    column makes the selection fail)
    OUTPUT:
    - data: new pandas dataframe holding only the wanted columns.
    """
    engineered = (
        'delivery_method_0', 'delivery_method_1', 'delivery_method_3',
        'payout_type_check', 'payout_type_ach',
        'usd', 'gbp', 'cad', 'aud', 'eur', 'nzd',
        'user_type_1', 'user_type_2', 'user_type_3', 'user_type_4',
        'user_type_5',
        'rare_email', 'event_duration', 'listed', 'venue_country_change',
        'is_us', 'is_gb', 'is_ca',
    )
    # NOTE(review): 'listed' is requested a second time here, so the result
    # carries two 'listed' columns. Kept as-is because consumers may depend
    # on the current column count - confirm before removing the duplicate.
    passthrough = (
        'body_length', 'channels', 'fb_published', 'has_analytics',
        'has_logo', 'listed', 'name_length', 'show_map', 'user_age',
        'description',
    )
    selection = list(engineered + passthrough)
    return data[selection]
def clean_data(data):
    """
    Runs every feature-engineering step and returns the final feature set.
    INPUT:
    - data: pandas dataframe of raw features; the steps add their columns to
    this dataframe in place
    OUTPUT:
    - pandas dataframe restricted to the columns chosen by final_columns
    """
    steps = (
        delivery_method_categorize,
        country_data,
        listed_categorize,
        event_data,
        email_categorize,
        user_type_categorize,
        currency_categorize,
        payout_type_categorize,
        final_columns,
    )
    cleaned = data
    for step in steps:
        # Every step is handed the caller's dataframe, which accumulates the
        # engineered columns; only the last step's return value is kept.
        cleaned = step(data)
    return cleaned