generated from 10xac/Twitter-Data-Analysis-Template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_tweets_dataframe.py
80 lines (67 loc) · 2.83 KB
/
clean_tweets_dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
class Clean_Tweets:
    """
    Cleaning steps for a DataFrame of processed tweets.

    Every step takes a DataFrame, returns a new cleaned DataFrame and does
    not modify the frame it was given. The most recent result is also stored
    on ``self.df`` so steps can be chained either through their return values
    or through the instance attribute.
    """

    # Columns that convert_to_numbers() coerces to a numeric dtype.
    NUMERIC_COLUMNS = ('polarity', 'subjectivity', 'retweet_count', 'favorite_count')

    def __init__(self, df:pd.DataFrame):
        """Remember ``df`` as the current working frame."""
        self.df = df
        print('Automation in Action...!!!')

    def drop_unwanted_column(self, df:pd.DataFrame)->pd.DataFrame:
        """
        Remove rows whose values are the column names themselves.

        Such rows are repeated header lines that originated from the data
        collection stage. A row is dropped when ``retweet_count`` holds the
        string 'retweet_count' or ``polarity`` holds the string 'polarity'.
        """
        repeated_header = (
            (df['retweet_count'] == 'retweet_count')
            | (df['polarity'] == 'polarity')
        )
        # Boolean mask + copy: the caller's frame is not mutated, and rows
        # are matched by position rather than by (possibly repeated) label.
        self.df = df[~repeated_header].copy()
        return self.df

    def drop_duplicate(self, df:pd.DataFrame)->pd.DataFrame:
        """
        Drop rows that repeat an earlier ``original_text`` value, keeping
        the first occurrence.
        """
        self.df = df.drop_duplicates(subset='original_text')
        return self.df

    def convert_to_datetime(self, df:pd.DataFrame, min_date:str='2020-12-31')->pd.DataFrame:
        """
        Convert ``created_at`` to datetime and keep rows on or after
        ``min_date``.

        Values that cannot be parsed become NaT and are removed by the date
        filter, because NaT never compares as greater-or-equal.
        """
        parsed = df.assign(
            created_at=pd.to_datetime(df['created_at'], errors='coerce')
        )
        self.df = parsed[parsed['created_at'] >= min_date].copy()
        return self.df

    def convert_to_numbers(self, df:pd.DataFrame)->pd.DataFrame:
        """
        Convert the columns listed in ``NUMERIC_COLUMNS`` (polarity,
        subjectivity, retweet_count, favorite_count) to numbers.

        Values that cannot be parsed become NaN. Raises KeyError if one of
        the columns is missing.
        """
        converted = {
            column: pd.to_numeric(df[column], errors='coerce')
            for column in self.NUMERIC_COLUMNS
        }
        self.df = df.assign(**converted)
        return self.df

    def remove_non_english_tweets(self, df:pd.DataFrame)->pd.DataFrame:
        """
        Keep only the rows whose ``lang`` is 'en'; rows with a missing
        ``lang`` are removed as well.
        """
        self.df = df[df['lang'] == 'en'].copy()
        return self.df

    def extract_twitter_source(self, df:pd.DataFrame)->pd.DataFrame:
        """
        Strip the ``<...>`` tags from the ``source`` column, leaving only the
        surrounding text with outer whitespace trimmed.
        """
        tag_free = df['source'].str.replace(r"\s*<.*?>\s*", " ", regex=True)
        self.df = df.assign(source=tag_free.str.strip())
        return self.df
#
if __name__ == "__main__":
    # Script entry point: load the processed tweets, run the cleaning steps
    # in a fixed order, preview the result and write it back to disk.
    tweets = pd.read_csv("data/processed_tweet_data.csv")
    cleaner = Clean_Tweets(tweets)

    # Each step receives the frame returned by the previous one.
    cleaning_steps = (
        cleaner.drop_duplicate,
        cleaner.remove_non_english_tweets,
        cleaner.convert_to_datetime,
        cleaner.drop_unwanted_column,
        cleaner.convert_to_numbers,
    )
    for apply_step in cleaning_steps:
        tweets = apply_step(tweets)

    # Show the first five polarity values of the cleaned frame.
    polarity_preview = tweets['polarity'][0:5]
    print(polarity_preview)

    tweets.to_csv('data/clean_processed_tweet_data.csv')
    print('File Successfully Saved.!!!')