diff --git a/05/avnig/similar_tweeters.py b/05/avnig/similar_tweeters.py
new file mode 100644
index 000000000..b0d279c40
--- /dev/null
+++ b/05/avnig/similar_tweeters.py
@@ -0,0 +1,78 @@
+import re
+import sys
+from collections import defaultdict
+
+from usertweets import UserTweets
+import nltk
+
+nltk.download('punkt')
+nltk.download('stopwords')
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from gensim import corpora, similarities, models
+
+stop_words = set(stopwords.words('english'))
+
+
+def clean_tweet(tweet):
+    # strip punctuation (including # and @), URLs, RT markers and digits
+    tweet = re.sub(r"([^0-9A-Za-z \t])|(\w+://\S+)", "", tweet)
+    tweet = re.sub(r"RT\s+", "", tweet)
+    tweet = re.sub(r"[0-9]+", "", tweet)
+    tweet = ' '.join(w for w in tweet.split() if len(w) > 3)
+    return tweet
+
+
+def tokenize_tweet(tweet):
+    return word_tokenize(tweet)
+
+
+def filter_tweet(tokens):
+    return [w for w in tokens if w.lower() not in stop_words]
+
+
+def preprocess_tweets(handle, tweet_count=5):
+    user_tweets = UserTweets(handle)
+    superset_tokens = []
+    for tw in user_tweets[:tweet_count]:
+        tweeted_text = clean_tweet(tw.text)
+        tokens_text = tokenize_tweet(tweeted_text)
+        filtered_text = filter_tweet(tokens_text)
+
+        # remove words that appear only once within this tweet
+        frequency = defaultdict(int)
+        for token in filtered_text:
+            frequency[token] += 1
+
+        superset_tokens.append([token for token in filtered_text if frequency[token] > 1])
+    return superset_tokens
+
+
+def similar_tweeters(user1, user2):
+    number_of_tweets = 5000
+    user1_tweets_relevant_tokens = preprocess_tweets(user1, number_of_tweets)
+    user2_tweets_relevant_tokens = preprocess_tweets(user2, number_of_tweets)
+
+    # build an LSI space from user1's tweets and project user2's tweets into it
+    dictionary = corpora.Dictionary(user1_tweets_relevant_tokens)
+    corpus = [dictionary.doc2bow(text) for text in user1_tweets_relevant_tokens]
+    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
+    vec_lsi = [lsi[dictionary.doc2bow(text)] for text in user2_tweets_relevant_tokens]
+    index = similarities.MatrixSimilarity(lsi[corpus])
+
+    # for each of user2's tweets keep its best match among user1's tweets
+    sims = [index[a] for a in vec_lsi]
+    most_similar = [max(a) for a in sims]
+    mean_similarity = sum(most_similar) / len(most_similar)
+
+    print('------- Checking Twitter Handle Similarity --------')
+    print(f'Similarity between {user1} and {user2}: {mean_similarity * 100:.2f}%')
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print('Usage: {} <user1> <user2>'.format(sys.argv[0]))
+        sys.exit(1)
+
+    user1, user2 = sys.argv[1:3]
+    similar_tweeters(user1, user2)
diff --git a/05/avnig/usertweets.py b/05/avnig/usertweets.py
new file mode 100644
index 000000000..27f848d4f
--- /dev/null
+++ b/05/avnig/usertweets.py
@@ -0,0 +1,69 @@
+from collections import namedtuple
+import csv
+import os
+
+import tweepy
+
+from config import CONSUMER_KEY, CONSUMER_SECRET
+from config import ACCESS_TOKEN, ACCESS_SECRET
+
+DEST_DIR = 'data'
+EXT = 'csv'
+NUM_TWEETS = 100
+
+Tweet = namedtuple('Tweet', 'id_str created_at text')
+
+class UserTweets(object):
+
+    def __init__(self, handle, max_id=None):
+        """Get handle and optional max_id.
+        Use tweepy.OAuthHandler, set_access_token and tweepy.API
+        to create api interface.
+        Use _get_tweets() helper to get a list of tweets.
+        Save the tweets as data/<handle>.csv"""
+        auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+        auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
+        self.api = tweepy.API(auth)
+        self.handle = handle
+        self.max_id = max_id
+        self.output_file = os.path.join(DEST_DIR, f'{handle}.{EXT}')
+        self._tweets = list(self._get_tweets())
+        self._save_tweets()
+
+    def _get_tweets(self):
+        """Hint: use the user_timeline() method on the api you defined in init.
+        See tweepy API reference: http://docs.tweepy.org/en/v3.5.0/api.html
+        Use a list comprehension / generator to filter out fields
+        id_str created_at text (optionally use namedtuple)"""
+        data_tweets = self.api.user_timeline(screen_name=self.handle, max_id=self.max_id, count=NUM_TWEETS)
+        for tweet in data_tweets:
+            yield Tweet(tweet.id_str, tweet.created_at, tweet.text)
+
+    def _save_tweets(self):
+        """Use the csv module (csv.writer) to write out the tweets.
+        If you use a namedtuple get the column names with Tweet._fields.
+        Otherwise define them as: id_str created_at text
+        You can use writerow for the header, writerows for the rows"""
+        if not os.path.exists(DEST_DIR):
+            os.mkdir(DEST_DIR)
+        with open(self.output_file, 'w', newline='') as f:
+            w = csv.writer(f)
+            w.writerow(Tweet._fields)
+            w.writerows(self._tweets)
+
+    def __len__(self):
+        """See http://pybit.es/python-data-model.html"""
+        return len(self._tweets)
+
+    def __getitem__(self, pos):
+        """See http://pybit.es/python-data-model.html"""
+        return self._tweets[pos]
+
+
+if __name__ == "__main__":
+    for handle in ('pybites', 'juliansequeira', 'bbelderbos'):
+        print('--- {} ---'.format(handle))
+        user = UserTweets(handle)
+        for tw in user[:5]:
+            print(tw)
+            print()
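
Note for reviewers: the comparison in similar_tweeters() is asymmetric — the LSI topic space is fitted only on user1's tweets, and user2's tweets are projected into it, so swapping the arguments can report a different score. Below is a minimal, self-contained sketch of the same gensim flow, so the pipeline can be sanity-checked without Twitter credentials; the documents are made-up stand-ins for preprocess_tweets() output, not real data:

    from gensim import corpora, models, similarities

    # toy stand-ins for the token lists that preprocess_tweets() returns
    user1_docs = [['python', 'tips', 'code'], ['python', 'news', 'code']]
    user2_docs = [['python', 'code'], ['travel', 'photos']]

    # fit the LSI space on user1's documents only
    dictionary = corpora.Dictionary(user1_docs)
    corpus = [dictionary.doc2bow(doc) for doc in user1_docs]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    index = similarities.MatrixSimilarity(lsi[corpus])

    # project each user2 document into that space; keep its best cosine match
    for doc in user2_docs:
        sims = index[lsi[dictionary.doc2bow(doc)]]
        print(doc, '->', float(max(sims)))

A user2 document made only of unseen words (like ['travel', 'photos'] above) maps to an all-zeros vector and scores 0.0 against every user1 tweet, which drags the printed mean similarity down — worth keeping in mind when interpreting the percentage.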