PCC05 avnig #842

Open · wants to merge 1 commit into base: community

05/avnig/similar_tweeters.py

import re
import sys
from collections import defaultdict

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models, similarities

from usertweets import UserTweets

# word_tokenize needs the 'punkt' tokenizer models and
# stopwords.words needs the 'stopwords' corpus
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))


def clean_tweet(tweet):
    # strip URLs and every non-alphanumeric character (which also removes
    # '#' and '@'), then drop retweet markers, digits, and short words
    tweet = re.sub(r"([^0-9A-Za-z \t])|(\w+://\S+)", "", tweet)
    tweet = re.sub(r"RT\s+", "", tweet)
    tweet = re.sub(r"[0-9]+", "", tweet)
    return ' '.join(w for w in tweet.split() if len(w) > 3)


def tokenize_tweet(tweet):
    return word_tokenize(tweet)


def filter_tweet(tokens):
    return [w for w in tokens if w.lower() not in stop_words]


def preprocess_tweets(handle, tweet_count=5):
    user_tweets = UserTweets(handle)
    superset_tokens = []
    frequency = defaultdict(int)
    for tw in user_tweets[:tweet_count]:
        tweeted_text = clean_tweet(tw.text)
        tokens_text = tokenize_tweet(tweeted_text)
        filtered_text = filter_tweet(tokens_text)
        for token in filtered_text:
            frequency[token] += 1
        superset_tokens.append(filtered_text)

    # remove words that appear only once; the frequency count is taken
    # over all of the user's tweets, not within a single short tweet
    return [[token for token in tokens if frequency[token] > 1]
            for tokens in superset_tokens]


def similar_tweeters(user1, user2):
    number_of_tweets = 5000
    user1_tokens = preprocess_tweets(user1, number_of_tweets)
    user2_tokens = preprocess_tweets(user2, number_of_tweets)

    # build a dictionary and bag-of-words corpus from user1's tweets,
    # train a two-topic LSI model on it, and index the transformed corpus
    dictionary = corpora.Dictionary(user1_tokens)
    corpus = [dictionary.doc2bow(text) for text in user1_tokens]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    index = similarities.MatrixSimilarity(lsi[corpus])

    # project user2's tweets into the same LSI space and query the index
    vec_lsi = [lsi[dictionary.doc2bow(text)] for text in user2_tokens]
    sims = [index[vec] for vec in vec_lsi]

    # for each of user2's tweets take its best match among user1's tweets,
    # then average those best-match scores
    most_similar = [max(sim) for sim in sims]
    mean_similarity = sum(most_similar) / len(most_similar)

    print('------- Checking Twitter Handle Similarity --------')
    print(f'Similarity between {user1} and {user2}: {round(mean_similarity * 100, 2)}%')


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} <user1> <user2>')
        sys.exit(1)

    user1, user2 = sys.argv[1:3]
    similar_tweeters(user1, user2)
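
For reviewers unfamiliar with the gensim flow used above, here is a minimal
sketch of the dictionary -> bag-of-words -> LSI -> similarity-index pipeline
on toy data (the token lists below are illustrative, not taken from this PR):

from gensim import corpora, models, similarities

# two tiny "users", each a list of tokenized tweets
user_a = [['python', 'testing', 'tips'], ['python', 'decorators', 'testing']]
user_b = [['python', 'testing', 'tricks'], ['cooking', 'pasta', 'recipes']]

dictionary = corpora.Dictionary(user_a)               # token <-> id mapping
corpus = [dictionary.doc2bow(doc) for doc in user_a]  # bag-of-words vectors
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(lsi[corpus])    # cosine-similarity index

for doc in user_b:
    # tokens unseen in user_a's dictionary are simply dropped by doc2bow
    query = lsi[dictionary.doc2bow(doc)]
    print(doc, index[query])  # similarity of doc against each user_a tweet

The script itself would be invoked as, e.g., python similar_tweeters.py pybites bbelderbos.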

05/avnig/usertweets.py

from collections import namedtuple
import csv
import os

import tweepy

from config import CONSUMER_KEY, CONSUMER_SECRET
from config import ACCESS_TOKEN, ACCESS_SECRET

DEST_DIR = 'data'
EXT = 'csv'
NUM_TWEETS = 100

Tweet = namedtuple('Tweet', 'id_str created_at text')

class UserTweets:

    def __init__(self, handle, max_id=None):
        """Get handle and optional max_id.
        Use tweepy.OAuthHandler, set_access_token and tweepy.API
        to create api interface.
        Use _get_tweets() helper to get a list of tweets.
        Save the tweets as data/<handle>.csv"""
        auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
        self.api = tweepy.API(auth)
        self.handle = handle
        self.max_id = max_id
        # one cache file per handle, e.g. data/pybites.csv
        self.output_file = os.path.join(DEST_DIR, f'{handle}.{EXT}')
        self._tweets = list(self._get_tweets())
        self._save_tweets()

    def _get_tweets(self):
        """Hint: use the user_timeline() method on the api you defined in init.
        See tweepy API reference: http://docs.tweepy.org/en/v3.5.0/api.html
        Use a list comprehension / generator to filter out fields
        id_str created_at text (optionally use namedtuple)"""
        data_tweets = self.api.user_timeline(screen_name=self.handle,
                                             max_id=self.max_id,
                                             count=NUM_TWEETS)
        for tweet in data_tweets:
            yield Tweet(tweet.id_str, tweet.created_at, tweet.text)

    def _save_tweets(self):
        """Use the csv module (csv.writer) to write out the tweets.
        If you use a namedtuple get the column names with Tweet._fields.
        Otherwise define them as: id_str created_at text
        You can use writerow for the header, writerows for the rows"""
        if not os.path.exists(DEST_DIR):
            os.mkdir(DEST_DIR)
        # newline='' prevents the csv module writing blank rows on Windows
        with open(self.output_file, 'w', newline='') as f:
            w = csv.writer(f)
            w.writerow(Tweet._fields)
            w.writerows(self._tweets)

    def __len__(self):
        """See http://pybit.es/python-data-model.html"""
        return len(self._tweets)

    def __getitem__(self, pos):
        """See http://pybit.es/python-data-model.html"""
        return self._tweets[pos]


if __name__ == "__main__":
    for handle in ('pybites', 'juliansequeira', 'bbelderbos'):
        print(f'--- {handle} ---')
        user = UserTweets(handle)
        for tw in user[:5]:
            print(tw)
        print()
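
Since _save_tweets writes one CSV per handle with Tweet._fields as the header
row, the cache can be sanity-checked by reading it back with csv.DictReader.
A minimal sketch, assuming the defaults above produced data/pybites.csv:

import csv

with open('data/pybites.csv', newline='') as f:
    for row in csv.DictReader(f):
        print(row['created_at'], row['text'][:60])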