# Libraries for performing the sentiment analysis: cleaning the data, then
# training and saving the model.
import pickle
import random
import re
import string

from nltk import FreqDist, NaiveBayesClassifier, classify
from nltk.corpus import stopwords, twitter_samples
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

def remove_noise(tweet_tokens, stop_words=()):
    '''Remove links and @-mentions present in the text, lemmatize each verb
    to its base form, and drop punctuation and stop words.'''
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()
    for token, tag in pos_tag(tweet_tokens):
        # strip URLs and Twitter handles
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)
        # map the Penn Treebank tag to the WordNet POS the lemmatizer expects
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

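# A quick illustrative sketch (hypothetical tweet; assumes the usual NLTK
# taggers and corpora are already downloaded): with the English stop word list,
#     remove_noise(word_tokenize("@user loving it! https://t.co/x"), stop_words)
# drops the mention, URL, punctuation and stop words, leaving only the
# lemmatized content words.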

def get_all_words(cleaned_tokens_list):
    '''Act as a generator over every token in the cleaned token lists.'''
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token


def get_tweets_for_model(cleaned_tokens_list):
    '''Take the cleaned token lists as input and yield dicts in a form
    suitable to be fed to the classifier.'''
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

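# For example, the token list ['top', 'engage'] becomes the feature dict
# {'top': True, 'engage': True}, the bag-of-words format that NLTK's
# NaiveBayesClassifier trains on.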

def predict_sentiment(sentence, classifier):
    '''Predict the sentiment of the text passed as an argument.'''
    custom_tokens = remove_noise(word_tokenize(sentence))
    return classifier.classify(
        dict([token, True] for token in custom_tokens))


def save_model():
    '''Save the trained classifier (relies on the module-level `classifier`
    created in the main block below).'''
    with open('my_classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)

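# A minimal counterpart sketch, assuming the pickle written by save_model()
# above; this load_model helper is illustrative and not part of the original
# script.
def load_model(path='my_classifier.pickle'):
    '''Load a previously pickled classifier so it can be reused without retraining.'''
    with open(path, 'rb') as f:
        return pickle.load(f)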

if __name__ == "__main__":
    # loading the datasets for model training (`text` and `tweet_tokens` are
    # only loaded for exploration and are not used in training)
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    # saving the stop words from NLTK into a variable
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    # label each feature dict and build a shuffled train/test split
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features prints its output itself and returns
    # None, so it should not be wrapped in print()
    classifier.show_most_informative_features(10)

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print(custom_tweet, classifier.classify(
        dict([token, True] for token in custom_tokens)))
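
    # A hedged usage sketch (these calls do not appear in the original flow,
    # but both helpers are defined above): persist the trained classifier,
    # then reuse predict_sentiment for a one-off prediction.
    save_model()
    print(predict_sentiment("I loved the product, great experience!", classifier))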