-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdemoSimpleConnotationAnalyse.py
96 lines (79 loc) · 4.81 KB
/
demoSimpleConnotationAnalyse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import cPickle as pickle
import os, timeit, csv
from utilities.normalizeTweets import *
from utilities.readTweet import *
import numpy as np
def getData(): #returns 2 lists of strings for the 2 groups
control_folder_path = '../data/clpsych2015/schizophrenia/anonymized_control_tweets/'
sch_folder_path = '../data/clpsych2015/schizophrenia/anonymized_schizophrenia_tweets/'
csvFileLoc = '../data/clpsych2015/schizophrenia/anonymized_user_manifest.csv'
picklefile = 'dumpdata1_rhyme.pickle' #using the rhyme pickle file as its the same data
if os.path.isfile(picklefile):
with open(picklefile) as f:
allControlTweets, allSchTweets = pickle.load(f)
else:
#read lines of csv
allControl = readCSV(csvFileLoc, {'condition':'control'})
allSch = readCSV(csvFileLoc, {'condition':'schizophrenia'})
#get tweets by read files.
start = timeit.default_timer()
allControlTweets = getTweetsForGroup(allControl, control_folder_path, sch_folder_path, fields = ['text']) #read the files and get the tweet (only the text as specified in 'field')
allSchTweets = getTweetsForGroup(allSch, control_folder_path, sch_folder_path, fields = ['text'])
allControlTweets = getFieldDictFromGroupPerUser(allControlTweets, field = 'text')
allSchTweets = getFieldDictFromGroupPerUser(allSchTweets, field = 'text')
print 'got tweets', timeit.default_timer() - start
with open(picklefile, 'w') as f:
pickle.dump([allControlTweets, allSchTweets], f)
return [allControlTweets, allSchTweets]
def cleanup(tt): #do spell check, separate out hashtags and add that to spellchecked text
t = {k:normTweet1(tt[k], ops = [0], retain = 2, separateTokens = ['E'])[0] for k in tt}
return {k:[(' ').join([word[0] for word in singletweet]) for singletweet in t[k][-1]] for k in t}
def getSentimentDict():
d = {'positive':1, 'negative':-1, 'neutral':0}
with open('connotation_lexicon_a.0.1.csv', 'r') as fn:
t = [line.split(',') for line in fn.read().split('\n')]
return {line[0].split('_')[0]:d[line[1]] for line in t[0:-1]}
def getSentimentDict1():
with open('utilities/AFINN-111.txt','r') as f:
t = [line.split('\t') for line in f.read().split('\n')]
return {line[0]:int(line[1]) for line in t}
def getSentiFeatures(scoreList, counts): #scorelist is a list of lists containing only 1,-1,0 or 'x'
#counts of sentiments
#counts = {1:0, -1:0, 0:0, 'x':0} #[pos, neg, neu, unk]
countsOrig = {k:counts[k] for k in counts}
for singleTweetScores in scoreList:
counts = {k:countsOrig[k] for k in countsOrig}
for singleTweetScore in singleTweetScores:
#print counts, singleTweetScore, singleTweetScore in counts.keys()
#print counts.keys()
counts[singleTweetScore]+=1
ft = [counts[k]/(sum(counts.values())+0.) for k in counts] #return normalized counts
#mean/var of tweet sentiment score
tweetScores = [sum([singleTweetScore for singleTweetScore in singleTweetScores if singleTweetScore != 'x']) for singleTweetScores in scoreList]
ft += [np.mean(tweetScores), np.var(tweetScores), max(tweetScores)]
return ft
def calculateSentimentFeatures(userTweetDict, sentDict, csvName, counts):
t = {user:getSentiFeatures([[sentDict.get(word,'x') for word in tweet.split(' ')] for tweet in userTweetDict[user]], counts) for user in userTweetDict} #'x' is for words not in the senti dictionary
with open(csvName, 'wb') as csvfile:
writer = csv.writer(csvfile, delimiter=',')
for user in t:
writer.writerow([user] + t[user])
print 'start'
[allControlTweets, allSchTweets] = getData()
picklefile = 'dumpdata2_rhyme.pickle' #using the rhyme pickle file as its the same data
if os.path.isfile(picklefile):
with open(picklefile) as f:
allControlTweetsCleaned, allSchTweetsCleaned = pickle.load(f)
else:
print 'clean allControlTweets'
allControlTweetsCleaned = cleanup(allControlTweets)
print 'clean allControlTweets'
allSchTweetsCleaned = cleanup(allSchTweets)
with open(picklefile, 'w') as f:
pickle.dump([allControlTweetsCleaned, allSchTweetsCleaned], f)
sentDict = getSentimentDict()
calculateSentimentFeatures(allControlTweetsCleaned, sentDict, 'control_simpleconnotation_features.csv', {1:0, -1:0, 0:0, 'x':0})
calculateSentimentFeatures(allSchTweetsCleaned, sentDict, 'sch_simpleconnotation_features.csv', {1:0, -1:0, 0:0, 'x':0})
sentDict = getSentimentDict1() #AFINN
calculateSentimentFeatures(allControlTweetsCleaned, sentDict, 'control_simplesentimentAFINN_features.csv', {1:0, -1:0, 0:0, 2:0, -2:0, 3:0, -3:0, 4:0, -4:0, 5:0, -5:0, 'x':0})
calculateSentimentFeatures(allSchTweetsCleaned, sentDict, 'sch_simplesentimentAFINN_features.csv', {1:0, -1:0, 0:0, 2:0, -2:0, 3:0, -3:0, 4:0, -4:0, 5:0, -5:0, 'x':0})