-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdemoGetRhymeFeature.py
142 lines (121 loc) · 6.17 KB
/
demoGetRhymeFeature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import cPickle as pickle
import os, timeit
from utilities.readTweet import *
from utilities.normalizeTweets import normTweet1
from utilities.rhymingWords import *
import csv
import numpy as np
from scipy.stats import ttest_ind
def getData():
    """Load the tweet texts of the control and schizophrenia groups.

    On the first run the user manifest and the per-user tweet files are read
    and the result is pickled to 'dumpdata1_rhyme.pickle' in the current
    working directory. Later runs load that pickle instead of re-reading the
    raw data; delete the file to force a fresh read.

    Returns:
        A two-element list ``[controlTweets, schTweets]``. Each element is
        the value returned by ``getFieldDictFromGroupPerUser(..., field='text')``
        (presumably a dict keyed by user -- TODO confirm against
        utilities.readTweet; the rest of this file indexes it by key).
    """
    cacheFile = 'dumpdata1_rhyme.pickle'

    if os.path.isfile(cacheFile):
        # NOTE(review): the cache is opened in text mode for both reading and
        # writing. Binary mode ('rb'/'wb') is the portable choice for pickles,
        # but switching would need existing cache files to be regenerated.
        with open(cacheFile) as cached:
            controlTweets, schTweets = pickle.load(cached)
        return [controlTweets, schTweets]

    controlDir = '../data/clpsych2015/schizophrenia/anonymized_control_tweets/'
    schDir = '../data/clpsych2015/schizophrenia/anonymized_schizophrenia_tweets/'
    manifest = '../data/clpsych2015/schizophrenia/anonymized_user_manifest.csv'

    # Manifest rows, filtered by the 'condition' column.
    controlUsers = readCSV(manifest, {'condition': 'control'})
    schUsers = readCSV(manifest, {'condition': 'schizophrenia'})

    # Read the tweet files, keeping only the 'text' field, and time the read.
    t0 = timeit.default_timer()
    controlTweets = getTweetsForGroup(controlUsers, controlDir, schDir, fields=['text'])
    schTweets = getTweetsForGroup(schUsers, controlDir, schDir, fields=['text'])
    controlTweets = getFieldDictFromGroupPerUser(controlTweets, field='text')
    schTweets = getFieldDictFromGroupPerUser(schTweets, field='text')
    print('got tweets' + ' ' + str(timeit.default_timer() - t0))

    bothGroups = [controlTweets, schTweets]
    with open(cacheFile, 'w') as out:
        pickle.dump(bothGroups, out)
    return bothGroups
def cleanup(tt):
    """Turn each user's tweets into a list of space-joined token strings.

    For every key of ``tt``, ``normTweet1`` is called with ``ops=[0]``,
    ``retain=2`` and ``separateTokens=['E']`` and the first element of its
    return value is kept. The last entry of that element is then walked as a
    sequence of tweets, each tweet being a sequence of items whose first
    component (``item[0]``, presumably the token text) is joined with single
    spaces.

    Args:
        tt: mapping from a user key to that user's tweets, in whatever form
            ``normTweet1`` accepts.

    Returns:
        dict with the same keys as ``tt``; each value is a list holding one
        string per tweet.

    NOTE(review): the previous header comment described this as "do spell
    check, separate out hashtags and add that to spellchecked text". The
    active code passes only ``ops=[0]`` and uses only the last entry of the
    result; the disabled variant kept at the end of this function
    (``ops=[0,1]``, hashtags + text) looks like what that comment referred
    to. Confirm which behaviour is intended.
    """
    # Phase 1: normalise every user's tweets.
    normalized = {}
    for user in tt:
        normalized[user] = normTweet1(tt[user], ops=[0], retain=2, separateTokens=['E'])[0]

    # Phase 2: rebuild one plain string per tweet from the last entry.
    cleaned = {}
    for user in normalized:
        tweets = normalized[user][-1]
        cleaned[user] = [' '.join([item[0] for item in tweet]) for tweet in tweets]
    return cleaned

    # Disabled variant, kept for reference:
    #t = {k:normTweet1(tt[k], ops = [0,1], retain = 2, separateTokens = ['E'])[0] for k in tt} #key:[[[],[],[],[]], [[],[],[],[]], [,[(),(),()],[(),()],[()],[()]]]
    #for k in t:
    #    tmp = t[k]
    #    t[k] = [(' ').join([tpl[0] for tpl in tmp[0][i] + tmp[2][i]]) for i in range(0,len(tmp[0]))] # len(tmp[0]) is num of tweets for this user. tmp[0] is the hashtags, tmp[2] is the text
    #return t
def getRhymingScores(cleanedTweets, entries):
    """Compute the rhyming score of every user's cleaned tweets.

    Args:
        cleanedTweets: dict keyed by user name; each value is the list of
            that user's cleaned tweet strings.
        entries: passed through unchanged as the second argument of
            ``getRhymingScoreForText`` (the caller in this file supplies the
            result of ``getWordSyllableDict()``).

    Returns:
        dict with the same keys, mapping each user to the value returned by
        ``getRhymingScoreForText`` for that user's tweets.
    """
    scoresByUser = {}
    for user in cleanedTweets:
        scoresByUser[user] = getRhymingScoreForText(cleanedTweets[user], entries)
    return scoresByUser
# Three-stage pipeline; each stage is cached in its own pickle file in the
# current working directory. A stage is recomputed only when its cache file
# is missing, so delete the file to force a rerun (caches are NOT invalidated
# automatically when an earlier stage or the raw data changes).
#
# print(...) with a single string argument produces the same output as the
# print statement it replaces.

# Stage 1: raw tweet texts per group (getData keeps its own cache).
print('start')
[allControlTweets, allSchTweets] = getData()

# Stage 2: cleaned tweets per user.
picklefile = 'dumpdata2_rhyme.pickle'
if os.path.isfile(picklefile):
    with open(picklefile) as f:
        allControlTweetsCleaned, allSchTweetsCleaned = pickle.load(f)
else:
    print('clean allControlTweets')
    allControlTweetsCleaned = cleanup(allControlTweets)
    # Fixed: this message used to repeat 'clean allControlTweets'.
    print('clean allSchTweets')
    allSchTweetsCleaned = cleanup(allSchTweets)
    with open(picklefile, 'w') as f:
        pickle.dump([allControlTweetsCleaned, allSchTweetsCleaned], f)

# Stage 3: rhyming scores per user.
print('calc rhyming scores')
picklefile = 'dumpdata3_rhyme.pickle'
if os.path.isfile(picklefile):
    with open(picklefile) as f:
        rhymingScoresControl, rhymingScoresSch = pickle.load(f)
else:
    entries = getWordSyllableDict()
    rhymingScoresControl = getRhymingScores(allControlTweetsCleaned, entries)
    rhymingScoresSch = getRhymingScores(allSchTweetsCleaned, entries)
    with open(picklefile, 'w') as f:
        pickle.dump([rhymingScoresControl, rhymingScoresSch], f)
# rhymingScoresControl / rhymingScoresSch are in memory on both branches, so
# the stage-3 pickle is no longer read back a second time here.
def _summarizeUserScores(scores):
    """Return (mean, variance, zeroFraction, nonzeroFraction) for one user.

    ``scores`` is iterated as a sequence of per-tweet score sequences.

    * mean / variance: np.mean / np.var is taken per tweet and the per-tweet
      values are averaged with np.nanmean, which ignores the nan produced by
      an empty tweet.
    * zeroFraction / nonzeroFraction: share of individual scores that are
      == 0 / != 0, pooled over all of the user's tweets.

    When the user has no individual scores at all, both fractions are nan
    (matching the nan that mean and variance already take in that case)
    instead of raising ZeroDivisionError.
    """
    mn = np.nanmean([np.mean(tweetScores) for tweetScores in scores])
    vr = np.nanmean([np.var(tweetScores) for tweetScores in scores])

    # Float counters so the divisions below are true divisions on Python 2.
    zer = 0.
    nonzer = 0.
    for tweetScores in scores:
        for value in tweetScores:
            if value == 0:
                zer += 1
            else:
                nonzer += 1

    total = zer + nonzer
    if total == 0:
        return mn, vr, float('nan'), float('nan')
    return mn, vr, zer / total, nonzer / total


def _writeGroupFeatures(csvPath, rhymingScores):
    """Write one CSV row per user and return the four feature columns.

    Each row is ``[user, mean, variance, zeroFraction, nonzeroFraction]``.

    Returns:
        (means, variances, zeroFractions, nonzeroFractions): four lists with
        one entry per user, in the iteration order of ``rhymingScores``.
    """
    means = []
    variances = []
    zeroFractions = []
    nonzeroFractions = []
    # 'wb' is the mode the Python 2 csv module expects for output files.
    with open(csvPath, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for user in rhymingScores:
            mn, vr, zerFrac, nonzerFrac = _summarizeUserScores(rhymingScores[user])
            means.append(mn)
            variances.append(vr)
            zeroFractions.append(zerFrac)
            nonzeroFractions.append(nonzerFrac)
            writer.writerow([user, mn, vr, zerFrac, nonzerFrac])
    return means, variances, zeroFractions, nonzeroFractions


# Per-user rhyme features for each group, also exported to CSV.
mnCtrl, vrCtrl, zerCtrl, nonzerCtrl = _writeGroupFeatures('RhymeFeaturesCtrl.csv', rhymingScoresControl)
mnSch, vrSch, zerSch, nonzerSch = _writeGroupFeatures('RhymeFeaturesSch.csv', rhymingScoresSch)

# Disabled: pooling every user's per-tweet score lists into one list per group.
#concatControl = []
#for i in rhymingScoresControl:
#    concatControl += rhymingScoresControl[i]
#concatSch = []
#for i in rhymingScoresSch:
#    concatSch += rhymingScoresSch[i]

# Independent two-sample t-tests, control vs. schizophrenia, one per feature.
# A nan feature value (user without any scores) is passed through unchanged,
# so it will show up in the reported statistic rather than being dropped
# silently.
for label, ctrlValues, schValues in (
        ('t test mean', mnCtrl, mnSch),
        ('t test var', vrCtrl, vrSch),
        ('t test zero', zerCtrl, zerSch),
        ('t test nonzero', nonzerCtrl, nonzerSch)):
    # Same output as the former "print label, result" statement.
    print('%s %s' % (label, ttest_ind(ctrlValues, schValues)))
"""
t test mean Ttest_indResult(statistic=-4.7560304698663938, pvalue=3.2066866234498495e-06)
t test var Ttest_indResult(statistic=-5.3343641528416716, pvalue=2.0198713087689003e-07)
t test zero Ttest_indResult(statistic=4.6420513477542551, pvalue=5.3721995651696209e-06)
t test nonzero Ttest_indResult(statistic=-4.6420513477542356, pvalue=5.3721995651701003e-06)
"""
"""
t test mean Ttest_indResult(statistic=-3.900886273598184, pvalue=0.00012084652671746473)
t test var Ttest_indResult(statistic=-4.5176265439826215, pvalue=9.3314325366263576e-06)
t test zero Ttest_indResult(statistic=4.1322320325847013, pvalue=4.7872826011918165e-05)
t test nonzero Ttest_indResult(statistic=-4.1322320325847146, pvalue=4.7872826011915651e-05)
"""