-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathword2vec.py
executable file
·185 lines (169 loc) · 7.33 KB
/
word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/python -W all
"""
word2vec.py: process tweets with word2vec vectors
usage: word2vec.py [-x] [-m model-file [-l word-vector-length]] -w word-vector-file -T train-file -t test-file
notes:
- optional model file is a text file from which the word vector file is built
- option x writes tokenized sentences to stdout
20170504 erikt(at)xs4all.nl
"""
# import modules & set up logging
import gensim
import getopt
import logging
import numpy
import naiveBayes
import os.path
import re
import sys
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
# constants
COMMAND = "word2vec.py"
TWEETCOLUMN = 4 # column tweet text in test data file dutch-2012.csv
CLASSCOLUMN = 9 # column tweeting behaviour (T3) in file dutch-2012.csv
IDCOLUMN = 0 # column with the id of the current tweet
PARENTCOLUMN = 5 # column of the id of the parent of the tweet if it is a retweet or reply (otherwise: None)
HASHEADING = True
MINCOUNT = 2 # minimum token frequency: passed to gensim (min_count) and to naiveBayes.selectFeatures
# the usage message lists every option accepted by checkOptions() ("T:t:w:m:l:x"),
# in the same form as the usage line in the module docstring
USAGE = "usage: "+COMMAND+" [-x] [-m model-file [-l word-vector-length]] -w word-vector-file -T train-file -t test-file\n"
# input file names (filled in by checkOptions)
trainFile = ""
testFile = ""
wordvectorFile = ""
modelFile = ""
# length of word vectors (default; changed by option -l or by the header of a fasttext .vec file)
maxVector = 200
# exporting tokenized sentences (option -x)
exportTokens = False
selectedTokens = {}
# check for command line options
def checkOptions():
    """
    Parse the command line options and store them in the module globals.

    Recognized options:
      -T file   training data file      (required, sets trainFile)
      -t file   test data file          (required, sets testFile)
      -w file   word vector file        (required, sets wordvectorFile)
      -m file   model file              (optional, sets modelFile)
      -l int    word vector length      (optional, sets maxVector)
      -x        export tokens           (optional, sets exportTokens)

    Exits with the usage message when an option is unknown or malformed,
    when the value of -l is not an integer, or when one of the required
    options is missing.
    """
    global trainFile
    global testFile
    global wordvectorFile
    global modelFile
    global maxVector
    global exportTokens
    # getopt expects the argument list WITHOUT the program name: it stops at
    # the first non-option argument, so when given sys.argv itself it stops
    # at the script name and never sees any option
    try: options = getopt.getopt(sys.argv[1:],"T:t:w:m:l:x",[])[0]
    except getopt.GetoptError: sys.exit(USAGE)
    for name, value in options:
        if name == "-T": trainFile = value
        elif name == "-t": testFile = value
        elif name == "-w": wordvectorFile = value
        elif name == "-m": modelFile = value
        elif name == "-l":
            # a non-numeric length is a usage error, not a crash
            try: maxVector = int(value)
            except ValueError: sys.exit(USAGE)
        elif name == "-x": exportTokens = True
    if trainFile == "" or testFile == "" or wordvectorFile == "":
        sys.exit(USAGE)
# create data matrix (no sparse version needed)
def makeVectors(tokenizeResults,wordvecModel,selectedTokens):
    """
    Build a dense matrix with one row per tweet: the sum of the word vectors
    of the distinct tokens of that tweet.

    tokenizeResults: sequence of token sequences, one per tweet
    wordvecModel:    container supporting "token in model" and model[token];
                     each vector is added to a row of length maxVector
                     (assumes the vectors have that length)
    selectedTokens:  container of the tokens that may be used; other tokens
                     are ignored

    A token contributes at most once per tweet. Tokens missing from the
    word vector model or from selectedTokens contribute nothing, so a tweet
    without usable tokens keeps an all-zero row.

    Returns a numpy float64 array of shape (len(tokenizeResults), maxVector).
    """
    tweetVectors = numpy.zeros((len(tokenizeResults),maxVector),dtype=numpy.float64)
    # process all tweets
    for i, tokens in enumerate(tokenizeResults):
        # tokens already added to THIS tweet; it must be reset for every
        # tweet, otherwise a token would only count for the first tweet
        # that contains it
        seen = set()
        # process all tokens in this tweet
        for token in tokens:
            if token in seen: continue
            # if the token is present in the word vector model and was selected
            if token in wordvecModel and token in selectedTokens:
                # add (+) the word vector of this token to the tweet vector
                tweetVectors[i] += wordvecModel[token]
                seen.add(token)
    # the result: a tweet vector which is the sum of its token vectors
    return(tweetVectors)
# change the class vector into a binary vector
def makeBinary(vector):
    """
    Return a new list with one number per class label in vector:
    0 for labels equal to naiveBayes.OTHER and 1 for all other labels.
    """
    return([0 if label == naiveBayes.OTHER else 1 for label in vector])
# read wordvector file from file in format of fasttext:
# first line: nbrOfVectors vectorLength; rest: token vector
def readFasttextModel(wordvectorFile):
    """
    Read word vectors from a text file in fasttext (.vec) format.

    The first line holds two integers: the number of vectors and the vector
    length. Every following line holds a token followed by the components of
    its vector, separated by white space. The global maxVector is set to the
    vector length found on the first line.

    Returns a dictionary which maps each token to a list of floats. An empty
    file gives an empty dictionary and leaves maxVector unchanged.

    Exits with an error message when the file cannot be opened, when the
    first line does not consist of two integers, or when another line does
    not consist of a token followed by maxVector numbers.
    """
    global maxVector
    try: inFile = open(wordvectorFile,"r")
    except IOError: sys.exit(COMMAND+": cannot read file "+wordvectorFile)
    wordvectorModel = {}
    # the with block closes the file on every path, including the error exits
    with inFile:
        for lineCounter, line in enumerate(inFile, 1):
            line = line.rstrip()
            fields = line.split()
            if lineCounter == 1:
                if len(fields) != 2: sys.exit(COMMAND+": unexpected first line of file "+wordvectorFile+": "+line)
                try:
                    int(fields[0]) # number of vectors: only checked for being a number
                    maxVector = int(fields[1])
                except ValueError: sys.exit(COMMAND+": unexpected first line of file "+wordvectorFile+": "+line)
            else:
                if len(fields) != 1+maxVector: sys.exit(COMMAND+": unexpected line in file "+wordvectorFile+": "+line)
                token = fields[0]
                # a non-numeric component is reported like any other malformed line
                try: wordvectorModel[token] = [float(value) for value in fields[1:]]
                except ValueError: sys.exit(COMMAND+": unexpected line in file "+wordvectorFile+": "+line)
    return(wordvectorModel)
# main function starts here
# steps: parse options, optionally train word vectors, load the word vectors,
# turn the train and test tweets into summed word vectors and run one binary
# svm experiment per target class, writing the results to testFile.out.CLASS
checkOptions()
# get target classes from training data file
targetClasses = naiveBayes.getTargetClasses(trainFile)
if len(targetClasses) == 0: sys.exit(COMMAND+": cannot find target classes\n")
# if required: train the word vector model and save it to file
if modelFile != "":
    # read the model data
    readDataResults = naiveBayes.readData(modelFile,targetClasses[0])
    # tokenize the model data
    tokenizeResults = naiveBayes.tokenize(readDataResults["text"])
    # build the word vectors (test sg=1,window=10)
    wordvecModel = gensim.models.Word2Vec(tokenizeResults, min_count=MINCOUNT, size=maxVector)
    # save the word vectors
    wordvecModel.save(wordvectorFile)
# load the word vector model from file: the file name extension .vec selects the fasttext text format
patternNameVec = re.compile(r"\.vec$")
if not patternNameVec.search(wordvectorFile):
    print >> sys.stderr,"loading gensim vector model from file: %s" % (wordvectorFile)
    # read standard file format from gensim
    wordvecModel = gensim.models.Word2Vec.load(wordvectorFile)
else:
    print >> sys.stderr,"loading fasttext vector model from file: %s" % (wordvectorFile)
    # read file format from fasttext
    wordvecModel = readFasttextModel(wordvectorFile)
# read training data, tokenize data, make vector matrix
readDataResults = naiveBayes.readData(trainFile,"")
tokenizeResults = naiveBayes.tokenize(readDataResults["text"])
# check if we need to export tokens
if exportTokens:
    # one line per training tweet: __label__CLASS followed by the tokens
    for i in range(0,len(tokenizeResults)):
        sys.stdout.write("__label__"+readDataResults["classes"][i])
        for j in range(0,len(tokenizeResults[i])):
            sys.stdout.write(" ")
            sys.stdout.write(unicode(tokenizeResults[i][j]).encode('utf8'))
        sys.stdout.write("\n")
    # exporting replaces the experiments: stop here
    sys.exit()
# select tokens to be used in model, based on token frequency
selectedTokens = naiveBayes.selectFeatures(tokenizeResults,MINCOUNT)
makeVectorsResultsTrain = makeVectors(tokenizeResults,wordvecModel,selectedTokens)
# the matrix can be saved to file and reloaded in next runs but this does not gain much time
# read test data, tokenize data, make vector matrix
readDataResults = naiveBayes.readData(testFile,"")
tokenizeResults = naiveBayes.tokenize(readDataResults["text"])
makeVectorsResultsTest = makeVectors(tokenizeResults,wordvecModel,selectedTokens)
# run binary svm experiments: one for each target class
for targetClass in targetClasses:
    # read the training and test file again to get the right class distribution for this target class
    readDataResultsTrain = naiveBayes.readData(trainFile,targetClass)
    readDataResultsTest = naiveBayes.readData(testFile,targetClass)
    # get binary version of train classes
    binTrainClasses = makeBinary(readDataResultsTrain["classes"])
    # perform svm experiment: http://scikit-learn.org/stable/modules/svm.html (1.4.1.1)
    clf = svm.SVC(decision_function_shape='ovo') # definition
    clf.fit(makeVectorsResultsTrain,binTrainClasses) # training
    scores = clf.decision_function(makeVectorsResultsTest) # process all test items
    # output file for test results; the with block closes it even when writing fails
    with open(testFile+".out."+targetClass,"w") as outFile:
        for i in range(0,len(makeVectorsResultsTest)):
            # a score of zero or more is taken as the target class, a negative score as other ("O")
            guess = "O"
            if scores[i] >= 0: guess = targetClass
            print >>outFile, "# %d: %s %s %0.3f" % (i,readDataResultsTest["classes"][i],guess,scores[i])