rnn.py
import numpy as np
import math
np.random.seed(42)
import itertools
import tensorflow as tf
tf.set_random_seed(42)
from sklearn.preprocessing import scale
from keras.models import Sequential, load_model
from keras.preprocessing.text import one_hot, text_to_word_sequence
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout, GRU
from keras.layers import TimeDistributed
from keras.layers.core import Dense, Activation, Dropout, RepeatVector
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import sys
import heapq
import seaborn as sns
from gensim.models.word2vec import Word2Vec
from pylab import rcParams
from joblib import Parallel, delayed
from utils import constants
from utils.helpers import load_wv, memoize, script_tokenizer
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 12, 5
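# Train a recurrent next-block prediction model over tokenized scripts: each block
# (token) is encoded either as a one-hot vector over the vocabulary or as a
# pre-trained word2vec block vector, and the network predicts the block that
# follows each window of SEQ_SIZE blocks.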
# pad each script with beginning and end symbols
def pad_script(script):
    # append the end symbol, then prepend SEQ_SIZE - 1 start symbols
    script.append(constants.END_SYMBOL)
    for _ in range(0, SEQ_SIZE-1):
        script.insert(0, constants.START_SYMBOL)
    return script
def load_data(file_path, num_scripts=100000, padding=False):
    print(f'loading {num_scripts} lines of data..')
    tokens_list = []
    with open(file_path, 'r') as f:
        for idx, line in enumerate(f.readlines()):
            # line_words = text_to_word_sequence(line, filters='\n', lower=False)
            line_words = script_tokenizer(line)
            if (padding): line_words = pad_script(line_words)
            tokens_list.append(line_words)
            if (idx >= num_scripts): break
    all_tokens = itertools.chain.from_iterable(tokens_list)
    word_to_id = {token: idx for idx, token in enumerate(set(all_tokens))}
    # convert token lists to token-id lists, e.g. [[1, 2], [2, 2]]
    token_ids = [[word_to_id[token] for token in tokens_doc] for tokens_doc in tokens_list]
    return token_ids, word_to_id
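# Two interchangeable token encoders follow: one_hot_encode maps a token id to a
# boolean one-hot vector over the vocabulary, while embedding_encode looks the token
# up in the pre-trained word2vec block vectors. Both are memoized since the same
# token ids recur across many sequences.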
@memoize
def one_hot_encode(number):
    vec = np.zeros(VOCAB_SIZE, dtype=bool)
    np.put(vec, number, True)
    return vec
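# placeholder; overwritten with the loaded word2vec vectors below when ENCODER == embedding_encode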
blockVectors = False
@memoize
def embedding_encode(number):
    # token id -> block token -> pre-trained block vector
    block = x_word[number]
    return blockVectors[block]
def encode_dataset(x, encoder):
    encodedSet = []
    failedCounter = 0
    for seq in x:
        encoded = []
        for blockNumber in seq:
            try:
                encBlock = encoder(blockNumber)
                encoded.append(encBlock)
            except Exception as e:
                print(e)
                failedCounter += 1
        encodedSet.append(encoded)
    if (failedCounter > 10): print(f'failed to encode {failedCounter} block instances')
    return encodedSet
# split a list randomly into two with the given ratio
def split_data(aList, ratio=0.8):
    np.random.shuffle(aList)
    splitPoint = int(ratio * len(aList))
    return aList[0:splitPoint], aList[splitPoint:]
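# note: split_data is not called below; model.fit's validation_split handles the train/test split instead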
# build the training pairs: sliding windows of SEQ_SIZE blocks and, for each window, the block that follows it
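# e.g. with SEQ_SIZE = 4, a script [a, b, c, d, e, f] yields the pairs ([a, b, c, d], e) and ([b, c, d, e], f)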
def build_xy(scripts_list):
    sequences = []
    next_words = []
    for script in scripts_list:
        scriptSize = len(script)
        for idx in range(0, scriptSize-SEQ_SIZE):
            sequences.append(script[idx:idx+SEQ_SIZE])
            next_words.append(script[idx+SEQ_SIZE])
    return sequences, next_words
def plot():
    print('plotting..')
    plt.plot(history['acc'])
    plt.plot(history['val_acc'])
    plt.title(f'Model Accuracy. {configToString()}')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.savefig(constants.RESULTS_DIR + f'/acc-{configToString()}.png', bbox_inches='tight')
    plt.close()  # clear the accuracy figure before plotting loss
    plt.plot(history['loss'])
    plt.plot(history['val_loss'])
    plt.title(f'Model Loss. {configToString()}')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.savefig(constants.RESULTS_DIR + f'/loss-{configToString()}.png', bbox_inches='tight')
# helper to serialize the current config into a string for filenames and logging
def configToString():
    return f'{ENCODER.__name__}-{LOSS}-{DATASET_SIZE}-{BATCH_SIZE}-{LSTM_UNITS}-{BLOCK_VEC_SIZE}-{SEQ_SIZE}-{EPOCHS}-{PADDING}'
#### parameters
SEQ_SIZE = 4
PADDING = False
VOCAB_SIZE = 'unknown' # will be set after loading the dataset
# model params
LSTM_UNITS = 128
EPOCHS = 3
BATCH_SIZE = 128
DATASET_SIZE = 500 * 1000
VALIDATION_SPLIT = 0.1
ENCODER = embedding_encode
OPTIMIZER = RMSprop(lr=0.01) # 'adam'
DROPOUT = 0.01
RNN_CELL = GRU
x, word_x = load_data(constants.INPUT, num_scripts=DATASET_SIZE, padding=PADDING)
VOCAB_SIZE = len(word_x) + 2 if (PADDING) else len(word_x) # calc vocab size
## auto set
if (ENCODER == embedding_encode):
    blockVectors = load_wv(False)
    BLOCK_VEC_SIZE = constants.WORD2VEC_SIZE  # word2vec encoding: block vector size equals the embedding size
    LOSS = 'mse'
else:
    BLOCK_VEC_SIZE = VOCAB_SIZE
    LOSS = 'categorical_crossentropy'
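# with word2vec targets the model regresses block vectors (mse loss); with one-hot targets it is a classifier (categorical cross-entropy)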
print('Running config:', configToString())
x_word = {v: k for k, v in word_x.items()}
print('Encoding the words..')
encoded_x = encode_dataset(x, ENCODER)
print('Building the y labels..')
xs, ys = build_xy(encoded_x)
# make it a numpy array
xs = np.array(xs)
ys = np.array(ys)
print('Sample training sequence', xs[0])
print(math.floor(len(xs)*(1-VALIDATION_SPLIT)), 'train sequences')
print(math.floor(len(xs)*VALIDATION_SPLIT), 'test sequences')
print('X shape', np.shape(xs))
print('Y shape', np.shape(ys))
print('Building the model')
model = Sequential()
# model.add(Embedding(VOCAB_SIZE, 100, input_length=10))
model.add(RNN_CELL(LSTM_UNITS, dropout=DROPOUT, recurrent_dropout=0, input_shape=(SEQ_SIZE, BLOCK_VEC_SIZE)))
model.add(Dense(BLOCK_VEC_SIZE))
model.add(Activation('softmax'))
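# architecture: a single recurrent layer (GRU by default) reads the SEQ_SIZE encoded blocks,
# followed by a dense projection to BLOCK_VEC_SIZE and a softmax over the outputs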
print('Training..')
model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=['accuracy'])
history = model.fit(xs, ys, validation_split=VALIDATION_SPLIT, batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True).history
print(model.summary())
print('saving the model and history..')
model.save( constants.RESULTS_DIR + f'/rnn-{configToString()}.h5')
pickle.dump(history, open( constants.RESULTS_DIR + f"/history-{configToString()}.p", "wb"))
# load em back
model = load_model( constants.RESULTS_DIR + f'/rnn-{configToString()}.h5')
history = pickle.load(open( constants.RESULTS_DIR + f"/history-{configToString()}.p", "rb"))
plot()
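# Added illustrative sketch (not part of the original pipeline): query the trained model
# for a single next-block prediction. It assumes the one-hot encoder is in use, so the
# softmax output indexes directly into the vocabulary.
if ENCODER == one_hot_encode and len(xs) > 0:
    sample = xs[:1]                          # one encoded window, shape (1, SEQ_SIZE, VOCAB_SIZE)
    pred = model.predict(sample)[0]          # softmax distribution over the vocabulary
    print('sample next-block prediction:', x_word[int(np.argmax(pred))])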