train.py
# Imports (unused ones removed)
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout
from keras.layers import LSTM
from keras.utils import to_categorical
import numpy as np
import json
import os
kindly_bot = "src/kindly_bot.json"
with open(kindly_bot, encoding="utf8") as json_file:
    data = json.load(json_file)
user_input_array = []
bot_answer_array = []
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for dialogue in data["dialogues"]:
    if "nb" in dialogue["samples"]:
        label_id = len(labels_index)
        labels_index[dialogue["title"]] = label_id
        for sample in dialogue["samples"]["nb"]:
            user_input_array.append(sample)
            bot_answer_array.append(dialogue["replies"]["nb"])
            labels.append(label_id)
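
# For reference: the parsing above implies kindly_bot.json has roughly this
# shape ("nb" being the Norwegian Bokmål locale key). The example values are
# hypothetical, inferred from the code rather than from the actual file:
#
#   {"dialogues": [
#       {"title": "greeting",
#        "samples": {"nb": ["hei", "hallo"]},
#        "replies": {"nb": ["Hei! Hva kan jeg hjelpe deg med?"]}},
#       ...
#   ]}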
# Sentences are padded/truncated to 15 words
MAX_SEQUENCE_LEN = 15
# Only the 4,000 most frequent words in the input file are kept by the tokenizer
MAX_NUM_WORDS = 4000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(user_input_array)
word_index = tokenizer.word_index
# Dump the word -> index dictionary as JSON so the same mapping can be reused at inference time
os.makedirs('model', exist_ok=True)  # the model/ directory must exist before writing into it
with open('model/dictionary.json', 'w', encoding='utf8') as dictionary_file:
    json.dump(word_index, dictionary_file, ensure_ascii=False)
# Convert each sentence into a sequence of word indices for the network.
# Out-of-vocabulary words and words beyond the tokenizer's cap are skipped,
# so every index stays within the Embedding layer's input range.
def convert_text_to_index_array(text):
    return [word_index[word] for word in text_to_word_sequence(text)
            if word in word_index and word_index[word] < MAX_NUM_WORDS]

allSequences = [convert_text_to_index_array(text) for text in user_input_array]
# Pad each sequence of word indices to a fixed length
data = pad_sequences(allSequences, maxlen=MAX_SEQUENCE_LEN)
# The intent labels are turned into one-hot encoded vectors
labels = to_categorical(labels, num_classes=len(labels_index))
# Shuffle so that the dataset ordering isn't predictable for the network
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
VALIDATION_SPLIT = 0.2
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# A separate held-out test set would be better practice, but to keep as much
# data as possible in the training set, the validation set doubles as the test set.
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
# Sanity-check shapes before building the model
print(x_train.shape)
print(len(labels_index))
print(str(x_train[0]))
print('Build model...')
model = Sequential()
model.add(Embedding(MAX_NUM_WORDS, 512))
model.add(LSTM(512, return_sequences=True))   # returns a sequence of vectors of dimension 512
model.add(Dropout(0.2))
model.add(LSTM(1024, return_sequences=True))  # returns a sequence of vectors of dimension 1024
model.add(Dropout(0.2))
model.add(LSTM(512))  # returns a single vector of dimension 512
model.add(Dense(len(labels_index), activation='softmax'))  # one output unit per intent class
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
print(model.summary())
print('Train...')
batch_size = 24
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=20,
          validation_data=(x_val, y_val),
          shuffle=True)
score, acc = model.evaluate(x_val, y_val, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
# Serialize the model architecture to JSON
model_json = model.to_json()
with open("model/model.json", "w") as json_file:
    json_file.write(model_json)
# Serialize the weights to HDF5
model.save_weights("model/model.h5")
print("Saved model to disk")