Skip to content

Commit

Permalink
bug fix; code re-apply-pylint; chatbot update docs;
Browse files Browse the repository at this point in the history
  • Loading branch information
qhduan committed Mar 8, 2018
1 parent 108a384 commit 688e5f5
Show file tree
Hide file tree
Showing 10 changed files with 76 additions and 91 deletions.
15 changes: 14 additions & 1 deletion chatbot/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ python3 extract_conv.py

运行 `python3 train.py` 训练(默认到`./s2ss_chatbot.ckpt`)

或者!

运行 `python3 train_anti.py` 训练抗语言模型(默认到`./s2ss_chatbot_anti.ckpt`)

## 5、测试数据(测试对话)

运行 `python3 test.py` 查看测试结果
运行 `python3 test.py` 查看测试结果,需要提前训练普通模型

或者!

运行 `python3 test_anti.py` 查看抗语言模型的测试结果,需要提前训练抗语言模型

或者!

运行 `python3 test_compare.py` 查看普通模型和抗语言模型的对比测试结果,
需要提前训练两个模型
20 changes: 6 additions & 14 deletions chatbot/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

import numpy as np
import tensorflow as tf
import jieba
from nltk.tokenize import word_tokenize
# import jieba
# from nltk.tokenize import word_tokenize

sys.path.append('..')

Expand Down Expand Up @@ -85,17 +85,9 @@ def test(bidirectional, cell_type, depth,
print(ws.inverse_transform(x[0]))
# print(ws.inverse_transform(pred[0]))
# print(pred.shape, prob.shape)
for i in range(len(pred)):
ans = ws.inverse_transform(pred[i])
for p in pred:
ans = ws.inverse_transform(p)
print(ans)
# p = prob[i]
# if '</s>' in ans:
# print(ans.index('</s>'))
# p = p[:ans.index('</s>')]
# pp = 1
# for ppp in p:
# pp *= ppp
# print(pp, np.mean(p))


def main():
Expand All @@ -106,12 +98,12 @@ def main():
test(
bidirectional=False,
cell_type='lstm',
depth=1,
depth=2,
attention_type='Bahdanau',
use_residual=False,
use_dropout=False,
time_major=False,
hidden_units=1024
hidden_units=512
)


Expand Down
16 changes: 4 additions & 12 deletions chatbot/test_anti.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

import numpy as np
import tensorflow as tf
import jieba
from nltk.tokenize import word_tokenize
# import jieba
# from nltk.tokenize import word_tokenize

sys.path.append('..')

Expand Down Expand Up @@ -85,17 +85,9 @@ def test(bidirectional, cell_type, depth,
print(ws.inverse_transform(x[0]))
# print(ws.inverse_transform(pred[0]))
# print(pred.shape, prob.shape)
for i in range(len(pred)):
ans = ws.inverse_transform(pred[i])
for p in pred:
ans = ws.inverse_transform(p)
print(ans)
# p = prob[i]
# if '</s>' in ans:
# print(ans.index('</s>'))
# p = p[:ans.index('</s>')]
# pp = 1
# for ppp in p:
# pp *= ppp
# print(pp, np.mean(p))


def main():
Expand Down
2 changes: 1 addition & 1 deletion chatbot/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from sklearn.utils import shuffle
# from sklearn.utils import shuffle

sys.path.append('..')

Expand Down
2 changes: 1 addition & 1 deletion chatbot/train_anti.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from sklearn.utils import shuffle
# from sklearn.utils import shuffle

sys.path.append('..')

Expand Down
12 changes: 9 additions & 3 deletions data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,14 +113,20 @@ def batch_flow(data, ws, batch_size, raw=False):

def batch_flow_bucket(data, ws, batch_size, raw=False,
n_buckets=5, bucket_ind=1, debug=False):
"""batch_flow的bucket版本"""
"""batch_flow的bucket版本
多了两重要参数,一个是n_buckets,一个是bucket_ind
n_buckets是分成几个buckets,理论上n_buckets == 1时就相当于没有进行buckets操作
bucket_ind是指定哪一维度的输入数据作为bucket的依据
"""

all_data = list(zip(*data))
lengths = sorted(list(set([len(x[bucket_ind]) for x in all_data])))
if n_buckets > len(lengths):
n_buckets = len(lengths)

splits = np.array(lengths)[(np.linspace(0, 1, 5, endpoint=False) * len(lengths)).astype(int)].tolist()
splits = np.array(lengths)[
(np.linspace(0, 1, 5, endpoint=False) * len(lengths)).astype(int)
].tolist()
splits += [np.inf]

if debug:
Expand Down Expand Up @@ -195,7 +201,7 @@ def test_batch_flow_bucket():
flow = batch_flow_bucket(
[x_data, y_data], [ws_input, ws_target], 4,
debug=True)
for i in range(10):
for _ in range(10):
x, xl, y, yl = next(flow)
print(x.shape, y.shape, xl.shape, yl.shape)

Expand Down
68 changes: 26 additions & 42 deletions rnn_crf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import numpy as np
import tensorflow as tf
from tensorflow import layers
# from tensorflow import layers
from tensorflow.contrib.rnn import LSTMCell
from tensorflow.contrib.rnn import GRUCell
from tensorflow.contrib.rnn import MultiRNNCell
Expand Down Expand Up @@ -56,6 +56,8 @@ def __init__(self,
use_residual=False,
optimizer='adam',
learning_rate=0.001,
min_learning_rate=1e-6,
decay_steps=500000,
max_gradient_norm=5.0,
bidirectional=False,
output_project_active=None,
Expand Down Expand Up @@ -120,6 +122,8 @@ def __init__(self,
self.mode = mode
self.optimizer = optimizer
self.learning_rate = learning_rate
self.min_learning_rate = min_learning_rate
self.decay_steps = decay_steps
self.max_gradient_norm = max_gradient_norm
self.keep_prob = 1.0 - dropout
self.bidirectional = bidirectional
Expand Down Expand Up @@ -175,6 +179,8 @@ def build_model(self):
if self.mode == 'train':
self.init_optimizer()

self.saver = tf.train.Saver()


def init_placeholders(self):
"""初始化训练、预测所需的变量
Expand Down Expand Up @@ -302,20 +308,6 @@ def build_encoder(self):
ids=self.encoder_inputs
)

# Input projection layer to feed embedded inputs to the cell
# ** Essential when use_residual=True to match input/output dims
# 输入投影层
# 如果使用了residual,为了对齐输入和输出层,这里可能必须增加一个投影
input_layer = layers.Dense(
self.hidden_units, dtype=tf.float32, name='input_projection'
)
self.input_layer = input_layer

# Embedded inputs having gone through input projection layer
self.encoder_inputs_embedded = input_layer(
self.encoder_inputs_embedded
)

# Encode input sequences into context vectors:
# encoder_outputs: [batch_size, max_time_step, cell_output_size]
# encoder_state: [batch_size, cell_output_size]
Expand Down Expand Up @@ -478,29 +470,13 @@ def build_decoder_crf(self):

def save(self, sess, save_path='model.ckpt'):
"""保存模型"""

# if not os.path.exists(save_path):
# os.makedirs(save_path)

saver = tf.train.Saver()
save_path = saver.save(sess,
save_path=save_path) #,
# global_step=self.global_step)
self.saver.save(sess, save_path=save_path)


def load(self, sess, save_path='model.ckpt'):
"""读取模型"""
# if not os.path.exists(save_path):
# print('没有找到模型路径', save_path)
# return

print('try load model from', save_path)
# ckpt = tf.train.get_checkpoint_state(save_path)
saver = tf.train.Saver()
# saver = tf.train.import_meta_graph(save_path)
# saver.restore(sess, save_path=ckpt.model_checkpoint_path)
# saver = tf.train.import_meta_graph(save_path + '.meta')
saver.restore(sess, save_path)
self.saver.restore(sess, save_path)


def check_feeds(self, encoder_inputs, encoder_inputs_length,
Expand Down Expand Up @@ -579,33 +555,41 @@ def init_optimizer(self):
"""初始化优化器
支持的方法有 sgd, adadelta, adam, rmsprop, momentum
"""
# print("setting optimizer..")
# Gradients and SGD update operation for training the model

# 学习率下降算法
learning_rate = tf.train.polynomial_decay(
self.learning_rate,
self.global_step,
self.decay_steps,
self.min_learning_rate,
power=0.5
)
self.current_learning_rate = learning_rate

# 设置优化器,合法的优化器如下
# 'adadelta', 'adam', 'rmsprop', 'momentum', 'sgd'
trainable_params = tf.trainable_variables()
if self.optimizer.lower() == 'adadelta':
self.opt = tf.train.AdadeltaOptimizer(
learning_rate=self.learning_rate)
learning_rate=learning_rate)
elif self.optimizer.lower() == 'adam':
self.opt = tf.train.AdamOptimizer(
learning_rate=self.learning_rate)
learning_rate=learning_rate)
elif self.optimizer.lower() == 'rmsprop':
self.opt = tf.train.RMSPropOptimizer(
learning_rate=self.learning_rate)
learning_rate=learning_rate)
elif self.optimizer.lower() == 'momentum':
self.opt = tf.train.MomentumOptimizer(
learning_rate=self.learning_rate, momentum=0.9)
learning_rate=learning_rate, momentum=0.9)
elif self.optimizer.lower() == 'sgd':
self.opt = tf.train.GradientDescentOptimizer(
learning_rate=self.learning_rate)
learning_rate=learning_rate)

# Compute gradients of loss w.r.t. all trainable variables
gradients = tf.gradients(self.loss, trainable_params)

# Clip gradients by a given maximum_gradient_norm
clip_gradients, _ = tf.clip_by_global_norm(
gradients, self.max_gradient_norm)

# Update the model
self.updates = self.opt.apply_gradients(
zip(clip_gradients, trainable_params),
Expand Down
10 changes: 7 additions & 3 deletions sequence_to_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@
https://github.com/tensorflow/tensor2tensor
"""


import math

import numpy as np
import tensorflow as tf
from tensorflow import layers
Expand Down Expand Up @@ -582,6 +579,13 @@ def build_decoder(self):
dtype=tf.float32
)

# 使用 residual 的时候,对齐输入
if self.use_residual:
self.decoder_embeddings = tf.layers.dense(
self.decoder_embeddings,
self.hidden_units * 2
)

# On Using Very Large Target Vocabulary
# for Neural Machine Translation
# https://arxiv.org/pdf/1412.2007v2.pdf
Expand Down
18 changes: 6 additions & 12 deletions threadedgenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,11 @@
# A simple generator wrapper, not sure if it's good for anything at all.
# With basic python threading
from threading import Thread

try:
from queue import Queue

except ImportError:
from Queue import Queue
from queue import Queue

# ... or use multiprocessing versions
# WARNING: use sentinel based on value, not identity
from multiprocessing import Process, Queue as MpQueue
# from multiprocessing import Process, Queue as MpQueue


class ThreadedGenerator(object):
Expand All @@ -29,9 +24,7 @@ class ThreadedGenerator(object):
def __init__(self, iterator,
sentinel=object(),
queue_maxsize=0,
daemon=False,
Thread=Thread,
Queue=Queue):
daemon=False):
self._iterator = iterator
self._sentinel = sentinel
self._queue = Queue(maxsize=queue_maxsize)
Expand Down Expand Up @@ -61,7 +54,7 @@ def close(self):
self._queue.get(timeout=0)
except KeyboardInterrupt as e:
raise e
except:
except: # pylint: disable=bare-except
pass
# self._thread.join()

Expand All @@ -84,6 +77,7 @@ def __next__(self):


def test():
"""测试"""

def gene():
i = 0
Expand All @@ -92,7 +86,7 @@ def gene():
i += 1
t = gene()
tt = ThreadedGenerator(t)
for i in range(10):
for _ in range(10):
print(next(tt))
tt.close()
# for i in range(10):
Expand Down
4 changes: 2 additions & 2 deletions word_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ def fit(self, sentences, min_count=5, max_count=None, max_features=None):

if isinstance(max_features, int):
count = sorted(list(count.items()), key=lambda x: x[1])
if len(count) > max_features:
count = count[-max_features:]
if max_features is not None and len(count) > max_features:
count = count[-int(max_features):]
for w, _ in count:
self.dict[w] = len(self.dict)
else:
Expand Down

0 comments on commit 688e5f5

Please sign in to comment.