bug fix; code re-apply-pylint; chatbot update docs;

qhduan · Mar 8, 2018 · 688e5f5 · 688e5f5
1 parent 108a384
commit 688e5f5
Show file tree

Hide file tree

Showing 10 changed files with 76 additions and 91 deletions.
diff --git a/chatbot/README.md b/chatbot/README.md
@@ -31,6 +31,19 @@ python3 extract_conv.py
 
 运行 `python3 train.py` 训练（默认到`./s2ss_chatbot.ckpt`）
 
+或者！
+
+运行 `python3 train_anti.py` 训练抗语言模型（默认到`./s2ss_chatbot_anti.ckpt`）
+
 ## 5、测试数据（测试对话）
 
-运行 `python3 test.py` 查看测试结果
+运行 `python3 test.py` 查看测试结果，需要提前训练普通模型
+
+或者！
+
+运行 `python3 test_anti.py` 查看抗语言模型的测试结果，需要提前训练抗语言模型
+
+或者！
+
+运行 `python3 test_compare.py` 查看普通模型和抗语言模型的对比测试结果，
+需要提前训练两个模型
diff --git a/chatbot/test.py b/chatbot/test.py
@@ -8,8 +8,8 @@
 
 import numpy as np
 import tensorflow as tf
-import jieba
-from nltk.tokenize import word_tokenize
+# import jieba
+# from nltk.tokenize import word_tokenize
 
 sys.path.append('..')
 
@@ -85,17 +85,9 @@ def test(bidirectional, cell_type, depth,
             print(ws.inverse_transform(x[0]))
             # print(ws.inverse_transform(pred[0]))
             # print(pred.shape, prob.shape)
-            for i in range(len(pred)):
-                ans = ws.inverse_transform(pred[i])
+            for p in pred:
+                ans = ws.inverse_transform(p)
                 print(ans)
-                # p = prob[i]
-                # if '</s>' in ans:
-                #     print(ans.index('</s>'))
-                #     p = p[:ans.index('</s>')]
-                # pp = 1
-                # for ppp in p:
-                #     pp *= ppp
-                # print(pp, np.mean(p))
 
 
 def main():
@@ -106,12 +98,12 @@ def main():
     test(
         bidirectional=False,
         cell_type='lstm',
-        depth=1,
+        depth=2,
         attention_type='Bahdanau',
         use_residual=False,
         use_dropout=False,
         time_major=False,
-        hidden_units=1024
+        hidden_units=512
     )
 
 

diff --git a/chatbot/test_anti.py b/chatbot/test_anti.py
@@ -8,8 +8,8 @@
 
 import numpy as np
 import tensorflow as tf
-import jieba
-from nltk.tokenize import word_tokenize
+# import jieba
+# from nltk.tokenize import word_tokenize
 
 sys.path.append('..')
 
@@ -85,17 +85,9 @@ def test(bidirectional, cell_type, depth,
             print(ws.inverse_transform(x[0]))
             # print(ws.inverse_transform(pred[0]))
             # print(pred.shape, prob.shape)
-            for i in range(len(pred)):
-                ans = ws.inverse_transform(pred[i])
+            for p in pred:
+                ans = ws.inverse_transform(p)
                 print(ans)
-                # p = prob[i]
-                # if '</s>' in ans:
-                #     print(ans.index('</s>'))
-                #     p = p[:ans.index('</s>')]
-                # pp = 1
-                # for ppp in p:
-                #     pp *= ppp
-                # print(pp, np.mean(p))
 
 
 def main():

diff --git a/chatbot/train.py b/chatbot/train.py
@@ -9,7 +9,7 @@
 import numpy as np
 import tensorflow as tf
 from tqdm import tqdm
-from sklearn.utils import shuffle
+# from sklearn.utils import shuffle
 
 sys.path.append('..')
 

diff --git a/chatbot/train_anti.py b/chatbot/train_anti.py
@@ -9,7 +9,7 @@
 import numpy as np
 import tensorflow as tf
 from tqdm import tqdm
-from sklearn.utils import shuffle
+# from sklearn.utils import shuffle
 
 sys.path.append('..')
 

diff --git a/data_utils.py b/data_utils.py
@@ -113,14 +113,20 @@ def batch_flow(data, ws, batch_size, raw=False):
 
 def batch_flow_bucket(data, ws, batch_size, raw=False,
                       n_buckets=5, bucket_ind=1, debug=False):
-    """batch_flow的bucket版本"""
+    """batch_flow的bucket版本
+    多了两个重要参数，一个是n_buckets，一个是bucket_ind
+    n_buckets是分成几个buckets，理论上n_buckets == 1时就相当于没有进行buckets操作
+    bucket_ind是指定哪一维度的输入数据作为bucket的依据
+    """
 
     all_data = list(zip(*data))
     lengths = sorted(list(set([len(x[bucket_ind]) for x in all_data])))
     if n_buckets > len(lengths):
         n_buckets = len(lengths)
 
-    splits = np.array(lengths)[(np.linspace(0, 1, 5, endpoint=False) * len(lengths)).astype(int)].tolist()
+    splits = np.array(lengths)[
+        (np.linspace(0, 1, 5, endpoint=False) * len(lengths)).astype(int)
+    ].tolist()
     splits += [np.inf]
 
     if debug:
@@ -195,7 +201,7 @@ def test_batch_flow_bucket():
     flow = batch_flow_bucket(
         [x_data, y_data], [ws_input, ws_target], 4,
         debug=True)
-    for i in range(10):
+    for _ in range(10):
         x, xl, y, yl = next(flow)
         print(x.shape, y.shape, xl.shape, yl.shape)
 

diff --git a/rnn_crf.py b/rnn_crf.py
@@ -13,7 +13,7 @@
 
 import numpy as np
 import tensorflow as tf
-from tensorflow import layers
+# from tensorflow import layers
 from tensorflow.contrib.rnn import LSTMCell
 from tensorflow.contrib.rnn import GRUCell
 from tensorflow.contrib.rnn import MultiRNNCell
@@ -56,6 +56,8 @@ def __init__(self,
                  use_residual=False,
                  optimizer='adam',
                  learning_rate=0.001,
+                 min_learning_rate=1e-6,
+                 decay_steps=500000,
                  max_gradient_norm=5.0,
                  bidirectional=False,
                  output_project_active=None,
@@ -120,6 +122,8 @@ def __init__(self,
         self.mode = mode
         self.optimizer = optimizer
         self.learning_rate = learning_rate
+        self.min_learning_rate = min_learning_rate
+        self.decay_steps = decay_steps
         self.max_gradient_norm = max_gradient_norm
         self.keep_prob = 1.0 - dropout
         self.bidirectional = bidirectional
@@ -175,6 +179,8 @@ def build_model(self):
         if self.mode == 'train':
             self.init_optimizer()
 
+        self.saver = tf.train.Saver()
+
 
     def init_placeholders(self):
         """初始化训练、预测所需的变量
@@ -302,20 +308,6 @@ def build_encoder(self):
                 ids=self.encoder_inputs
             )
 
-            # Input projection layer to feed embedded inputs to the cell
-            # ** Essential when use_residual=True to match input/output dims
-            # 输入投影层
-            # 如果使用了residual，为了对齐输入和输出层，这里可能必须增加一个投影
-            input_layer = layers.Dense(
-                self.hidden_units, dtype=tf.float32, name='input_projection'
-            )
-            self.input_layer = input_layer
-
-            # Embedded inputs having gone through input projection layer
-            self.encoder_inputs_embedded = input_layer(
-                self.encoder_inputs_embedded
-            )
-
             # Encode input sequences into context vectors:
             # encoder_outputs: [batch_size, max_time_step, cell_output_size]
             # encoder_state: [batch_size, cell_output_size]
@@ -478,29 +470,13 @@ def build_decoder_crf(self):
 
     def save(self, sess, save_path='model.ckpt'):
         """保存模型"""
-
-        # if not os.path.exists(save_path):
-        #     os.makedirs(save_path)
-
-        saver = tf.train.Saver()
-        save_path = saver.save(sess,
-                               save_path=save_path) #,
-                               # global_step=self.global_step)
+        self.saver.save(sess, save_path=save_path)
 
 
     def load(self, sess, save_path='model.ckpt'):
         """读取模型"""
-        # if not os.path.exists(save_path):
-        #     print('没有找到模型路径', save_path)
-        #     return
-
         print('try load model from', save_path)
-        # ckpt = tf.train.get_checkpoint_state(save_path)
-        saver = tf.train.Saver()
-        # saver = tf.train.import_meta_graph(save_path)
-        # saver.restore(sess, save_path=ckpt.model_checkpoint_path)
-        # saver = tf.train.import_meta_graph(save_path + '.meta')
-        saver.restore(sess, save_path)
+        self.saver.restore(sess, save_path)
 
 
     def check_feeds(self, encoder_inputs, encoder_inputs_length,
@@ -579,33 +555,41 @@ def init_optimizer(self):
         """初始化优化器
         支持的方法有 sgd, adadelta, adam, rmsprop, momentum
         """
-        # print("setting optimizer..")
-        # Gradients and SGD update operation for training the model
+
+        # 学习率下降算法
+        learning_rate = tf.train.polynomial_decay(
+            self.learning_rate,
+            self.global_step,
+            self.decay_steps,
+            self.min_learning_rate,
+            power=0.5
+        )
+        self.current_learning_rate = learning_rate
+
+        # 设置优化器,合法的优化器如下
         # 'adadelta', 'adam', 'rmsprop', 'momentum', 'sgd'
         trainable_params = tf.trainable_variables()
         if self.optimizer.lower() == 'adadelta':
             self.opt = tf.train.AdadeltaOptimizer(
-                learning_rate=self.learning_rate)
+                learning_rate=learning_rate)
         elif self.optimizer.lower() == 'adam':
             self.opt = tf.train.AdamOptimizer(
-                learning_rate=self.learning_rate)
+                learning_rate=learning_rate)
         elif self.optimizer.lower() == 'rmsprop':
             self.opt = tf.train.RMSPropOptimizer(
-                learning_rate=self.learning_rate)
+                learning_rate=learning_rate)
         elif self.optimizer.lower() == 'momentum':
             self.opt = tf.train.MomentumOptimizer(
-                learning_rate=self.learning_rate, momentum=0.9)
+                learning_rate=learning_rate, momentum=0.9)
         elif self.optimizer.lower() == 'sgd':
             self.opt = tf.train.GradientDescentOptimizer(
-                learning_rate=self.learning_rate)
+                learning_rate=learning_rate)
 
         # Compute gradients of loss w.r.t. all trainable variables
         gradients = tf.gradients(self.loss, trainable_params)
-
         # Clip gradients by a given maximum_gradient_norm
         clip_gradients, _ = tf.clip_by_global_norm(
             gradients, self.max_gradient_norm)
-
         # Update the model
         self.updates = self.opt.apply_gradients(
             zip(clip_gradients, trainable_params),

diff --git a/sequence_to_sequence.py b/sequence_to_sequence.py
@@ -22,9 +22,6 @@
 https://github.com/tensorflow/tensor2tensor
 """
 
-
-import math
-
 import numpy as np
 import tensorflow as tf
 from tensorflow import layers
@@ -582,6 +579,13 @@ def build_decoder(self):
                         dtype=tf.float32
                     )
 
+            # 使用 residual 的时候，对齐输入
+            if self.use_residual:
+                self.decoder_embeddings = tf.layers.dense(
+                    self.decoder_embeddings,
+                    self.hidden_units * 2
+                )
+
             # On Using Very Large Target Vocabulary
             # for Neural Machine Translation
             # https://arxiv.org/pdf/1412.2007v2.pdf

diff --git a/threadedgenerator.py b/threadedgenerator.py
@@ -7,16 +7,11 @@
 # A simple generator wrapper, not sure if it's good for anything at all.
 # With basic python threading
 from threading import Thread
-
-try:
-    from queue import Queue
-
-except ImportError:
-    from Queue import Queue
+from queue import Queue
 
 # ... or use multiprocessing versions
 # WARNING: use sentinel based on value, not identity
-from multiprocessing import Process, Queue as MpQueue
+# from multiprocessing import Process, Queue as MpQueue
 
 
 class ThreadedGenerator(object):
@@ -29,9 +24,7 @@ class ThreadedGenerator(object):
     def __init__(self, iterator,
                  sentinel=object(),
                  queue_maxsize=0,
-                 daemon=False,
-                 Thread=Thread,
-                 Queue=Queue):
+                 daemon=False):
         self._iterator = iterator
         self._sentinel = sentinel
         self._queue = Queue(maxsize=queue_maxsize)
@@ -61,7 +54,7 @@ def close(self):
                 self._queue.get(timeout=0)
         except KeyboardInterrupt as e:
             raise e
-        except:
+        except: # pylint: disable=bare-except
             pass
         # self._thread.join()
 
@@ -84,6 +77,7 @@ def __next__(self):
 
 
 def test():
+    """测试"""
 
     def gene():
         i = 0
@@ -92,7 +86,7 @@ def gene():
             i += 1
     t = gene()
     tt = ThreadedGenerator(t)
-    for i in range(10):
+    for _ in range(10):
         print(next(tt))
     tt.close()
     # for i in range(10):

diff --git a/word_sequence.py b/word_sequence.py
@@ -101,8 +101,8 @@ def fit(self, sentences, min_count=5, max_count=None, max_features=None):
 
         if isinstance(max_features, int):
             count = sorted(list(count.items()), key=lambda x: x[1])
-            if len(count) > max_features:
-                count = count[-max_features:]
+            if max_features is not None and len(count) > max_features:
+                count = count[-int(max_features):]
             for w, _ in count:
                 self.dict[w] = len(self.dict)
         else: