尝试添加一个same_person模型用来判断一句话是不是一个人说的，结果并没有什么提高

qhduan · Mar 14, 2018 · 3c15f4b · 3c15f4b
1 parent bd1a350
commit 3c15f4b
Show file tree

Hide file tree

Showing 12 changed files with 768 additions and 78 deletions.
diff --git a/chatbot/gen_same_person.py b/chatbot/gen_same_person.py
@@ -0,0 +1,64 @@
+"""
+判断两句话是不是连续上下文
+"""
+
+import sys
+import pickle
+from tqdm import tqdm
+import numpy as np
+from sklearn.utils import shuffle
+
+sys.path.append('..')
+
+def main():
+    """Build the sentence-pair dataset and write it to ``same_person.pkl``.
+
+    Positive pairs (label 1) come from entries of ``chatbot.pkl`` that split
+    into exactly two parts, each at least 3 characters long, on one of the
+    punctuation marks below.  For every positive pair one negative pair
+    (label 0) is made by pairing one of its halves with a half of a
+    different, randomly chosen positive pair.  The three lists are shuffled
+    together and pickled along with the word-sequence object that was loaded
+    from ``chatbot.pkl``.
+    """
+
+    # Not referenced directly; presumably required so pickle can resolve the
+    # class of the stored word-sequence object -- TODO confirm.
+    from word_sequence import WordSequence  # pylint: disable=unused-variable
+
+    # NOTE(review): pickle must only be used on trusted, locally built files.
+    with open('chatbot.pkl', 'rb') as fp:
+        data, _, ws = pickle.load(fp)
+
+    x1_data = []
+    x2_data = []
+    y_data = []
+
+    for x in tqdm(data, total=len(data)):
+        # Every mark is tried independently, so an entry that contains
+        # several different marks exactly once each yields several pairs.
+        for sign in ('，', '。', '；', '！', '？'):
+            if sign not in x:
+                continue
+            t = ''.join(x).split(sign)
+            if len(t) != 2:
+                continue
+            a, b = t
+            if len(a) >= 3 and len(b) >= 3:
+                x1_data.append(a)
+                x2_data.append(b)
+                y_data.append(1)
+
+    length = len(x1_data)
+    print(length)
+
+    # A negative pair needs a second positive pair to borrow from.  With a
+    # single positive pair the rejection loop below would never terminate,
+    # so negatives are skipped in that case.
+    if length >= 2:
+        for i in range(length):
+            a = x1_data[i] if np.random.random() > 0.5 else x2_data[i]
+
+            # Redraw until the partner index differs from the current one.
+            j = np.random.randint(0, length)
+            while j == i:
+                j = np.random.randint(0, length)
+
+            b = x1_data[j] if np.random.random() > 0.5 else x2_data[j]
+
+            x1_data.append(a)
+            x2_data.append(b)
+            y_data.append(0)
+
+    x1_data, x2_data, y_data = shuffle(x1_data, x2_data, y_data, random_state=0)
+
+    # The context manager guarantees the pickle is flushed and closed.
+    with open('same_person.pkl', 'wb') as fp:
+        pickle.dump((x1_data, x2_data, y_data, ws), fp)
+
+if __name__ == '__main__':
+    main()
diff --git a/chatbot/test_same_person.py b/chatbot/test_same_person.py
@@ -0,0 +1,94 @@
+
+"""
+Load the trained SamePerson model and print its scores for sentence pairs from chatbot.pkl.
+"""
+
+import sys
+import random
+import pickle
+
+import numpy as np
+import tensorflow as tf
+# import jieba
+from tqdm import tqdm
+# from sklearn.utils import shuffle
+
+sys.path.append('..')
+
+
+def test(bidirectional, cell_type, depth,
+         attention_type, use_residual, use_dropout, time_major, hidden_units):
+    """Restore the SamePerson checkpoint and print predictions for 100 batches.
+
+    The vocabulary object is read from ``same_person.pkl`` and the sentence
+    pairs from ``chatbot.pkl``.  For each batch (batch size 1) the raw items
+    returned by ``batch_flow`` are printed, followed by ``ans[0][1]`` of the
+    model prediction formatted to three decimals.
+
+    Args:
+        bidirectional: forwarded to ``SamePerson``.
+        cell_type: forwarded to ``SamePerson``.
+        depth: forwarded to ``SamePerson``.
+        attention_type: accepted for signature compatibility; not used here.
+        use_residual: forwarded to ``SamePerson``.
+        use_dropout: forwarded to ``SamePerson``.
+        time_major: forwarded to ``SamePerson``.
+        hidden_units: forwarded to ``SamePerson``.
+    """
+
+    from sequence_to_sequence import SequenceToSequence # pylint: disable=unused-variable
+    from same_person_model import SamePerson
+    from data_utils import batch_flow
+    from word_sequence import WordSequence # pylint: disable=unused-variable
+
+    # Only the vocabulary object is needed from the same-person dataset.
+    # NOTE(review): pickle must only be used on trusted, locally built files.
+    with open('same_person.pkl', 'rb') as fp:
+        _, _, _, ws = pickle.load(fp)
+
+    with open('chatbot.pkl', 'rb') as fp:
+        x_data, y_data, _ = pickle.load(fp)
+
+    config = tf.ConfigProto(
+        device_count={'CPU': 1, 'GPU': 0},
+        allow_soft_placement=True,
+        log_device_placement=False
+    )
+
+    save_path = './s2ss_chatbot_samperson.ckpt'
+    batch_size = 1
+    steps = 100
+
+    with tf.Graph().as_default():
+        model = SamePerson(
+            input_vocab_size=len(ws),
+            n_target=2,
+            batch_size=batch_size,
+            learning_rate=0.001,
+            bidirectional=bidirectional,
+            cell_type=cell_type,
+            depth=depth,
+            use_residual=use_residual,
+            use_dropout=use_dropout,
+            parallel_iterations=1,
+            time_major=time_major,
+            hidden_units=hidden_units,
+            optimizer='adam'
+        )
+        init = tf.global_variables_initializer()
+
+        # The context manager closes the session even if prediction fails.
+        with tf.Session(config=config) as sess:
+            sess.run(init)
+            model.load(sess, save_path)
+
+            # Prediction only: feed chatbot question/answer pairs as the two
+            # inputs of the model.
+            flow = batch_flow([x_data, y_data], ws, batch_size, raw=True)
+
+            for _ in range(steps):
+                x1, x1l, x1r, x2, x2l, x2r = next(flow)
+
+                # x1r / x2r are presumably the un-encoded inputs returned
+                # because raw=True -- TODO confirm against batch_flow.
+                print(x1r, x2r)
+
+                ans = model.predict(sess, x1, x1l, x2, x2l)
+                # Presumably the score of class 1 for the single batch item.
+                print('{:.3f}'.format(ans[0][1]))
+                print('-' * 30)
+
+
+
+def main():
+    """Entry point: seed every random generator with 0, then run ``test``."""
+    seed = 0
+    tf.set_random_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    test(
+        bidirectional=True,
+        cell_type='lstm',
+        depth=2,
+        attention_type='Bahdanau',
+        use_residual=True,
+        use_dropout=True,
+        time_major=True,
+        hidden_units=256
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/chatbot/test_sp.py b/chatbot/test_sp.py
@@ -0,0 +1,111 @@
+"""
+对SequenceToSequence模型进行基本的参数组合测试
+"""
+
+import sys
+import random
+import pickle
+
+import numpy as np
+import tensorflow as tf
+# import jieba
+# from nltk.tokenize import word_tokenize
+
+sys.path.append('..')
+
+
+def test(bidirectional, cell_type, depth,
+         attention_type, use_residual, use_dropout, time_major, hidden_units):
+    """Restore the SequenceToSequence checkpoint and chat interactively.
+
+    Prints the first five entries of ``chatbot.pkl``, builds the model in
+    ``decode`` mode with batch size 1, loads ``./s2ss_chatbot_sp.ckpt`` and
+    then loops forever: read a sentence from stdin, encode it character by
+    character, predict, and print the decoded result.  Typing ``exit`` or
+    ``quit`` terminates the process with status 0.
+
+    Args:
+        bidirectional: forwarded to ``SequenceToSequence``.
+        cell_type: forwarded to ``SequenceToSequence``.
+        depth: forwarded to ``SequenceToSequence``.
+        attention_type: forwarded to ``SequenceToSequence``.
+        use_residual: forwarded to ``SequenceToSequence``.
+        use_dropout: forwarded to ``SequenceToSequence``.
+        time_major: forwarded to ``SequenceToSequence``.
+        hidden_units: forwarded to ``SequenceToSequence``.
+    """
+
+    from sequence_to_sequence import SequenceToSequence
+    from data_utils import batch_flow
+    from word_sequence import WordSequence # pylint: disable=unused-variable
+
+    # NOTE(review): pickle must only be used on trusted, locally built files.
+    with open('chatbot.pkl', 'rb') as fp:
+        x_data, _, ws = pickle.load(fp)
+
+    for x in x_data[:5]:
+        print(' '.join(x))
+
+    config = tf.ConfigProto(
+        device_count={'CPU': 1, 'GPU': 0},
+        allow_soft_placement=True,
+        log_device_placement=False
+    )
+
+    save_path = './s2ss_chatbot_sp.ckpt'
+
+    # Prediction graph
+    tf.reset_default_graph()
+    model_pred = SequenceToSequence(
+        input_vocab_size=len(ws),
+        target_vocab_size=len(ws),
+        batch_size=1,
+        mode='decode',
+        beam_width=0,
+        bidirectional=bidirectional,
+        cell_type=cell_type,
+        depth=depth,
+        attention_type=attention_type,
+        use_residual=use_residual,
+        use_dropout=use_dropout,
+        parallel_iterations=1,
+        time_major=time_major,
+        hidden_units=hidden_units,
+        share_embedding=True
+    )
+    init = tf.global_variables_initializer()
+
+    with tf.Session(config=config) as sess:
+        sess.run(init)
+        model_pred.load(sess, save_path)
+
+        while True:
+            user_text = input('Input Chat Sentence:')
+            if user_text in ('exit', 'quit'):
+                # sys.exit instead of the site-provided interactive ``exit``
+                # helper, which is not guaranteed to exist in every run mode.
+                sys.exit(0)
+
+            # One lower-cased character per token.
+            x_test = [list(user_text.lower())]
+            bar = batch_flow([x_test], ws, 1)
+            x, xl = next(bar)
+            # Reverse the encoded input along axis 1 before predicting.
+            x = np.flip(x, axis=1)
+            print(x, xl)
+
+            pred = model_pred.predict(
+                sess,
+                np.array(x),
+                np.array(xl)
+            )
+            print(pred)
+            print(ws.inverse_transform(x[0]))
+            for p in pred:
+                ans = ws.inverse_transform(p)
+                print(ans)
+
+
+def main():
+    """Entry point: seed every random generator with 0, then run ``test``."""
+    seed = 0
+    tf.set_random_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+
+    # Fixed hyper-parameters passed to the interactive test.
+    params = {
+        'bidirectional': True,
+        'cell_type': 'lstm',
+        'depth': 2,
+        'attention_type': 'Bahdanau',
+        'use_residual': False,
+        'use_dropout': False,
+        'time_major': False,
+        'hidden_units': 512,
+    }
+    test(**params)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/chatbot/train.py b/chatbot/train.py
@@ -28,7 +28,7 @@ def test(bidirectional, cell_type, depth,
 
     # 训练部分
     n_epoch = 10
-    batch_size = 64
+    batch_size = 4
     # x_data, y_data = shuffle(x_data, y_data, random_state=0)
     # x_data = x_data[:10000]
     # y_data = y_data[:10000]
@@ -74,7 +74,8 @@ def test(bidirectional, cell_type, depth,
             # exit(1)
 
             flow = ThreadedGenerator(
-                batch_flow([x_data, y_data], ws, batch_size),
+                batch_flow([x_data, y_data], ws, batch_size,
+                           add_end=[False, True]),
                 queue_maxsize=30)
 
             for epoch in range(1, n_epoch + 1):
@@ -85,6 +86,8 @@ def test(bidirectional, cell_type, depth,
                     x, xl, y, yl = next(flow)
                     x = np.flip(x, axis=1)
                     # print(x, y)
+                    # print(xl, yl)
+                    # exit(1)
                     cost, lr = model.train(sess, x, xl, y, yl, return_lr=True)
                     costs.append(cost)
                     bar.set_description('epoch {} loss={:.6f} lr={:.6f}'.format(
@@ -124,7 +127,7 @@ def test(bidirectional, cell_type, depth,
         sess.run(init)
         model_pred.load(sess, save_path)
 
-        bar = batch_flow([x_data, y_data], ws, 1)
+        bar = batch_flow([x_data, y_data], ws, 1, add_end=False)
         t = 0
         for x, xl, y, yl in bar:
             x = np.flip(x, axis=1)
@@ -166,7 +169,7 @@ def test(bidirectional, cell_type, depth,
         sess.run(init)
         model_pred.load(sess, save_path)
 
-        bar = batch_flow([x_data, y_data], ws, 1)
+        bar = batch_flow([x_data, y_data], ws, 1, add_end=False)
         t = 0
         for x, xl, y, yl in bar:
             pred = model_pred.predict(

diff --git a/chatbot/train_anti.py b/chatbot/train_anti.py
@@ -74,7 +74,8 @@ def test(bidirectional, cell_type, depth,
             # exit(1)
 
             flow = ThreadedGenerator(
-                batch_flow([x_data, y_data], ws, batch_size),
+                batch_flow([x_data, y_data], ws, batch_size,
+                           add_end=[False, True]),
                 queue_maxsize=30)
 
             dummy_encoder_inputs = np.array([
@@ -136,7 +137,7 @@ def test(bidirectional, cell_type, depth,
         sess.run(init)
         model_pred.load(sess, save_path)
 
-        bar = batch_flow([x_data, y_data], ws, 1)
+        bar = batch_flow([x_data, y_data], ws, 1, add_end=False)
         t = 0
         for x, xl, y, yl in bar:
             x = np.flip(x, axis=1)
@@ -178,7 +179,7 @@ def test(bidirectional, cell_type, depth,
         sess.run(init)
         model_pred.load(sess, save_path)
 
-        bar = batch_flow([x_data, y_data], ws, 1)
+        bar = batch_flow([x_data, y_data], ws, 1, add_end=False)
         t = 0
         for x, xl, y, yl in bar:
             pred = model_pred.predict(