From d40185f781387172cd4f3ad5d98976e2f78f136b Mon Sep 17 00:00:00 2001 From: nguyenvo09 Date: Sun, 17 Jan 2021 23:52:36 -0500 Subject: [PATCH] refactor code --- .../FittingFC/char_man_fitter_query_repr1.py | 1 - Fitting/densebaseline_fit.py | 2 - __init__.py | 1 + handlers/mz_sampler.py | 156 ++++++++++ handlers/output_handler_FC.py | 79 +++++ handlers/tensorboard_writer.py | 17 + handlers/tensorboard_writer_class.py | 17 + interactions.py | 291 ++++++++++++++++++ matchzoo/preprocessors/__init__.py | 9 - matchzoo/preprocessors/basic_preprocessor.py | 1 - matchzoo/preprocessors/bow_preprocessor.py | 1 - matchzoo/preprocessors/cdssm_preprocessor.py | 125 -------- .../char_man_elmo_preprocessor.py | 269 ---------------- .../preprocessors/char_man_preprocessor.py | 1 - .../preprocessors/char_ngram_preprocessor.py | 95 ------ .../preprocessors/declare_preprocessor.py | 202 ------------ matchzoo/preprocessors/dssm_preprocessor.py | 124 -------- .../preprocessors/elmo_basic_preprocessor.py | 168 ---------- .../fact_checking_elmo_preprocessor.py | 173 ----------- .../mz_pretrained_preprocessor.py | 250 --------------- matchzoo/preprocessors/naive_preprocessor.py | 61 ---- matchzoo/preprocessors/tfidf_preprocessor.py | 41 --- setting_keywords.py | 78 +++++ 23 files changed, 639 insertions(+), 1523 deletions(-) create mode 100644 __init__.py create mode 100644 handlers/mz_sampler.py create mode 100644 handlers/output_handler_FC.py create mode 100644 handlers/tensorboard_writer.py create mode 100644 handlers/tensorboard_writer_class.py create mode 100644 interactions.py delete mode 100644 matchzoo/preprocessors/cdssm_preprocessor.py delete mode 100644 matchzoo/preprocessors/char_man_elmo_preprocessor.py delete mode 100644 matchzoo/preprocessors/char_ngram_preprocessor.py delete mode 100644 matchzoo/preprocessors/declare_preprocessor.py delete mode 100644 matchzoo/preprocessors/dssm_preprocessor.py delete mode 100644 matchzoo/preprocessors/elmo_basic_preprocessor.py delete mode 100644 matchzoo/preprocessors/fact_checking_elmo_preprocessor.py delete mode 100644 matchzoo/preprocessors/mz_pretrained_preprocessor.py delete mode 100644 matchzoo/preprocessors/naive_preprocessor.py delete mode 100644 matchzoo/preprocessors/tfidf_preprocessor.py create mode 100644 setting_keywords.py diff --git a/Fitting/FittingFC/char_man_fitter_query_repr1.py b/Fitting/FittingFC/char_man_fitter_query_repr1.py index d019c68..78961d4 100644 --- a/Fitting/FittingFC/char_man_fitter_query_repr1.py +++ b/Fitting/FittingFC/char_man_fitter_query_repr1.py @@ -5,7 +5,6 @@ import torch_utils as my_utils import time import interactions -from handlers.output_handler import FileHandler from handlers.tensorboard_writer import TensorboardWrapper from setting_keywords import KeyWordSettings from Fitting.FittingFC.multi_level_attention_composite_fitter import MultiLevelAttentionCompositeFitter diff --git a/Fitting/densebaseline_fit.py b/Fitting/densebaseline_fit.py index 7cbabbd..ce0fc42 100644 --- a/Fitting/densebaseline_fit.py +++ b/Fitting/densebaseline_fit.py @@ -14,9 +14,7 @@ import json import matchzoo import interactions -from handlers.output_handler import FileHandler from handlers.tensorboard_writer import TensorboardWrapper -from matchzoo.preprocessors.tfidf_preprocessor import TFIDF from setting_keywords import KeyWordSettings from matchzoo.metrics import average_precision, discounted_cumulative_gain, \ mean_average_precision, mean_reciprocal_rank, normalized_discounted_cumulative_gain, precision diff --git a/__init__.py 
b/__init__.py
new file mode 100644
index 0000000..e8cc56f
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+__version__ = 'v0.1.5'
diff --git a/handlers/mz_sampler.py b/handlers/mz_sampler.py
new file mode 100644
index 0000000..292dc7c
--- /dev/null
+++ b/handlers/mz_sampler.py
@@ -0,0 +1,156 @@
+"""
+Module containing functions for negative item sampling.
+"""
+
+import numpy as np
+from scipy.sparse import csr_matrix
+import torch_utils
+import time
+import interactions
+
+
+class Sampler(object):
+    def __init__(self):
+        super(Sampler, self).__init__()
+
+    def get_train_instances(self, interactions: interactions.MatchInteraction, num_negatives: int):
+        """
+        Sample negative documents for each query. The candidate set of a query is
+        the set of documents paired with that query with label 0 in the data pack
+        relation.
+
+        Parameters
+        ----------
+        interactions: :class:`interactions.MatchInteraction`
+            training instances, used for generating candidates. Note that because the
+            data comes from a MatchZoo data pack, negative cases are already present
+            in the left-right relations as well.
+        num_negatives: int
+            total number of negatives to sample for each query
+        """
+
+        query_ids = interactions.pos_queries.astype(np.int64)  # may not be unique
+        query_contents = interactions.np_query_contents.astype(np.int64)
+        query_lengths = interactions.np_query_lengths.astype(np.int64)
+
+        doc_ids = interactions.pos_docs.astype(np.int64)
+        doc_contents = interactions.np_doc_contents.astype(np.int64)
+        doc_lengths = interactions.np_doc_lengths.astype(np.int64)
+
+        negative_samples = np.zeros((query_ids.shape[0], num_negatives, interactions.padded_doc_length), np.int64)
+        negative_samples_lens = np.zeros((query_ids.shape[0], num_negatives), np.int64)
+        negative_docs_ids = np.zeros((query_ids.shape[0], num_negatives), np.int64)
+        self._candidate = interactions.negatives
+
+        for i, u in enumerate(query_ids):
+            for j in range(num_negatives):
+                x = self._candidate[u]
+                neg_item = x[np.random.randint(len(x))]  # int
+                # print("Neg_item: ", neg_item)
+                neg_item_content = interactions.dict_doc_contents[neg_item]  # np.array
+                negative_samples[i, j] = neg_item_content
+                negative_samples_lens[i, j] = interactions.dict_doc_lengths[neg_item]
+                negative_docs_ids[i, j] = neg_item
+                # if u <= 0:
+                #     print("Negative samples: ", negative_samples[i])
+        # print(negative_samples)
+        return query_ids, query_contents, query_lengths, \
+               doc_ids, doc_contents, doc_lengths, \
+               negative_docs_ids, negative_samples, negative_samples_lens
+
+    def get_train_instances_declare(self, interactions: interactions.ClassificationInteractions,
+                                    fixed_num_evidences: int):
+        """
+        Return all (claim, evidence) pairs together with their source ids and pair labels.
+
+        Parameters
+        ----------
+        interactions: :class:`interactions.ClassificationInteractions`
+            training instances
+        fixed_num_evidences: `int`
+            fixed number of evidences for each claim
+        """
+        claim_sources = np.array([interactions.dict_claim_source[e] for e in interactions.claims])
+        evidence_sources = np.array([interactions.dict_evd_source[e] for e in interactions.evidences])
+        return interactions.claims, interactions.claims_contents, interactions.claims_lens, claim_sources, \
+               interactions.evidences, interactions.evd_contents, interactions.evd_lens, evidence_sources, \
+               interactions.pair_labels
+
+    def get_train_instances_hanfc(self, interactions: interactions.ClassificationInteractions,
+                                  fixed_num_evidences: int):
+        """
+        For each query/claim, gather its evidences (up to `fixed_num_evidences` of them).
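+        Evidence slots beyond a claim's real evidences are padding: their doc ids and
+        sources are set to -1, their contents to pad tokens (zeros) and their lengths
+        to 0; `evd_cnt_each_query` records how many real evidences each claim has.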
+ Parameters + ---------- + interactions: :class:`interactions.ClassificationInteractions` + training instances, + fixed_num_evidences: `int` + fixed number of evidences for each claim + """ + + query_ids = interactions.claims.astype(np.int64) # must be all unique + query_labels = interactions.claims_labels + query_contents = interactions.np_query_contents.astype(np.int64) + query_lengths = interactions.np_query_lengths.astype(np.int64) + query_sources = np.array([interactions.dict_claim_source[q] for q in query_ids]) + + evd_docs_ids = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) - 1 # all indices are -1 + # by default it is all pad tokens + evd_docs_contents = np.zeros((query_ids.shape[0], fixed_num_evidences, interactions.padded_doc_length), np.int64) + evd_docs_lens = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) + evd_sources = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) - 1 # for padding sources are -1 + evd_cnt_each_query = np.zeros((query_ids.shape[0]), np.int64) + + for i, u in enumerate(query_ids): + evidences_info = interactions.dict_claims_and_evidences_test[u] # use u not i + assert len(evidences_info) <= fixed_num_evidences + evd_cnt_each_query[i] = len(evidences_info[0]) # number of real evidences for the query i + # we have a list of evidences, now I need to take the content and doc_id + for idx, (doc_id, doc_label, doc_content, doc_len) in enumerate(zip(*evidences_info)): + evd_docs_contents[i][idx] = doc_content # we already pad the content array with zeros due to init + evd_docs_lens[i][idx] = doc_len # we set 0 length for padding evidences + evd_docs_ids[i][idx] = doc_id # we set -1 as index for padding evidences + evd_sources[i][idx] = interactions.dict_evd_source[doc_id][0] # -1 since we have an array size 1 + + return query_ids, query_contents, query_lengths, query_sources, \ + evd_docs_ids, evd_docs_contents, evd_docs_lens, evd_sources, evd_cnt_each_query, query_labels + + def get_train_instances_char_man(self, interactions: interactions.ClassificationInteractions, + fixed_num_evidences: int): + """ + For each query/claim, we get its x number of evidences. 
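+        Same as `get_train_instances_hanfc`, but additionally returns character-level
+        source representations for the claim (`query_char_source`) and for each
+        evidence (`evd_docs_char_source_contents`).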
+ Parameters + ---------- + interactions: :class:`interactions.ClassificationInteractions` + training instances, + fixed_num_evidences: `int` + fixed number of evidences for each claim + """ + + query_ids = interactions.claims.astype(np.int64) # must be all unique + query_labels = interactions.claims_labels + query_contents = interactions.np_query_contents.astype(np.int64) + query_lengths = interactions.np_query_lengths.astype(np.int64) + query_char_source = interactions.np_query_char_source.astype(np.int64) + query_sources = np.array([interactions.dict_claim_source[q] for q in query_ids]) + + evd_docs_ids = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) - 1 # all indices are -1 + # by default it is all pad tokens + evd_docs_contents = np.zeros((query_ids.shape[0], fixed_num_evidences, interactions.padded_doc_length), np.int64) + evd_docs_lens = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) + evd_sources = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) - 1 # for padding sources are -1 + evd_cnt_each_query = np.zeros((query_ids.shape[0]), np.int64) + evd_docs_char_source_contents = np.zeros((query_ids.shape[0], fixed_num_evidences, + interactions.padded_doc_char_source_length), np.int64) + + for i, u in enumerate(query_ids): + evidences_info = interactions.dict_claims_and_evidences_test[u] # use u not i + assert len(evidences_info) <= fixed_num_evidences + evd_cnt_each_query[i] = len(evidences_info[0]) # number of real evidences for the query i + # we have a list of evidences, now I need to take the content and doc_id + for idx, (doc_id, doc_label, doc_content, doc_len) in enumerate(zip(*evidences_info)): + evd_docs_contents[i][idx] = doc_content # we already pad the content array with zeros due to init + evd_docs_lens[i][idx] = doc_len # we set 0 length for padding evidences + evd_docs_ids[i][idx] = doc_id # we set -1 as index for padding evidences + evd_sources[i][idx] = interactions.dict_evd_source[doc_id][0] # -1 since we have an array size 1 + evd_docs_char_source_contents[i][idx] = interactions.dict_char_right_src[doc_id] + + return query_ids, query_contents, query_lengths, query_sources, query_char_source, \ + evd_docs_ids, evd_docs_contents, evd_docs_lens, evd_sources, evd_cnt_each_query, \ + evd_docs_char_source_contents, query_labels diff --git a/handlers/output_handler_FC.py b/handlers/output_handler_FC.py new file mode 100644 index 0000000..090e072 --- /dev/null +++ b/handlers/output_handler_FC.py @@ -0,0 +1,79 @@ +import sys + +class FileHandlerFC(object): + # mylogfile = None + # mylogfile_details = None + # error_analysis_log_validation = None + # error_analysis_log_testing = None + # error_analysis_log_test2 = None + # error_analysis_log_test3 = None + + def __init__(self): + pass + + # @classmethod + def init_log_files(self, log_file): + if log_file != None: + self.mylogfile = open(log_file, "w") + self.mylogfile_details = open(log_file + "_best_details.json", "w") + self.error_analysis_log_validation = open(log_file + "_error_analysis_validation.json", "w") + self.error_analysis_log_testing = open(log_file + "_error_analysis_testing.json", "w") + self.error_analysis_log_test2 = open(log_file + "_error_analysis_test2.json", "w") + self.error_analysis_log_test3 = open(log_file + "_error_analysis_test3.json", "w") + + # @classmethod + def myprint(self, message): + assert self.mylogfile != None, "The LogFile is not initialized yet!" 
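+        # Echo the message to stdout and append it to the main log file.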
+ print(message) + sys.stdout.flush() + if self.mylogfile != None: + print(message, file = self.mylogfile) + self.mylogfile.flush() + + # @classmethod + def myprint_details(self, message): + assert self.mylogfile_details != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.mylogfile_details != None: + print(message, file = self.mylogfile_details) + self.mylogfile_details.flush() + + # @classmethod + def save_error_analysis_validation(self, message: str): + assert self.error_analysis_log_validation != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.error_analysis_log_validation != None: + print(message, file = self.error_analysis_log_validation) + self.error_analysis_log_validation.flush() + + # @classmethod + def save_error_analysis_testing(self, message: str): + assert self.error_analysis_log_testing != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.error_analysis_log_testing != None: + print(message, file = self.error_analysis_log_testing) + self.error_analysis_log_testing.flush() + + # @classmethod + def save_error_analysis_test2(self, message: str): + assert self.error_analysis_log_test2 != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.error_analysis_log_test2 != None: + print(message, file=self.error_analysis_log_test2) + self.error_analysis_log_test2.flush() + + # @classmethod + def save_error_analysis_test3(self, message: str): + assert self.error_analysis_log_test3 != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.error_analysis_log_test3 != None: + print(message, file=self.error_analysis_log_test3) + self.error_analysis_log_test3.flush() + + def close(self): + self.mylogfile.close() + self.mylogfile_details.close() + self.error_analysis_log_validation.close() + self.error_analysis_log_testing.close() + self.error_analysis_log_test2.close() + self.error_analysis_log_test3.close() \ No newline at end of file diff --git a/handlers/tensorboard_writer.py b/handlers/tensorboard_writer.py new file mode 100644 index 0000000..51e7a2c --- /dev/null +++ b/handlers/tensorboard_writer.py @@ -0,0 +1,17 @@ +from tensorboardX import SummaryWriter + +class TensorboardWrapper(): + my_tensorboard_writer = None + + def __init__(self): + pass + + @classmethod + def init_log_files(cls, log_file): + if log_file != None: + cls.my_tensorboard_writer = SummaryWriter(log_file) + + @classmethod + def mywriter(cls): + assert cls.my_tensorboard_writer != None, "The LogFile is not initialized yet!" + return cls.my_tensorboard_writer \ No newline at end of file diff --git a/handlers/tensorboard_writer_class.py b/handlers/tensorboard_writer_class.py new file mode 100644 index 0000000..d8f4154 --- /dev/null +++ b/handlers/tensorboard_writer_class.py @@ -0,0 +1,17 @@ +from tensorboardX import SummaryWriter + +class TensorboardWrapperClass(object): + # my_tensorboard_writer = None + + def __init__(self): + pass + + # @classmethod + def init_log_files(self, log_file): + if log_file != None: + self.my_tensorboard_writer = SummaryWriter(log_file) + + # @classmethod + def mywriter(self): + assert self.my_tensorboard_writer != None, "The LogFile is not initialized yet!" 
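+        # Return the shared tensorboardX SummaryWriter created by init_log_files().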
+        return self.my_tensorboard_writer
\ No newline at end of file
diff --git a/interactions.py b/interactions.py
new file mode 100644
index 0000000..c9016ca
--- /dev/null
+++ b/interactions.py
@@ -0,0 +1,291 @@
+import numpy as np
+import pandas as pd
+import matchzoo
+import collections
+from setting_keywords import KeyWordSettings
+from handlers.output_handler import FileHandler
+
+
+class MatchInteraction(object):
+    """
+    Interactions object for matching models. It converts a preprocessed
+    :class:`matchzoo.DataPack` into plain dictionaries and numpy arrays so that
+    the data pack itself is no longer needed during training.
+
+    Relation rows with label 1 become the positive (query, doc) pairs; for each
+    query, the documents that appear with label 0 are kept as its pool of
+    negative candidates to sample from.
+
+    Parameters
+    ----------
+    data_pack: :class:`matchzoo.DataPack`
+        the preprocessed data pack to convert.
+
+    Attributes
+    ----------
+    unique_query_ids: array of np.int64
+        ids of the queries (left texts); guaranteed to be unique.
+    dict_query_contents: dict
+        padded, indexed token sequences of the queries, keyed by query id.
+    dict_query_lengths: dict
+        original (unpadded) lengths of the queries, keyed by query id.
+    unique_doc_ids: array of np.int64
+        ids of the documents (right texts); guaranteed to be unique.
+    dict_doc_contents: dict
+        padded, indexed token sequences of the documents, keyed by doc id.
+    dict_doc_lengths: dict
+        original (unpadded) lengths of the documents, keyed by doc id.
+    pos_queries: np.ndarray of np.int64
+        query ids of the positive (query, doc) pairs.
+    pos_docs: np.ndarray of np.int64
+        doc ids of the positive (query, doc) pairs.
+    negatives: dict
+        maps each query id to the list of its negative doc ids.
+    """
+
+    def __init__(self, data_pack: matchzoo.DataPack, **kargs):
+        # Note that these indices do not start from 0.
+        FileHandler.myprint("Converting DataFrame to Normal Dictionary of Data")
+        self.unique_query_ids, \
+        self.dict_query_contents, \
+        self.dict_query_lengths, \
+        self.dict_query_raw_contents, \
+        self.dict_query_positions = self.convert_leftright(data_pack.left, text_key = "text_left",
+                                                           length_text_key = "length_left",
+                                                           raw_text_key = "raw_text_left")
+        self.data_pack = data_pack
+        assert len(self.unique_query_ids) == len(set(self.unique_query_ids)), "Must be unique ids"
+        """ Note: ids are kept in the order produced by convert_leftright; no sorting is applied here.
""" + + self.unique_doc_ids, \ + self.dict_doc_contents, \ + self.dict_doc_lengths, \ + self.dict_doc_raw_contents, \ + self.dict_doc_positions = self.convert_leftright(data_pack.right, text_key = "text_right", + length_text_key = "length_right", + raw_text_key = "raw_text_right") + + assert len(self.unique_doc_ids) == len(set(self.unique_doc_ids)), "Must be unique ids for doc ids" + assert len(self.unique_query_ids) != len(self.unique_doc_ids), "Impossible to have equal number of docs and number of original tweets" + + self.pos_queries, \ + self.pos_docs, \ + self.negatives, \ + self.unique_queries_test = self.convert_relations(data_pack.relation) + + # for queries, padded + self.np_query_contents = np.array([self.dict_query_contents[q] for q in self.pos_queries]) + self.np_query_lengths = np.array([self.dict_query_lengths[q] for q in self.pos_queries]) + self.query_positions = np.array([self.dict_query_positions[q] for q in self.pos_queries]) + + # for docs, padded + self.np_doc_contents = np.array([self.dict_doc_contents[d] for d in self.pos_docs]) + self.np_doc_lengths = np.array([self.dict_doc_lengths[d] for d in self.pos_docs]) + self.doc_positions = np.array([self.dict_doc_positions[d] for d in self.pos_docs]) + + assert self.np_query_lengths.shape == self.np_doc_lengths.shape + self.padded_doc_length = len(self.np_doc_contents[0]) + self.padded_query_length = len(self.np_query_contents[0]) + + def convert_leftright(self, part: pd.DataFrame, text_key: str, length_text_key: str, raw_text_key: str, **kargs): + """ Converting the dataframe of interactions """ + ids, contents_dict, lengths_dict, position_dict = [], {}, {}, {} + raw_content_dict = {} + # Why don't we use the queryID as the key for dictionary???? + FileHandler.myprint("[NOTICE] MatchZoo use queryID and docID as index in dataframe left and right, " + "therefore, iterrows will return index which is left_id or right_id") + for index, row in part.iterrows(): # very dangerous, be careful because it may change order!!! + ids.append(index) + text_ = row[text_key] # text_ here is converted to numbers and padded + raw_content_dict[index] = row[raw_text_key] + + if length_text_key not in row: length_ = len(text_) + else: length_ = row[length_text_key] + assert length_ != 0 + assert index not in contents_dict + contents_dict[index] = text_ + lengths_dict[index] = length_ + position_dict[index] = np.pad(np.arange(length_) + 1, (0, len(text_) - length_), 'constant') + + return np.array(ids), contents_dict, lengths_dict, raw_content_dict, position_dict + + def convert_relations(self, relation: pd.DataFrame): + """ Convert relations. + We want to retrieve positive interactions and negative interactions. Particularly, + for every pair (query, doc) = 1, we get a list of negatives of the query q + + It is possible that a query may have multiple positive docs. Therefore, negatives[q] + may vary the lengths but not too much. 
+ """ + queries, docs, negatives = [], [], collections.defaultdict(list) + unique_queries = collections.defaultdict(list) + + for index, row in relation.iterrows(): + query = row["id_left"] + doc = row["id_right"] + label = row["label"] + assert label == 0 or label == 1 + unique_queries[query] = unique_queries.get(query, [[], [], [], []]) # doc, label, content, length + a, b, c, d = unique_queries[query] + a.append(doc) + b.append(label) + c.append(self.dict_doc_contents[doc]) + d.append(self.dict_doc_lengths[doc]) + + if label == 1: + queries.append(query) + docs.append(doc) + elif label == 0: + negatives[query].append(doc) + assert len(queries) == len(docs) + return np.array(queries), np.array(docs), negatives, unique_queries + + def __repr__(self): + + return ('' + .format( + num_users = self.num_users, + num_items = self.num_items, + num_interactions = len(self) + )) + + def _check(self): + pass + + +class BaseClassificationInteractions(object): + """ Base classification interactions for fact-checking with evidences """ + + def __init__(self, data_pack: matchzoo.DataPack, **kargs): + # FileHandler.myprint("Converting DataFrame to Normal Dictionary of Data") + self.output_handler = kargs[KeyWordSettings.OutputHandlerFactChecking] + self.output_handler.myprint("Converting DataFrame to Normal Dictionary of Data") + additional_field = {KeyWordSettings.FCClass.CharSourceKey: "char_claim_source"} + self.unique_query_ids, \ + self.dict_claim_contents, \ + self.dict_claim_lengths, \ + self.dict_query_raw_contents, \ + self.dict_query_positions, \ + self.dict_claim_source, \ + self.dict_raw_claim_source, \ + self.dict_char_left_src = self.convert_leftright(data_pack.left, text_key="text_left", + length_text_key="length_left", raw_text_key="raw_text_left", + source_key="claim_source", raw_source_key="raw_claim_source", **additional_field) + self.data_pack = data_pack + assert len(self.unique_query_ids) == len(set(self.unique_query_ids)), "Must be unique ids" + """ Why do I need to sort it? I have no idea why did I do it? """ + additional_field = {KeyWordSettings.FCClass.CharSourceKey: "char_evidence_source"} + self.unique_doc_ids, \ + self.dict_doc_contents, \ + self.dict_doc_lengths, \ + self.dict_doc_raw_contents, \ + self.dict_doc_positions, \ + self.dict_evd_source,\ + self.dict_raw_evd_source, \ + self.dict_char_right_src = self.convert_leftright(data_pack.right, text_key="text_right", + length_text_key="length_right", + raw_text_key="raw_text_right", source_key="evidence_source", + raw_source_key="raw_evidence_source", **additional_field) + + assert len(self.unique_doc_ids) == len(set(self.unique_doc_ids)), "Must be unique ids for doc ids" + assert len(self.unique_query_ids) != len( + self.unique_doc_ids), "Impossible to have equal number of docs and number of original tweets" + + def convert_leftright(self, part: pd.DataFrame, text_key: str, length_text_key: str, raw_text_key: str, + source_key: str, raw_source_key: str, **kargs): + """ Converting the dataframe of interactions """ + ids, contents_dict, lengths_dict, position_dict = [], {}, {}, {} + raw_content_dict, sources, raw_sources, char_sources = {}, {}, {}, {} + char_source_key = kargs[KeyWordSettings.FCClass.CharSourceKey] + # Why don't we use the queryID as the key for dictionary???? 
+ self.output_handler.myprint("[NOTICE] MatchZoo use queryID and docID as index in dataframe left and right, " + "therefore, iterrows will return index which is left_id or right_id") + for index, row in part.iterrows(): # very dangerous, be careful because it may change order!!! + ids.append(index) + text_ = row[text_key] # text_ here is converted to numbers and padded + raw_content_dict[index] = row[raw_text_key] + + if length_text_key not in row: length_ = len(text_) + else: length_ = row[length_text_key] + assert length_ != 0 + assert index not in contents_dict + contents_dict[index] = text_ + lengths_dict[index] = length_ + position_dict[index] = np.pad(np.arange(length_) + 1, (0, len(text_) - length_), 'constant') + sources[index] = row[source_key] + raw_sources[index] = row[raw_source_key] + char_sources[index] = row[char_source_key] + + return np.array(ids), contents_dict, lengths_dict, raw_content_dict, \ + position_dict, sources, raw_sources, char_sources + + def convert_relations(self, relation: pd.DataFrame): pass + + +class ClassificationInteractions(BaseClassificationInteractions): + """ + This class is for classification based on evidences. + Query - [list of evidences] -> labels + """ + + def __init__(self, data_pack: matchzoo.DataPack, **kargs): + super(ClassificationInteractions, self).__init__(data_pack, **kargs) + + # (1) unique claims, (2) labels for each claim and (3) info of each claim + self.claims, self.claims_labels, self.dict_claims_and_evidences_test = \ + self.convert_relations(data_pack.relation) + + # for queries, padded + self.np_query_contents = np.array([self.dict_claim_contents[q] for q in self.claims]) + self.np_query_lengths = np.array([self.dict_claim_lengths[q] for q in self.claims]) + self.np_query_char_source = np.array([self.dict_char_left_src[q] for q in self.claims]) + self.query_positions = np.array([self.dict_query_positions[q] for q in self.claims]) + + # assert self.np_query_lengths.shape == self.np_doc_lengths.shape + self.padded_doc_length = len(self.dict_doc_contents[self.unique_doc_ids[0]]) + self.padded_doc_char_source_length = len(self.dict_char_right_src[self.unique_doc_ids[0]]) + # self.padded_query_length = len(self.np_query_contents[0]) + + def convert_relations(self, relation: pd.DataFrame): + """ Convert relations. + We want to retrieve positive interactions and negative interactions. Particularly, + for every pair (query, doc) = 1, we get a list of negatives of the query q + + It is possible that a query may have multiple positive docs. Therefore, negatives[q] + may vary the lengths but not too much. 
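+        Returns the unique claim ids, one label per claim, and a dict mapping each
+        claim id to the (doc ids, labels, contents, lengths) of its evidences.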
+ """ + queries = [] # , collections.defaultdict(list) + queries_labels = [] + unique_queries = collections.defaultdict(list) + set_queries = set() + + for index, row in relation.iterrows(): + query = row["id_left"] + doc = row["id_right"] + label = row["label"] + # assert label == 0 or label == 1 + unique_queries[query] = unique_queries.get(query, [[], [], [], []]) # doc, label, content, length + a, b, c, d = unique_queries[query] + a.append(doc) + b.append(label) + c.append(self.dict_doc_contents[doc]) + d.append(self.dict_doc_lengths[doc]) + + if query not in set_queries: + queries.append(query) # same as unique_queries + queries_labels.append(label) + set_queries.add(query) + + assert len(queries) == len(unique_queries) + return np.array(queries), np.array(queries_labels), unique_queries diff --git a/matchzoo/preprocessors/__init__.py b/matchzoo/preprocessors/__init__.py index 9ca462e..98340c3 100644 --- a/matchzoo/preprocessors/__init__.py +++ b/matchzoo/preprocessors/__init__.py @@ -1,16 +1,7 @@ from . import units -from .dssm_preprocessor import DSSMPreprocessor -from .naive_preprocessor import NaivePreprocessor from .basic_preprocessor import BasicPreprocessor -from .cdssm_preprocessor import CDSSMPreprocessor -from .mz_pretrained_preprocessor import PreTrainedModelsProcessor -from .char_ngram_preprocessor import CharNGramPreprocessor -from .elmo_basic_preprocessor import ElmoPreprocessor from .bow_preprocessor import BoWPreprocessor -from .declare_preprocessor import DeClarePreprocessor -from .fact_checking_elmo_preprocessor import FactCheckingElmoPreprocessor from .char_man_preprocessor import CharManPreprocessor -from .char_man_elmo_preprocessor import CharManElmoPreprocessor def list_available() -> list: diff --git a/matchzoo/preprocessors/basic_preprocessor.py b/matchzoo/preprocessors/basic_preprocessor.py index d72df17..7a042aa 100644 --- a/matchzoo/preprocessors/basic_preprocessor.py +++ b/matchzoo/preprocessors/basic_preprocessor.py @@ -8,7 +8,6 @@ from .build_vocab_unit import build_vocab_unit from .build_unit_from_data_pack import build_unit_from_data_pack from .chain_transform import chain_transform -from handlers.output_handler import FileHandler tqdm.pandas() diff --git a/matchzoo/preprocessors/bow_preprocessor.py b/matchzoo/preprocessors/bow_preprocessor.py index eb33559..4c15c04 100644 --- a/matchzoo/preprocessors/bow_preprocessor.py +++ b/matchzoo/preprocessors/bow_preprocessor.py @@ -8,7 +8,6 @@ from .build_vocab_unit import build_vocab_unit from .build_unit_from_data_pack import build_unit_from_data_pack from .chain_transform import chain_transform -from handlers.output_handler import FileHandler from typing import List import torch import itertools, os diff --git a/matchzoo/preprocessors/cdssm_preprocessor.py b/matchzoo/preprocessors/cdssm_preprocessor.py deleted file mode 100644 index edeac4e..0000000 --- a/matchzoo/preprocessors/cdssm_preprocessor.py +++ /dev/null @@ -1,125 +0,0 @@ -"""CDSSM Preprocessor.""" - -from tqdm import tqdm - -from . import units -from .chain_transform import chain_transform -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit - -tqdm.pandas() - - -class CDSSMPreprocessor(BasePreprocessor): - """CDSSM Model preprocessor.""" - - def __init__(self, - fixed_length_left: int = 10, - fixed_length_right: int = 40, - with_word_hashing: bool = True): - """ - CDSSM Model preprocessor. - - The word hashing step could eats up a lot of memory. 
To workaround - this problem, set `with_word_hashing` to `False` and use a - :class:`matchzoo.DynamicDataGenerator` with a - :class:`matchzoo.preprocessor.units.WordHashing`. - - TODO: doc here. - - :param with_word_hashing: Include a word hashing step if `True`. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data() - >>> test_data = mz.datasets.toy.load_data(stage='test') - >>> cdssm_preprocessor = mz.preprocessors.CDSSMPreprocessor() - >>> train_data_processed = cdssm_preprocessor.fit_transform( - ... train_data, verbose=0 - ... ) - >>> type(train_data_processed) - - >>> test_data_transformed = cdssm_preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_value='0', pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_value='0', pad_mode='post' - ) - self._with_word_hashing = with_word_hashing - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param verbose: Verbosity. - :param data_pack: Data_pack to be preprocessed. - :return: class:`CDSSMPreprocessor` instance. - """ - fit_units = self._default_units() + [units.NgramLetter()] - func = chain_transform(fit_units) - data_pack = data_pack.apply_on_text(func, verbose=verbose) - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - - self._context['vocab_unit'] = vocab_unit - vocab_size = len(vocab_unit.state['term_index']) + 1 - self._context['input_shapes'] = [ - (self._fixed_length_left, vocab_size), - (self._fixed_length_right, vocab_size) - ] - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create `letter-ngram` representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. 
- """ - data_pack = data_pack.copy() - func = chain_transform(self._default_units()) - data_pack.apply_on_text(func, inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - post_units = [units.NgramLetter(reduce_dim=False)] - if self._with_word_hashing: - term_index = self._context['vocab_unit'].state['term_index'] - post_units.append(units.WordHashing(term_index)) - data_pack.apply_on_text(chain_transform(post_units), - inplace=True, verbose=verbose) - return data_pack - - @classmethod - def _default_units(cls) -> list: - """Prepare needed process units.""" - return [ - units.Tokenize(), - units.Lowercase(), - units.PuncRemoval(), - units.StopRemoval(), - ] - - @property - def with_word_hashing(self): - """`with_word_hashing` getter.""" - return self._with_word_hashing - - @with_word_hashing.setter - def with_word_hashing(self, value): - """`with_word_hashing` setter.""" - self._with_word_hashing = value diff --git a/matchzoo/preprocessors/char_man_elmo_preprocessor.py b/matchzoo/preprocessors/char_man_elmo_preprocessor.py deleted file mode 100644 index ec4fda0..0000000 --- a/matchzoo/preprocessors/char_man_elmo_preprocessor.py +++ /dev/null @@ -1,269 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor -from matchzoo.preprocessors.units import Unit -from .units import Vocabulary -tqdm.pandas() - - -class CharManElmoPreprocessor(BasicPreprocessor): - """ - Baisc preprocessor helper for fact-checking with external evidences for my - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... ) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... 
verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - fixed_length_left_src: int = 30, - fixed_length_right_src: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._fixed_length_left_src = fixed_length_left_src - self._fixed_length_right_src = fixed_length_right_src - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - # for padding character level of left_source and right_source - self._left_char_src_fixedlength_unit = units.FixedLength(self._fixed_length_left_src, pad_mode='post') - self._right_char_src_fixedlength_unit = units.FixedLength(self._fixed_length_right_src, pad_mode='post') - - self.char_unit = units.ngram_letter.NgramLetter(ngram=1, reduce_dim=True) - self._units = [SplitTokenize()] - if remove_stop_words: - self._units.append(units.stop_removal.StopRemoval()) - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. - """ - data_pack = data_pack.apply_on_text(chain_transform(self._units), verbose=verbose) - # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, - # data_pack, - # flatten=False, - # mode='right', - # verbose=verbose) - # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, - # mode='right', verbose=verbose) - # self._context['filter_unit'] = fitted_filter_unit - - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - - vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(self._fixed_length_left,), - (self._fixed_length_right,)] - - claim_source_unit = build_entity_unit(column="claim_source", data_pack=data_pack, mode="left") - article_source_unit = build_entity_unit(column="evidence_source", data_pack=data_pack, mode="right") - self._context['claim_source_unit'] = claim_source_unit - self._context['article_source_unit'] = article_source_unit - - char_source_unit = build_ngram_unit(left_column="claim_source", right_column="evidence_source", - data_pack=data_pack, mode="both") - self._context['char_source_unit'] = char_source_unit - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. 
- """ - data_pack = data_pack.copy() - - def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) - - def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) - - def map_src2char(entity: str): - return self._context['char_source_unit'].transform(list(entity)) - - data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) - data_pack.left["char_claim_source"] = data_pack.left["char_claim_source"].progress_apply(map_src2char) - data_pack.left["char_claim_source"] = data_pack.left["char_claim_source"].progress_apply( - self._left_char_src_fixedlength_unit.transform) - - data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) - data_pack.right["char_evidence_source"] = data_pack.right["char_evidence_source"].progress_apply(map_src2char) - data_pack.right["char_evidence_source"] = data_pack.right["char_evidence_source"].progress_apply( - self._right_char_src_fixedlength_unit.transform) - - data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) - - # data_pack.apply_on_text(self._context['filter_unit'].transform, - # mode='right', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._context['vocab_unit'].transform, - mode='both', inplace=True, verbose=verbose) - data_pack.append_text_length(inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - - max_len_left = self._fixed_length_left - max_len_right = self._fixed_length_right - - data_pack.left['length_left'] = \ - data_pack.left['length_left'].apply( - lambda val: min(val, max_len_left)) - - data_pack.right['length_right'] = \ - data_pack.right['length_right'].apply( - lambda val: min(val, max_len_right)) - - return data_pack - - -class SplitTokenize(Unit): - """Process unit for text tokenization.""" - - def transform(self, input_: str) -> list: - """ - Process input data from raw terms to list of tokens. - - :param input_: raw textual input. - - :return tokens: tokenized tokens as a list. - """ - return input_.split() - - -def build_entity_unit( - column: str, - data_pack: DataPack, - mode: str = 'both', - verbose: int = 1 -) -> Vocabulary: - """ - Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. - - The `data_pack` should be preprocessed forehand, and each item in - `text_left` and `text_right` columns of the `data_pack` should be a list - of tokens. - - :param column: `str` the selected column to build units - :param data_pack: The :class:`DataPack` to build vocabulary upon. - :param mode: One of 'left', 'right', and 'both', to determine the source - data for building the :class:`VocabularyUnit`. - :param verbose: Verbosity. - :return: A built vocabulary unit. - - """ - unit = Vocabulary() - corpus = [] - def func(entity: str): corpus.append(entity.strip()) - assert mode in ["left", "right"] - if mode == "left": - data_pack.left[column].progress_apply(func) - elif mode == "right": - data_pack.right[column].progress_apply(func) - else: - raise NotImplemented("Not coded for both columns") - - if verbose: - description = 'Building Entities ' + unit.__class__.__name__ + ' from a datapack.' 
- corpus = tqdm(corpus, desc=description) - unit.fit(corpus) - return unit - - -def build_ngram_unit(left_column: str, right_column: str, data_pack: DataPack, mode: str = 'both', verbose: int = 1): - """ - Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. - - The `data_pack` should be preprocessed forehand, and each item in - `text_left` and `text_right` columns of the `data_pack` should be a list - of tokens. - - :param column: `str` the selected column to build units - :param data_pack: The :class:`DataPack` to build vocabulary upon. - :param mode: One of 'left', 'right', and 'both', to determine the source - data for building the :class:`VocabularyUnit`. - :param verbose: Verbosity. - :return: A built vocabulary unit. - - """ - unit = Vocabulary() - corpus = [] - - def func(entity: str): - assert type(entity) == str - entity = entity.strip() - for c in entity: corpus.append(c) - - assert mode == "both" - data_pack.left[left_column].progress_apply(func) - data_pack.right[right_column].progress_apply(func) - - if verbose: - description = 'Building Characters ' + unit.__class__.__name__ + ' from a datapack.' - corpus = tqdm(corpus, desc=description) - unit.fit(corpus) - return unit diff --git a/matchzoo/preprocessors/char_man_preprocessor.py b/matchzoo/preprocessors/char_man_preprocessor.py index 779fd69..7bc207a 100644 --- a/matchzoo/preprocessors/char_man_preprocessor.py +++ b/matchzoo/preprocessors/char_man_preprocessor.py @@ -8,7 +8,6 @@ from .build_vocab_unit import build_vocab_unit from .build_unit_from_data_pack import build_unit_from_data_pack from .chain_transform import chain_transform -from handlers.output_handler import FileHandler from .units import Vocabulary from .units import StatefulUnit tqdm.pandas() diff --git a/matchzoo/preprocessors/char_ngram_preprocessor.py b/matchzoo/preprocessors/char_ngram_preprocessor.py deleted file mode 100644 index 22087fc..0000000 --- a/matchzoo/preprocessors/char_ngram_preprocessor.py +++ /dev/null @@ -1,95 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . import units -from matchzoo import DataPack -from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler - -tqdm.pandas() - - -class CharNGramPreprocessor(BasicPreprocessor): - """ - Baisc preprocessor helper. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... 
) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - # super().__init__() - super(BasicPreprocessor, self).__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - self._filter_unit = units.FrequencyFilter( - low=filter_low_freq, - high=filter_high_freq, - mode=filter_mode - ) - self._units = self._default_units() - # if remove_stop_words: - # self._units.append(units.stop_removal.StopRemoval()) - - def _default_units(cls) -> list: - return [ - units.Tokenize(), - units.Lowercase(), - units.PuncRemoval(), - units.StopRemoval(), - units.NgramLetter(), - ] diff --git a/matchzoo/preprocessors/declare_preprocessor.py b/matchzoo/preprocessors/declare_preprocessor.py deleted file mode 100644 index 3bca49f..0000000 --- a/matchzoo/preprocessors/declare_preprocessor.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from .units import Vocabulary -from .units import StatefulUnit -tqdm.pandas() - - -class DeClarePreprocessor(BasePreprocessor): - """ - Declare preprocessor helper which has source embeddings. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... ) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... 
verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - # self._filter_unit = units.FrequencyFilter( - # low=filter_low_freq, - # high=filter_high_freq, - # mode=filter_mode - # ) - self._units = self._default_units() - if remove_stop_words: - self._units.append(units.stop_removal.StopRemoval()) - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. - """ - data_pack = data_pack.apply_on_text(chain_transform(self._units), - verbose=verbose) - # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, - # data_pack, - # flatten=False, - # mode='right', - # verbose=verbose) - # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, - # mode='right', verbose=verbose) - # self._context['filter_unit'] = fitted_filter_unit - - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - - vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(self._fixed_length_left,), - (self._fixed_length_right,)] - - claim_source_unit = build_entity_unit(column = "claim_source", data_pack = data_pack, mode = "left") - article_source_unit = build_entity_unit(column = "evidence_source", data_pack = data_pack, mode = "right") - self._context['claim_source_unit'] = claim_source_unit - self._context['article_source_unit'] = article_source_unit - - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. 
- """ - data_pack = data_pack.copy() - - def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) - - def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) - - data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) - data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) - data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) - - # data_pack.apply_on_text(self._context['filter_unit'].transform, - # mode='right', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._context['vocab_unit'].transform, - mode='both', inplace=True, verbose=verbose) - data_pack.append_text_length(inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - - max_len_left = self._fixed_length_left - max_len_right = self._fixed_length_right - - data_pack.left['length_left'] = \ - data_pack.left['length_left'].apply( - lambda val: min(val, max_len_left)) - - data_pack.right['length_right'] = \ - data_pack.right['length_right'].apply( - lambda val: min(val, max_len_right)) - - return data_pack - - -def build_entity_unit( - column: str, - data_pack: DataPack, - mode: str = 'both', - verbose: int = 1 -) -> Vocabulary: - """ - Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. - - The `data_pack` should be preprocessed forehand, and each item in - `text_left` and `text_right` columns of the `data_pack` should be a list - of tokens. - - :param column: `str` the selected column to build units - :param data_pack: The :class:`DataPack` to build vocabulary upon. - :param mode: One of 'left', 'right', and 'both', to determine the source - data for building the :class:`VocabularyUnit`. - :param verbose: Verbosity. - :return: A built vocabulary unit. - - """ - unit = Vocabulary() - corpus = [] - def func(entity: str): corpus.append(entity.strip()) - assert mode in ["left", "right"] - if mode == "left": - data_pack.left[column].progress_apply(func) - elif mode == "right": - data_pack.right[column].progress_apply(func) - else: - raise NotImplemented("Not coded for both columns") - - if verbose: - description = 'Building ' + unit.__class__.__name__ + ' from a datapack.' - corpus = tqdm(corpus, desc=description) - unit.fit(corpus) - return unit diff --git a/matchzoo/preprocessors/dssm_preprocessor.py b/matchzoo/preprocessors/dssm_preprocessor.py deleted file mode 100644 index fb2ebab..0000000 --- a/matchzoo/preprocessors/dssm_preprocessor.py +++ /dev/null @@ -1,124 +0,0 @@ -"""DSSM Preprocessor.""" - -from tqdm import tqdm - -from matchzoo.data_pack import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .chain_transform import chain_transform -from .build_vocab_unit import build_vocab_unit -from . import units - -tqdm.pandas() - - -class DSSMPreprocessor(BasePreprocessor): - """DSSM Model preprocessor.""" - - def __init__(self, with_word_hashing: bool = True): - """ - DSSM Model preprocessor. - - The word hashing step could eats up a lot of memory. To workaround - this problem, set `with_word_hashing` to `False` and use a - :class:`matchzoo.DynamicDataGenerator` with a - :class:`matchzoo.preprocessor.units.WordHashing`. 
- - :param with_word_hashing: Include a word hashing step if `True`. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data() - >>> test_data = mz.datasets.toy.load_data(stage='test') - >>> dssm_preprocessor = mz.preprocessors.DSSMPreprocessor() - >>> train_data_processed = dssm_preprocessor.fit_transform( - ... train_data, verbose=0 - ... ) - >>> type(train_data_processed) - - >>> test_data_transformed = dssm_preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - super().__init__() - self._with_word_hashing = with_word_hashing - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param verbose: Verbosity. - :param data_pack: data_pack to be preprocessed. - :return: class:`DSSMPreprocessor` instance. - """ - DEBUG = False - if DEBUG: - func2 = chain_transform(self.old_units()) - data_packx = data_pack.apply_on_text(func2, verbose = verbose) - # transform text, after tokenizing, remove stop words and blah blah - vocab_unit2 = build_vocab_unit(data_packx, verbose = verbose) - vocab_size_without_using_letter_ngram = len(vocab_unit2.state['term_index']) + 1 - print("Vocab size without using letter_ngram", vocab_size_without_using_letter_ngram) - - func = chain_transform(self._default_units()) - data_pack = data_pack.apply_on_text(func, verbose=verbose) - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - - self._context['vocab_unit'] = vocab_unit - vocab_size = len(vocab_unit.state['term_index']) + 1 - if DEBUG: - print("Vocab size using letter_ngram", vocab_size) - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(vocab_size,), (vocab_size,)] - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create `tri-letter` representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. - """ - data_pack = data_pack.copy() - units_ = self._default_units() - assert len(units_) == 5, "Must have 5 pre-processing step in DSSM " - if self._with_word_hashing: - term_index = self._context['vocab_unit'].state['term_index'] - units_.append(units.WordHashing(term_index)) - func = chain_transform(units_) - data_pack.apply_on_text(func, inplace=True, verbose=verbose) - return data_pack - - @classmethod - def _default_units(cls) -> list: - """Prepare needed process units.""" - return [ - units.Tokenize(), - units.Lowercase(), - units.PuncRemoval(), - units.StopRemoval(), - units.NgramLetter(), - ] - - @property - def with_word_hashing(self): - """`with_word_hashing` getter.""" - return self._with_word_hashing - - @with_word_hashing.setter - def with_word_hashing(self, value): - """`with_word_hashing` setter.""" - self._with_word_hashing = value - - def old_units(self) -> list: - """Prepare needed process units.""" - return [ - units.Tokenize(), - units.Lowercase(), - units.PuncRemoval(), - units.StopRemoval() - ] \ No newline at end of file diff --git a/matchzoo/preprocessors/elmo_basic_preprocessor.py b/matchzoo/preprocessors/elmo_basic_preprocessor.py deleted file mode 100644 index aae3f01..0000000 --- a/matchzoo/preprocessors/elmo_basic_preprocessor.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . 
import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor -from matchzoo.preprocessors.units import Unit -tqdm.pandas() - - -class ElmoPreprocessor(BasicPreprocessor): - """ - Baisc preprocessor helper. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... ) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - # self._filter_unit = units.FrequencyFilter( - # low=filter_low_freq, - # high=filter_high_freq, - # mode=filter_mode - # ) - self._units = [SplitTokenize()] - # self._char_left = units.AllenCharUnit(self._fixed_length_left) - # self._char_right = units.AllenCharUnit(self._fixed_length_right) - if remove_stop_words: - self._units.append(units.stop_removal.StopRemoval()) - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. 
- """ - data_pack = data_pack.apply_on_text(chain_transform(self._units), verbose=verbose) - # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, - # data_pack, - # flatten=False, - # mode='right', - # verbose=verbose) - # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, - # mode='right', verbose=verbose) - # self._context['filter_unit'] = fitted_filter_unit - - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - - vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(self._fixed_length_left,), - (self._fixed_length_right,)] - - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. - """ - data_pack = data_pack.copy() - data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) - - # data_pack.apply_on_text(self._context['filter_unit'].transform, - # mode='right', inplace=True, verbose=verbose) - # data_pack.apply_on_text(self._char_left.transform, mode='left', inplace=True, verbose=verbose, rename="char_left") - # data_pack.apply_on_text(self._char_right.transform, mode='right', inplace=True, verbose=verbose, rename="char_right") - - data_pack.apply_on_text(self._context['vocab_unit'].transform, - mode='both', inplace=True, verbose=verbose) - data_pack.append_text_length(inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - - max_len_left = self._fixed_length_left - max_len_right = self._fixed_length_right - - data_pack.left['length_left'] = \ - data_pack.left['length_left'].apply( - lambda val: min(val, max_len_left)) - - data_pack.right['length_right'] = \ - data_pack.right['length_right'].apply( - lambda val: min(val, max_len_right)) - return data_pack - - -class SplitTokenize(Unit): - """Process unit for text tokenization.""" - - def transform(self, input_: str) -> list: - """ - Process input data from raw terms to list of tokens. - - :param input_: raw textual input. - - :return tokens: tokenized tokens as a list. - """ - return input_.split() diff --git a/matchzoo/preprocessors/fact_checking_elmo_preprocessor.py b/matchzoo/preprocessors/fact_checking_elmo_preprocessor.py deleted file mode 100644 index bf4b6e5..0000000 --- a/matchzoo/preprocessors/fact_checking_elmo_preprocessor.py +++ /dev/null @@ -1,173 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . 
import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor -from matchzoo.preprocessors.units import Unit -from matchzoo.preprocessors.declare_preprocessor import build_entity_unit -tqdm.pandas() - - -class FactCheckingElmoPreprocessor(BasicPreprocessor): - """ - Basic preprocessor helper for fact-checking with external evidence. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... ) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - - self._units = [SplitTokenize()] - if remove_stop_words: - self._units.append(units.stop_removal.StopRemoval()) - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. 
- """ - data_pack = data_pack.apply_on_text(chain_transform(self._units), verbose=verbose) - # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, - # data_pack, - # flatten=False, - # mode='right', - # verbose=verbose) - # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, - # mode='right', verbose=verbose) - # self._context['filter_unit'] = fitted_filter_unit - - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - - vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(self._fixed_length_left,), - (self._fixed_length_right,)] - - claim_source_unit = build_entity_unit(column="claim_source", data_pack=data_pack, mode="left") - article_source_unit = build_entity_unit(column="evidence_source", data_pack=data_pack, mode="right") - self._context['claim_source_unit'] = claim_source_unit - self._context['article_source_unit'] = article_source_unit - - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. - """ - data_pack = data_pack.copy() - - def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) - - def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) - - data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) - data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) - data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) - - # data_pack.apply_on_text(self._context['filter_unit'].transform, - # mode='right', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._context['vocab_unit'].transform, - mode='both', inplace=True, verbose=verbose) - data_pack.append_text_length(inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - - max_len_left = self._fixed_length_left - max_len_right = self._fixed_length_right - - data_pack.left['length_left'] = \ - data_pack.left['length_left'].apply( - lambda val: min(val, max_len_left)) - - data_pack.right['length_right'] = \ - data_pack.right['length_right'].apply( - lambda val: min(val, max_len_right)) - - return data_pack - - -class SplitTokenize(Unit): - """Process unit for text tokenization.""" - - def transform(self, input_: str) -> list: - """ - Process input data from raw terms to list of tokens. - - :param input_: raw textual input. - - :return tokens: tokenized tokens as a list. - """ - return input_.split() diff --git a/matchzoo/preprocessors/mz_pretrained_preprocessor.py b/matchzoo/preprocessors/mz_pretrained_preprocessor.py deleted file mode 100644 index d897a8e..0000000 --- a/matchzoo/preprocessors/mz_pretrained_preprocessor.py +++ /dev/null @@ -1,250 +0,0 @@ - -from tqdm import tqdm - -from . 
import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from pytorch_transformers import PreTrainedTokenizer -from pytorch_transformers.utils_glue import _truncate_seq_pair -from typing import List, Tuple -import pandas as pd -tqdm.pandas() - - -class PreTrainedModelsProcessor(PreTrainedTokenizer): - """ - a preprocessor for transform DataPack. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - - """ - - def __init__(self, max_seq_length: int, fixed_length_left: int = -1, - fixed_length_right: int = -1, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False, - - tokenizer: PreTrainedTokenizer = None): - """Initialization. We may need to store vocab path file, number of tokens, blah blah. - """ - FileHandler.myprint("Query truncated to " + str(fixed_length_left) + - " Doc truncated to " + str(fixed_length_right)) - super().__init__() - self.tokenizer = tokenizer - self.max_seq_length = max_seq_length - assert fixed_length_left > 0 and fixed_length_right > 0 - self.fixed_length_left = fixed_length_left - self.fixed_length_right = fixed_length_right - assert self.fixed_length_left + self.fixed_length_right < self.max_seq_length, \ - "Left + right should be smaller than max length" - - - def fit(self, data_pack: pd.DataFrame, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. - """ - raise NotImplementedError("Not coded yet") - - def transform(self, data_pack: pd.DataFrame, verbose: int = 1) -> Tuple[pd.DataFrame, List]: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. 
- """ - - - # data_pack.append_text_length(inplace = True, verbose = verbose) - # we need to split each text_left to an array of tokens, then we can convert them to ids - converted_features = self._convert_examples_to_features(data_pack, label_list = [0, 1], max_seq_length = self.max_seq_length, - tokenizer = self.tokenizer, output_mode = "classification") - - # data_pack.apply_on_text(str.split, mode = 'left', inplace = True, verbose = verbose) - # data_pack.apply_on_text(self.tokenizer.convert_tokens_to_ids, - # mode = 'left', inplace = True, verbose = verbose) - - # data_pack.apply_on_text(str.split, mode = 'right', inplace = True, verbose = verbose) - # data_pack.apply_on_text(self.tokenizer.convert_tokens_to_ids, - # mode = 'right', inplace = True, verbose = verbose) - - # max_len_left = self._fixed_length_left - # max_len_right = self._fixed_length_right - # - # data_pack.left['length_left'] = \ - # data_pack.left['length_left'].apply( - # lambda val: min(val, max_len_left)) - # - # data_pack.right['length_right'] = \ - # data_pack.right['length_right'].apply( - # lambda val: min(val, max_len_right)) - return data_pack, converted_features - - - - def _convert_examples_to_features(self, examples: pd.DataFrame, label_list, max_seq_length, - tokenizer, output_mode, - cls_token_at_end = False, - cls_token = '[CLS]', - cls_token_segment_id = 1, - sep_token = '[SEP]', - sep_token_extra = False, - pad_on_left = False, - pad_token = 0, - pad_token_segment_id = 0, - sequence_a_segment_id = 0, - sequence_b_segment_id = 1, - mask_padding_with_zero = True): - """ Loads a data file into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - """ - - label_map = {label: i for i, label in enumerate(label_list)} - # from tqdm import tqdm - features = [] - ex_index = -1 - FileHandler.myprint("Processing text_left and text_right to make it a full sequence for BERT........") - assert type(examples) == pd.DataFrame - for q_id, text_a, doc_id, text_b, label in zip(examples["id_left"], examples["text_left"], - examples["id_right"], examples["text_right"], examples["label"]): - ex_index += 1 - if ex_index % 10000 == 0: FileHandler.myprint("Processed xample %d of %d" % (ex_index, len(examples))) - tokens_a = text_a.split() - tokens_a = tokens_a[:self.fixed_length_left] - tokens_b = None - assert len(text_b) != 0, "Length of documents must be not zero!" - if text_b: - tokens_b = text_b.split() - tokens_b = tokens_b[: self.fixed_length_right] - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa. - special_tokens_count = 4 if sep_token_extra else 3 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. - special_tokens_count = 3 if sep_token_extra else 2 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . 
[SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = tokens_a + [sep_token] - if sep_token_extra: - # roberta uses an extra separator b/w pairs of sentences - tokens += [sep_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - if tokens_b: - tokens += tokens_b + [sep_token] - segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) - - if cls_token_at_end: - tokens = tokens + [cls_token] - segment_ids = segment_ids + [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. - padding_length = max_seq_length - len(input_ids) - if pad_on_left: - input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - if output_mode == "classification": - label_id = label_map[label] - elif output_mode == "regression": - label_id = float(label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - FileHandler.myprint("*** Example ***") - # FileHandler.myprint("guid: %s" % (example.guid)) - FileHandler.myprint("tokens: %s" % " ".join( - [str(x) for x in tokens])) - FileHandler.myprint("input_ids: %s" % " ".join([str(x) for x in input_ids])) - FileHandler.myprint("input_mask: %s" % " ".join([str(x) for x in input_mask])) - FileHandler.myprint("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - FileHandler.myprint("label: %s (id = %d)" % (label, label_id)) - - features.append( - InputFeatures(left_id = q_id, right_id = doc_id, - text_left = text_a, text_right = text_b, - input_ids = input_ids, - input_mask = input_mask, - segment_ids = segment_ids, - label_id = label_id)) - return features - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, left_id: int, right_id: int, - text_left: str, text_right: str, - input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - self.left_id = left_id - self.right_id = right_id - self.text_left = text_left - self.text_right = text_right \ No newline at end of file diff --git a/matchzoo/preprocessors/naive_preprocessor.py 
b/matchzoo/preprocessors/naive_preprocessor.py deleted file mode 100644 index 139da4e..0000000 --- a/matchzoo/preprocessors/naive_preprocessor.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Naive Preprocessor.""" - -from tqdm import tqdm - -from matchzoo.engine.base_preprocessor import BasePreprocessor -from matchzoo import DataPack -from .chain_transform import chain_transform -from .build_vocab_unit import build_vocab_unit -from . import units - -tqdm.pandas() - - -class NaivePreprocessor(BasePreprocessor): - """ - Naive preprocessor. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data() - >>> test_data = mz.datasets.toy.load_data(stage='test') - >>> preprocessor = mz.preprocessors.NaivePreprocessor() - >>> train_data_processed = preprocessor.fit_transform(train_data, - ... verbose=0) - >>> type(train_data_processed) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`NaivePreprocessor` instance. - """ - func = chain_transform(self._default_units()) - data_pack = data_pack.apply_on_text(func, verbose=verbose) - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create `tri-letter` representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. - """ - units_ = self._default_units() - units_.append(self._context['vocab_unit']) - units_.append(units.FixedLength(text_length=30, pad_mode='post')) - func = chain_transform(units_) - return data_pack.apply_on_text(func, verbose=verbose) diff --git a/matchzoo/preprocessors/tfidf_preprocessor.py b/matchzoo/preprocessors/tfidf_preprocessor.py deleted file mode 100644 index 896aa77..0000000 --- a/matchzoo/preprocessors/tfidf_preprocessor.py +++ /dev/null @@ -1,41 +0,0 @@ -import matchzoo -from typing import List, Dict, Tuple -import torch_utils -import collections -import numpy as np - -class TFIDF: - - idf_dict = {} - idf_char_ngram = {} - - @classmethod - def init(cls, corpus: List[List[str]], char_ngram_copus: List[List[str]] = []): - """docid, value: list of words """ - stats = cls._idf(corpus) - cls.idf_dict = stats - if char_ngram_copus: - cls.idf_char_ngram = cls._idf(char_ngram_copus) - - @classmethod - def get_term_idf(cls): - return cls.idf_dict - - @classmethod - def get_char_ngram_idf(cls): - return cls.idf_char_ngram - - @classmethod - def _df(cls, list_of_tokens: list) -> dict: - stats = collections.Counter() - for tokens in list_of_tokens: - stats.update(set(tokens)) - return stats - - @classmethod - def _idf(cls, list_of_tokens: list) -> dict: - num_docs = len(list_of_tokens) - stats = cls._df(list_of_tokens) - for key, val in stats.most_common(): - stats[key] = np.log((1.0 + num_docs) / (1.0 + val)) + 1.0 - return stats \ No newline at end of file diff --git a/setting_keywords.py b/setting_keywords.py new file mode 100644 index 0000000..a8cd122 --- /dev/null +++ b/setting_keywords.py @@ -0,0 +1,78 @@ + + +class KeyWordSettings(object): + + Doc_cID = "doc_cid" + Doc_URL = "doc_ciurl" + Doc_cLabel = "doc_clabel" + Doc_wImages = "doc_wimages" + Doc_wContent = 
"doc_wcontent" + Relevant_Score = "relevant_score" + + Query_id = "qid" + Query_TweetID = "qtweetid" + Query_Images = "query_images" + Ranked_Docs = "ranked_docs" + Query_Content = "query_content" + + Query_lens = "query_lens" + Doc_lens = "docs_lens" + + # for lstm keywords + QueryLensIndices = "query_lens_indices" + DocLensIndices = "doc_lens_indices" + + QueryIDs = "query_ids" + DocIDs = "doc_ids" + UseVisual = "use_visual" + + OutputRankingKey = "output_ranking" + + QueryCountVal = [1116, 1000, 187, 1159] + QueryCountTest = [1001, 1164, 1118, 187, 156, 1160, 1500] + + UseCuda = "use_cuda" + QuerySources = "query_sources" + DocSources = "doc_sources" + TempLabel = "fc_labels" + DocContentNoPaddingEvidence = "doc_content_without_padding_evidences" # to avoid empty sequences to lstm + QueryContentNoPaddingEvidence = "query_content_without_padding_evidences" + ClaimEmbeddings = "claim_embeddings" + EvidenceEmbeddings = "evidences_embeddings" + + EvidenceCountPerQuery = "evd_cnt_each_query" + FIXED_NUM_EVIDENCES = "fixed_num_evidences" + + LOSS_FUNCTIONS = ("cross_entropy") + + ClaimCountVal = [433, 356] + ClaimCountTest = [782, 781, 391, 390, 644, 642, 323, 321] + + AUC_metric = "auc" + F1_macro = "f1_macro" + F1_micro = "f1_micro" + F1 = "f1" + PrecisionTrueCls = "precision_true_cls" + RecallTrueCls = "recall_true_cls" + F1TrueCls = "f1_true_cls" + + PrecisionFalseCls = "precision_false_cls" + RecallFalseCls = "recall_false_cls" + F1FalseCls = "f1_false_cls" + + # for fact-checking error analysis + class FCClass: + DocAttentionScore = "doc_attention_score" + WordAttentionScore = "word_attention_score" + ClaimLabel = "claim_label" + PredictedProb = "predicted_prob" + AttentionWeightsInfo = "attention_weights_info" + CharSourceKey = "char_source" + QueryCharSource = "query_char_source" # characters of claims' source (i.e. chars of speakers' names) + DocCharSource = "doc_char_source" + + CLS_METRICS = [AUC_metric, F1_macro, F1_micro, F1, + PrecisionTrueCls, RecallTrueCls, F1TrueCls, + PrecisionFalseCls, RecallFalseCls, F1FalseCls] + + OutputHandlerFactChecking = "output_handler_fact_checking"