From d40185f781387172cd4f3ad5d98976e2f78f136b Mon Sep 17 00:00:00 2001 From: nguyenvo09 Date: Sun, 17 Jan 2021 23:52:36 -0500 Subject: [PATCH] refactor code --- .../FittingFC/char_man_fitter_query_repr1.py | 1 - Fitting/densebaseline_fit.py | 2 - __init__.py | 1 + handlers/mz_sampler.py | 156 ++++++++++ handlers/output_handler_FC.py | 79 +++++ handlers/tensorboard_writer.py | 17 + handlers/tensorboard_writer_class.py | 17 + interactions.py | 291 ++++++++++++++++++ matchzoo/preprocessors/__init__.py | 9 - matchzoo/preprocessors/basic_preprocessor.py | 1 - matchzoo/preprocessors/bow_preprocessor.py | 1 - matchzoo/preprocessors/cdssm_preprocessor.py | 125 -------- .../char_man_elmo_preprocessor.py | 269 ---------------- .../preprocessors/char_man_preprocessor.py | 1 - .../preprocessors/char_ngram_preprocessor.py | 95 ------ .../preprocessors/declare_preprocessor.py | 202 ------------ matchzoo/preprocessors/dssm_preprocessor.py | 124 -------- .../preprocessors/elmo_basic_preprocessor.py | 168 ---------- .../fact_checking_elmo_preprocessor.py | 173 ----------- .../mz_pretrained_preprocessor.py | 250 --------------- matchzoo/preprocessors/naive_preprocessor.py | 61 ---- matchzoo/preprocessors/tfidf_preprocessor.py | 41 --- setting_keywords.py | 78 +++++ 23 files changed, 639 insertions(+), 1523 deletions(-) create mode 100644 __init__.py create mode 100644 handlers/mz_sampler.py create mode 100644 handlers/output_handler_FC.py create mode 100644 handlers/tensorboard_writer.py create mode 100644 handlers/tensorboard_writer_class.py create mode 100644 interactions.py delete mode 100644 matchzoo/preprocessors/cdssm_preprocessor.py delete mode 100644 matchzoo/preprocessors/char_man_elmo_preprocessor.py delete mode 100644 matchzoo/preprocessors/char_ngram_preprocessor.py delete mode 100644 matchzoo/preprocessors/declare_preprocessor.py delete mode 100644 matchzoo/preprocessors/dssm_preprocessor.py delete mode 100644 matchzoo/preprocessors/elmo_basic_preprocessor.py delete mode 100644 matchzoo/preprocessors/fact_checking_elmo_preprocessor.py delete mode 100644 matchzoo/preprocessors/mz_pretrained_preprocessor.py delete mode 100644 matchzoo/preprocessors/naive_preprocessor.py delete mode 100644 matchzoo/preprocessors/tfidf_preprocessor.py create mode 100644 setting_keywords.py diff --git a/Fitting/FittingFC/char_man_fitter_query_repr1.py b/Fitting/FittingFC/char_man_fitter_query_repr1.py index d019c68..78961d4 100644 --- a/Fitting/FittingFC/char_man_fitter_query_repr1.py +++ b/Fitting/FittingFC/char_man_fitter_query_repr1.py @@ -5,7 +5,6 @@ import torch_utils as my_utils import time import interactions -from handlers.output_handler import FileHandler from handlers.tensorboard_writer import TensorboardWrapper from setting_keywords import KeyWordSettings from Fitting.FittingFC.multi_level_attention_composite_fitter import MultiLevelAttentionCompositeFitter diff --git a/Fitting/densebaseline_fit.py b/Fitting/densebaseline_fit.py index 7cbabbd..ce0fc42 100644 --- a/Fitting/densebaseline_fit.py +++ b/Fitting/densebaseline_fit.py @@ -14,9 +14,7 @@ import json import matchzoo import interactions -from handlers.output_handler import FileHandler from handlers.tensorboard_writer import TensorboardWrapper -from matchzoo.preprocessors.tfidf_preprocessor import TFIDF from setting_keywords import KeyWordSettings from matchzoo.metrics import average_precision, discounted_cumulative_gain, \ mean_average_precision, mean_reciprocal_rank, normalized_discounted_cumulative_gain, precision diff --git a/__init__.py 
b/__init__.py
new file mode 100644
index 0000000..e8cc56f
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+__version__ = 'v0.1.5'
diff --git a/handlers/mz_sampler.py b/handlers/mz_sampler.py
new file mode 100644
index 0000000..292dc7c
--- /dev/null
+++ b/handlers/mz_sampler.py
@@ -0,0 +1,156 @@
+"""
+Module containing functions for negative item sampling.
+"""
+
+import numpy as np
+from scipy.sparse import csr_matrix
+import torch_utils
+import time
+import interactions
+
+
+class Sampler(object):
+    def __init__(self):
+        super(Sampler, self).__init__()
+
+    def get_train_instances(self, interactions: interactions.MatchInteraction, num_negatives: int):
+        """
+        Sample negative documents for each query. The candidate set of a query is
+        the set of documents paired with that query with label 0 in the data pack
+        relation.
+
+        Parameters
+        ----------
+        interactions: :class:`interactions.MatchInteraction`
+            training instances, used for generating candidates. Note that because the
+            data comes from a MatchZoo data pack, negative cases are already present
+            in the left-right relations as well.
+        num_negatives: int
+            total number of negatives to sample for each query
+        """
+
+        query_ids = interactions.pos_queries.astype(np.int64)  # may not be unique
+        query_contents = interactions.np_query_contents.astype(np.int64)
+        query_lengths = interactions.np_query_lengths.astype(np.int64)
+
+        doc_ids = interactions.pos_docs.astype(np.int64)
+        doc_contents = interactions.np_doc_contents.astype(np.int64)
+        doc_lengths = interactions.np_doc_lengths.astype(np.int64)
+
+        negative_samples = np.zeros((query_ids.shape[0], num_negatives, interactions.padded_doc_length), np.int64)
+        negative_samples_lens = np.zeros((query_ids.shape[0], num_negatives), np.int64)
+        negative_docs_ids = np.zeros((query_ids.shape[0], num_negatives), np.int64)
+        self._candidate = interactions.negatives
+
+        for i, u in enumerate(query_ids):
+            for j in range(num_negatives):
+                x = self._candidate[u]
+                neg_item = x[np.random.randint(len(x))]  # int
+                # print("Neg_item: ", neg_item)
+                neg_item_content = interactions.dict_doc_contents[neg_item]  # np.array
+                negative_samples[i, j] = neg_item_content
+                negative_samples_lens[i, j] = interactions.dict_doc_lengths[neg_item]
+                negative_docs_ids[i, j] = neg_item
+                # if u <= 0:
+                #     print("Negative samples: ", negative_samples[i])
+        # print(negative_samples)
+        return query_ids, query_contents, query_lengths, \
+               doc_ids, doc_contents, doc_lengths, \
+               negative_docs_ids, negative_samples, negative_samples_lens
+
+    def get_train_instances_declare(self, interactions: interactions.ClassificationInteractions,
+                                    fixed_num_evidences: int):
+        """
+        Return all (claim, evidence) pairs together with their source ids and pair labels.
+
+        Parameters
+        ----------
+        interactions: :class:`interactions.ClassificationInteractions`
+            training instances
+        fixed_num_evidences: `int`
+            fixed number of evidences for each claim
+        """
+        claim_sources = np.array([interactions.dict_claim_source[e] for e in interactions.claims])
+        evidence_sources = np.array([interactions.dict_evd_source[e] for e in interactions.evidences])
+        return interactions.claims, interactions.claims_contents, interactions.claims_lens, claim_sources, \
+               interactions.evidences, interactions.evd_contents, interactions.evd_lens, evidence_sources, \
+               interactions.pair_labels
+
+    def get_train_instances_hanfc(self, interactions: interactions.ClassificationInteractions,
+                                  fixed_num_evidences: int):
+        """
+        For each query/claim, gather its evidences (up to `fixed_num_evidences` of them).
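+        Evidence slots beyond a claim's real evidences are padding: their doc ids and
+        sources are set to -1, their contents to pad tokens (zeros) and their lengths
+        to 0; `evd_cnt_each_query` records how many real evidences each claim has.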
+ Parameters + ---------- + interactions: :class:`interactions.ClassificationInteractions` + training instances, + fixed_num_evidences: `int` + fixed number of evidences for each claim + """ + + query_ids = interactions.claims.astype(np.int64) # must be all unique + query_labels = interactions.claims_labels + query_contents = interactions.np_query_contents.astype(np.int64) + query_lengths = interactions.np_query_lengths.astype(np.int64) + query_sources = np.array([interactions.dict_claim_source[q] for q in query_ids]) + + evd_docs_ids = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) - 1 # all indices are -1 + # by default it is all pad tokens + evd_docs_contents = np.zeros((query_ids.shape[0], fixed_num_evidences, interactions.padded_doc_length), np.int64) + evd_docs_lens = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) + evd_sources = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) - 1 # for padding sources are -1 + evd_cnt_each_query = np.zeros((query_ids.shape[0]), np.int64) + + for i, u in enumerate(query_ids): + evidences_info = interactions.dict_claims_and_evidences_test[u] # use u not i + assert len(evidences_info) <= fixed_num_evidences + evd_cnt_each_query[i] = len(evidences_info[0]) # number of real evidences for the query i + # we have a list of evidences, now I need to take the content and doc_id + for idx, (doc_id, doc_label, doc_content, doc_len) in enumerate(zip(*evidences_info)): + evd_docs_contents[i][idx] = doc_content # we already pad the content array with zeros due to init + evd_docs_lens[i][idx] = doc_len # we set 0 length for padding evidences + evd_docs_ids[i][idx] = doc_id # we set -1 as index for padding evidences + evd_sources[i][idx] = interactions.dict_evd_source[doc_id][0] # -1 since we have an array size 1 + + return query_ids, query_contents, query_lengths, query_sources, \ + evd_docs_ids, evd_docs_contents, evd_docs_lens, evd_sources, evd_cnt_each_query, query_labels + + def get_train_instances_char_man(self, interactions: interactions.ClassificationInteractions, + fixed_num_evidences: int): + """ + For each query/claim, we get its x number of evidences. 
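+        Same as `get_train_instances_hanfc`, but additionally returns character-level
+        source representations for the claim (`query_char_source`) and for each
+        evidence (`evd_docs_char_source_contents`).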
+ Parameters + ---------- + interactions: :class:`interactions.ClassificationInteractions` + training instances, + fixed_num_evidences: `int` + fixed number of evidences for each claim + """ + + query_ids = interactions.claims.astype(np.int64) # must be all unique + query_labels = interactions.claims_labels + query_contents = interactions.np_query_contents.astype(np.int64) + query_lengths = interactions.np_query_lengths.astype(np.int64) + query_char_source = interactions.np_query_char_source.astype(np.int64) + query_sources = np.array([interactions.dict_claim_source[q] for q in query_ids]) + + evd_docs_ids = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) - 1 # all indices are -1 + # by default it is all pad tokens + evd_docs_contents = np.zeros((query_ids.shape[0], fixed_num_evidences, interactions.padded_doc_length), np.int64) + evd_docs_lens = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) + evd_sources = np.zeros((query_ids.shape[0], fixed_num_evidences), np.int64) - 1 # for padding sources are -1 + evd_cnt_each_query = np.zeros((query_ids.shape[0]), np.int64) + evd_docs_char_source_contents = np.zeros((query_ids.shape[0], fixed_num_evidences, + interactions.padded_doc_char_source_length), np.int64) + + for i, u in enumerate(query_ids): + evidences_info = interactions.dict_claims_and_evidences_test[u] # use u not i + assert len(evidences_info) <= fixed_num_evidences + evd_cnt_each_query[i] = len(evidences_info[0]) # number of real evidences for the query i + # we have a list of evidences, now I need to take the content and doc_id + for idx, (doc_id, doc_label, doc_content, doc_len) in enumerate(zip(*evidences_info)): + evd_docs_contents[i][idx] = doc_content # we already pad the content array with zeros due to init + evd_docs_lens[i][idx] = doc_len # we set 0 length for padding evidences + evd_docs_ids[i][idx] = doc_id # we set -1 as index for padding evidences + evd_sources[i][idx] = interactions.dict_evd_source[doc_id][0] # -1 since we have an array size 1 + evd_docs_char_source_contents[i][idx] = interactions.dict_char_right_src[doc_id] + + return query_ids, query_contents, query_lengths, query_sources, query_char_source, \ + evd_docs_ids, evd_docs_contents, evd_docs_lens, evd_sources, evd_cnt_each_query, \ + evd_docs_char_source_contents, query_labels diff --git a/handlers/output_handler_FC.py b/handlers/output_handler_FC.py new file mode 100644 index 0000000..090e072 --- /dev/null +++ b/handlers/output_handler_FC.py @@ -0,0 +1,79 @@ +import sys + +class FileHandlerFC(object): + # mylogfile = None + # mylogfile_details = None + # error_analysis_log_validation = None + # error_analysis_log_testing = None + # error_analysis_log_test2 = None + # error_analysis_log_test3 = None + + def __init__(self): + pass + + # @classmethod + def init_log_files(self, log_file): + if log_file != None: + self.mylogfile = open(log_file, "w") + self.mylogfile_details = open(log_file + "_best_details.json", "w") + self.error_analysis_log_validation = open(log_file + "_error_analysis_validation.json", "w") + self.error_analysis_log_testing = open(log_file + "_error_analysis_testing.json", "w") + self.error_analysis_log_test2 = open(log_file + "_error_analysis_test2.json", "w") + self.error_analysis_log_test3 = open(log_file + "_error_analysis_test3.json", "w") + + # @classmethod + def myprint(self, message): + assert self.mylogfile != None, "The LogFile is not initialized yet!" 
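+        # Echo the message to stdout and append it to the main log file.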
+ print(message) + sys.stdout.flush() + if self.mylogfile != None: + print(message, file = self.mylogfile) + self.mylogfile.flush() + + # @classmethod + def myprint_details(self, message): + assert self.mylogfile_details != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.mylogfile_details != None: + print(message, file = self.mylogfile_details) + self.mylogfile_details.flush() + + # @classmethod + def save_error_analysis_validation(self, message: str): + assert self.error_analysis_log_validation != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.error_analysis_log_validation != None: + print(message, file = self.error_analysis_log_validation) + self.error_analysis_log_validation.flush() + + # @classmethod + def save_error_analysis_testing(self, message: str): + assert self.error_analysis_log_testing != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.error_analysis_log_testing != None: + print(message, file = self.error_analysis_log_testing) + self.error_analysis_log_testing.flush() + + # @classmethod + def save_error_analysis_test2(self, message: str): + assert self.error_analysis_log_test2 != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.error_analysis_log_test2 != None: + print(message, file=self.error_analysis_log_test2) + self.error_analysis_log_test2.flush() + + # @classmethod + def save_error_analysis_test3(self, message: str): + assert self.error_analysis_log_test3 != None, "The Detailed JSON log file is not initialized yet!" + # print(message) + if self.error_analysis_log_test3 != None: + print(message, file=self.error_analysis_log_test3) + self.error_analysis_log_test3.flush() + + def close(self): + self.mylogfile.close() + self.mylogfile_details.close() + self.error_analysis_log_validation.close() + self.error_analysis_log_testing.close() + self.error_analysis_log_test2.close() + self.error_analysis_log_test3.close() \ No newline at end of file diff --git a/handlers/tensorboard_writer.py b/handlers/tensorboard_writer.py new file mode 100644 index 0000000..51e7a2c --- /dev/null +++ b/handlers/tensorboard_writer.py @@ -0,0 +1,17 @@ +from tensorboardX import SummaryWriter + +class TensorboardWrapper(): + my_tensorboard_writer = None + + def __init__(self): + pass + + @classmethod + def init_log_files(cls, log_file): + if log_file != None: + cls.my_tensorboard_writer = SummaryWriter(log_file) + + @classmethod + def mywriter(cls): + assert cls.my_tensorboard_writer != None, "The LogFile is not initialized yet!" + return cls.my_tensorboard_writer \ No newline at end of file diff --git a/handlers/tensorboard_writer_class.py b/handlers/tensorboard_writer_class.py new file mode 100644 index 0000000..d8f4154 --- /dev/null +++ b/handlers/tensorboard_writer_class.py @@ -0,0 +1,17 @@ +from tensorboardX import SummaryWriter + +class TensorboardWrapperClass(object): + # my_tensorboard_writer = None + + def __init__(self): + pass + + # @classmethod + def init_log_files(self, log_file): + if log_file != None: + self.my_tensorboard_writer = SummaryWriter(log_file) + + # @classmethod + def mywriter(self): + assert self.my_tensorboard_writer != None, "The LogFile is not initialized yet!" 
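+        # Return the shared tensorboardX SummaryWriter created by init_log_files().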
+        return self.my_tensorboard_writer
\ No newline at end of file
diff --git a/interactions.py b/interactions.py
new file mode 100644
index 0000000..c9016ca
--- /dev/null
+++ b/interactions.py
@@ -0,0 +1,291 @@
+import numpy as np
+import pandas as pd
+import matchzoo
+import collections
+from setting_keywords import KeyWordSettings
+from handlers.output_handler import FileHandler
+
+
+class MatchInteraction(object):
+    """
+    Interactions object for matching models. It converts a preprocessed
+    :class:`matchzoo.DataPack` into plain dictionaries and numpy arrays so that
+    the data pack itself is no longer needed during training.
+
+    Relation rows with label 1 become the positive (query, doc) pairs; for each
+    query, the documents that appear with label 0 are kept as its pool of
+    negative candidates to sample from.
+
+    Parameters
+    ----------
+    data_pack: :class:`matchzoo.DataPack`
+        the preprocessed data pack to convert.
+
+    Attributes
+    ----------
+    unique_query_ids: array of np.int64
+        ids of the queries (left texts); guaranteed to be unique.
+    dict_query_contents: dict
+        padded, indexed token sequences of the queries, keyed by query id.
+    dict_query_lengths: dict
+        original (unpadded) lengths of the queries, keyed by query id.
+    unique_doc_ids: array of np.int64
+        ids of the documents (right texts); guaranteed to be unique.
+    dict_doc_contents: dict
+        padded, indexed token sequences of the documents, keyed by doc id.
+    dict_doc_lengths: dict
+        original (unpadded) lengths of the documents, keyed by doc id.
+    pos_queries: np.ndarray of np.int64
+        query ids of the positive (query, doc) pairs.
+    pos_docs: np.ndarray of np.int64
+        doc ids of the positive (query, doc) pairs.
+    negatives: dict
+        maps each query id to the list of its negative doc ids.
+    """
+
+    def __init__(self, data_pack: matchzoo.DataPack, **kargs):
+        # Note that these indices do not start from 0.
+        FileHandler.myprint("Converting DataFrame to Normal Dictionary of Data")
+        self.unique_query_ids, \
+        self.dict_query_contents, \
+        self.dict_query_lengths, \
+        self.dict_query_raw_contents, \
+        self.dict_query_positions = self.convert_leftright(data_pack.left, text_key = "text_left",
+                                                           length_text_key = "length_left",
+                                                           raw_text_key = "raw_text_left")
+        self.data_pack = data_pack
+        assert len(self.unique_query_ids) == len(set(self.unique_query_ids)), "Must be unique ids"
+        """ Note: ids are kept in the order produced by convert_leftright; no sorting is applied here.
""" + + self.unique_doc_ids, \ + self.dict_doc_contents, \ + self.dict_doc_lengths, \ + self.dict_doc_raw_contents, \ + self.dict_doc_positions = self.convert_leftright(data_pack.right, text_key = "text_right", + length_text_key = "length_right", + raw_text_key = "raw_text_right") + + assert len(self.unique_doc_ids) == len(set(self.unique_doc_ids)), "Must be unique ids for doc ids" + assert len(self.unique_query_ids) != len(self.unique_doc_ids), "Impossible to have equal number of docs and number of original tweets" + + self.pos_queries, \ + self.pos_docs, \ + self.negatives, \ + self.unique_queries_test = self.convert_relations(data_pack.relation) + + # for queries, padded + self.np_query_contents = np.array([self.dict_query_contents[q] for q in self.pos_queries]) + self.np_query_lengths = np.array([self.dict_query_lengths[q] for q in self.pos_queries]) + self.query_positions = np.array([self.dict_query_positions[q] for q in self.pos_queries]) + + # for docs, padded + self.np_doc_contents = np.array([self.dict_doc_contents[d] for d in self.pos_docs]) + self.np_doc_lengths = np.array([self.dict_doc_lengths[d] for d in self.pos_docs]) + self.doc_positions = np.array([self.dict_doc_positions[d] for d in self.pos_docs]) + + assert self.np_query_lengths.shape == self.np_doc_lengths.shape + self.padded_doc_length = len(self.np_doc_contents[0]) + self.padded_query_length = len(self.np_query_contents[0]) + + def convert_leftright(self, part: pd.DataFrame, text_key: str, length_text_key: str, raw_text_key: str, **kargs): + """ Converting the dataframe of interactions """ + ids, contents_dict, lengths_dict, position_dict = [], {}, {}, {} + raw_content_dict = {} + # Why don't we use the queryID as the key for dictionary???? + FileHandler.myprint("[NOTICE] MatchZoo use queryID and docID as index in dataframe left and right, " + "therefore, iterrows will return index which is left_id or right_id") + for index, row in part.iterrows(): # very dangerous, be careful because it may change order!!! + ids.append(index) + text_ = row[text_key] # text_ here is converted to numbers and padded + raw_content_dict[index] = row[raw_text_key] + + if length_text_key not in row: length_ = len(text_) + else: length_ = row[length_text_key] + assert length_ != 0 + assert index not in contents_dict + contents_dict[index] = text_ + lengths_dict[index] = length_ + position_dict[index] = np.pad(np.arange(length_) + 1, (0, len(text_) - length_), 'constant') + + return np.array(ids), contents_dict, lengths_dict, raw_content_dict, position_dict + + def convert_relations(self, relation: pd.DataFrame): + """ Convert relations. + We want to retrieve positive interactions and negative interactions. Particularly, + for every pair (query, doc) = 1, we get a list of negatives of the query q + + It is possible that a query may have multiple positive docs. Therefore, negatives[q] + may vary the lengths but not too much. 
+ """ + queries, docs, negatives = [], [], collections.defaultdict(list) + unique_queries = collections.defaultdict(list) + + for index, row in relation.iterrows(): + query = row["id_left"] + doc = row["id_right"] + label = row["label"] + assert label == 0 or label == 1 + unique_queries[query] = unique_queries.get(query, [[], [], [], []]) # doc, label, content, length + a, b, c, d = unique_queries[query] + a.append(doc) + b.append(label) + c.append(self.dict_doc_contents[doc]) + d.append(self.dict_doc_lengths[doc]) + + if label == 1: + queries.append(query) + docs.append(doc) + elif label == 0: + negatives[query].append(doc) + assert len(queries) == len(docs) + return np.array(queries), np.array(docs), negatives, unique_queries + + def __repr__(self): + + return ('' + .format( + num_users = self.num_users, + num_items = self.num_items, + num_interactions = len(self) + )) + + def _check(self): + pass + + +class BaseClassificationInteractions(object): + """ Base classification interactions for fact-checking with evidences """ + + def __init__(self, data_pack: matchzoo.DataPack, **kargs): + # FileHandler.myprint("Converting DataFrame to Normal Dictionary of Data") + self.output_handler = kargs[KeyWordSettings.OutputHandlerFactChecking] + self.output_handler.myprint("Converting DataFrame to Normal Dictionary of Data") + additional_field = {KeyWordSettings.FCClass.CharSourceKey: "char_claim_source"} + self.unique_query_ids, \ + self.dict_claim_contents, \ + self.dict_claim_lengths, \ + self.dict_query_raw_contents, \ + self.dict_query_positions, \ + self.dict_claim_source, \ + self.dict_raw_claim_source, \ + self.dict_char_left_src = self.convert_leftright(data_pack.left, text_key="text_left", + length_text_key="length_left", raw_text_key="raw_text_left", + source_key="claim_source", raw_source_key="raw_claim_source", **additional_field) + self.data_pack = data_pack + assert len(self.unique_query_ids) == len(set(self.unique_query_ids)), "Must be unique ids" + """ Why do I need to sort it? I have no idea why did I do it? """ + additional_field = {KeyWordSettings.FCClass.CharSourceKey: "char_evidence_source"} + self.unique_doc_ids, \ + self.dict_doc_contents, \ + self.dict_doc_lengths, \ + self.dict_doc_raw_contents, \ + self.dict_doc_positions, \ + self.dict_evd_source,\ + self.dict_raw_evd_source, \ + self.dict_char_right_src = self.convert_leftright(data_pack.right, text_key="text_right", + length_text_key="length_right", + raw_text_key="raw_text_right", source_key="evidence_source", + raw_source_key="raw_evidence_source", **additional_field) + + assert len(self.unique_doc_ids) == len(set(self.unique_doc_ids)), "Must be unique ids for doc ids" + assert len(self.unique_query_ids) != len( + self.unique_doc_ids), "Impossible to have equal number of docs and number of original tweets" + + def convert_leftright(self, part: pd.DataFrame, text_key: str, length_text_key: str, raw_text_key: str, + source_key: str, raw_source_key: str, **kargs): + """ Converting the dataframe of interactions """ + ids, contents_dict, lengths_dict, position_dict = [], {}, {}, {} + raw_content_dict, sources, raw_sources, char_sources = {}, {}, {}, {} + char_source_key = kargs[KeyWordSettings.FCClass.CharSourceKey] + # Why don't we use the queryID as the key for dictionary???? 
+ self.output_handler.myprint("[NOTICE] MatchZoo use queryID and docID as index in dataframe left and right, " + "therefore, iterrows will return index which is left_id or right_id") + for index, row in part.iterrows(): # very dangerous, be careful because it may change order!!! + ids.append(index) + text_ = row[text_key] # text_ here is converted to numbers and padded + raw_content_dict[index] = row[raw_text_key] + + if length_text_key not in row: length_ = len(text_) + else: length_ = row[length_text_key] + assert length_ != 0 + assert index not in contents_dict + contents_dict[index] = text_ + lengths_dict[index] = length_ + position_dict[index] = np.pad(np.arange(length_) + 1, (0, len(text_) - length_), 'constant') + sources[index] = row[source_key] + raw_sources[index] = row[raw_source_key] + char_sources[index] = row[char_source_key] + + return np.array(ids), contents_dict, lengths_dict, raw_content_dict, \ + position_dict, sources, raw_sources, char_sources + + def convert_relations(self, relation: pd.DataFrame): pass + + +class ClassificationInteractions(BaseClassificationInteractions): + """ + This class is for classification based on evidences. + Query - [list of evidences] -> labels + """ + + def __init__(self, data_pack: matchzoo.DataPack, **kargs): + super(ClassificationInteractions, self).__init__(data_pack, **kargs) + + # (1) unique claims, (2) labels for each claim and (3) info of each claim + self.claims, self.claims_labels, self.dict_claims_and_evidences_test = \ + self.convert_relations(data_pack.relation) + + # for queries, padded + self.np_query_contents = np.array([self.dict_claim_contents[q] for q in self.claims]) + self.np_query_lengths = np.array([self.dict_claim_lengths[q] for q in self.claims]) + self.np_query_char_source = np.array([self.dict_char_left_src[q] for q in self.claims]) + self.query_positions = np.array([self.dict_query_positions[q] for q in self.claims]) + + # assert self.np_query_lengths.shape == self.np_doc_lengths.shape + self.padded_doc_length = len(self.dict_doc_contents[self.unique_doc_ids[0]]) + self.padded_doc_char_source_length = len(self.dict_char_right_src[self.unique_doc_ids[0]]) + # self.padded_query_length = len(self.np_query_contents[0]) + + def convert_relations(self, relation: pd.DataFrame): + """ Convert relations. + We want to retrieve positive interactions and negative interactions. Particularly, + for every pair (query, doc) = 1, we get a list of negatives of the query q + + It is possible that a query may have multiple positive docs. Therefore, negatives[q] + may vary the lengths but not too much. 
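+        Returns the unique claim ids, one label per claim, and a dict mapping each
+        claim id to the (doc ids, labels, contents, lengths) of its evidences.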
+ """ + queries = [] # , collections.defaultdict(list) + queries_labels = [] + unique_queries = collections.defaultdict(list) + set_queries = set() + + for index, row in relation.iterrows(): + query = row["id_left"] + doc = row["id_right"] + label = row["label"] + # assert label == 0 or label == 1 + unique_queries[query] = unique_queries.get(query, [[], [], [], []]) # doc, label, content, length + a, b, c, d = unique_queries[query] + a.append(doc) + b.append(label) + c.append(self.dict_doc_contents[doc]) + d.append(self.dict_doc_lengths[doc]) + + if query not in set_queries: + queries.append(query) # same as unique_queries + queries_labels.append(label) + set_queries.add(query) + + assert len(queries) == len(unique_queries) + return np.array(queries), np.array(queries_labels), unique_queries diff --git a/matchzoo/preprocessors/__init__.py b/matchzoo/preprocessors/__init__.py index 9ca462e..98340c3 100644 --- a/matchzoo/preprocessors/__init__.py +++ b/matchzoo/preprocessors/__init__.py @@ -1,16 +1,7 @@ from . import units -from .dssm_preprocessor import DSSMPreprocessor -from .naive_preprocessor import NaivePreprocessor from .basic_preprocessor import BasicPreprocessor -from .cdssm_preprocessor import CDSSMPreprocessor -from .mz_pretrained_preprocessor import PreTrainedModelsProcessor -from .char_ngram_preprocessor import CharNGramPreprocessor -from .elmo_basic_preprocessor import ElmoPreprocessor from .bow_preprocessor import BoWPreprocessor -from .declare_preprocessor import DeClarePreprocessor -from .fact_checking_elmo_preprocessor import FactCheckingElmoPreprocessor from .char_man_preprocessor import CharManPreprocessor -from .char_man_elmo_preprocessor import CharManElmoPreprocessor def list_available() -> list: diff --git a/matchzoo/preprocessors/basic_preprocessor.py b/matchzoo/preprocessors/basic_preprocessor.py index d72df17..7a042aa 100644 --- a/matchzoo/preprocessors/basic_preprocessor.py +++ b/matchzoo/preprocessors/basic_preprocessor.py @@ -8,7 +8,6 @@ from .build_vocab_unit import build_vocab_unit from .build_unit_from_data_pack import build_unit_from_data_pack from .chain_transform import chain_transform -from handlers.output_handler import FileHandler tqdm.pandas() diff --git a/matchzoo/preprocessors/bow_preprocessor.py b/matchzoo/preprocessors/bow_preprocessor.py index eb33559..4c15c04 100644 --- a/matchzoo/preprocessors/bow_preprocessor.py +++ b/matchzoo/preprocessors/bow_preprocessor.py @@ -8,7 +8,6 @@ from .build_vocab_unit import build_vocab_unit from .build_unit_from_data_pack import build_unit_from_data_pack from .chain_transform import chain_transform -from handlers.output_handler import FileHandler from typing import List import torch import itertools, os diff --git a/matchzoo/preprocessors/cdssm_preprocessor.py b/matchzoo/preprocessors/cdssm_preprocessor.py deleted file mode 100644 index edeac4e..0000000 --- a/matchzoo/preprocessors/cdssm_preprocessor.py +++ /dev/null @@ -1,125 +0,0 @@ -"""CDSSM Preprocessor.""" - -from tqdm import tqdm - -from . import units -from .chain_transform import chain_transform -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit - -tqdm.pandas() - - -class CDSSMPreprocessor(BasePreprocessor): - """CDSSM Model preprocessor.""" - - def __init__(self, - fixed_length_left: int = 10, - fixed_length_right: int = 40, - with_word_hashing: bool = True): - """ - CDSSM Model preprocessor. - - The word hashing step could eats up a lot of memory. 
To workaround - this problem, set `with_word_hashing` to `False` and use a - :class:`matchzoo.DynamicDataGenerator` with a - :class:`matchzoo.preprocessor.units.WordHashing`. - - TODO: doc here. - - :param with_word_hashing: Include a word hashing step if `True`. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data() - >>> test_data = mz.datasets.toy.load_data(stage='test') - >>> cdssm_preprocessor = mz.preprocessors.CDSSMPreprocessor() - >>> train_data_processed = cdssm_preprocessor.fit_transform( - ... train_data, verbose=0 - ... ) - >>> type(train_data_processed) - - >>> test_data_transformed = cdssm_preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_value='0', pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_value='0', pad_mode='post' - ) - self._with_word_hashing = with_word_hashing - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param verbose: Verbosity. - :param data_pack: Data_pack to be preprocessed. - :return: class:`CDSSMPreprocessor` instance. - """ - fit_units = self._default_units() + [units.NgramLetter()] - func = chain_transform(fit_units) - data_pack = data_pack.apply_on_text(func, verbose=verbose) - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - - self._context['vocab_unit'] = vocab_unit - vocab_size = len(vocab_unit.state['term_index']) + 1 - self._context['input_shapes'] = [ - (self._fixed_length_left, vocab_size), - (self._fixed_length_right, vocab_size) - ] - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create `letter-ngram` representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. 
- """ - data_pack = data_pack.copy() - func = chain_transform(self._default_units()) - data_pack.apply_on_text(func, inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - post_units = [units.NgramLetter(reduce_dim=False)] - if self._with_word_hashing: - term_index = self._context['vocab_unit'].state['term_index'] - post_units.append(units.WordHashing(term_index)) - data_pack.apply_on_text(chain_transform(post_units), - inplace=True, verbose=verbose) - return data_pack - - @classmethod - def _default_units(cls) -> list: - """Prepare needed process units.""" - return [ - units.Tokenize(), - units.Lowercase(), - units.PuncRemoval(), - units.StopRemoval(), - ] - - @property - def with_word_hashing(self): - """`with_word_hashing` getter.""" - return self._with_word_hashing - - @with_word_hashing.setter - def with_word_hashing(self, value): - """`with_word_hashing` setter.""" - self._with_word_hashing = value diff --git a/matchzoo/preprocessors/char_man_elmo_preprocessor.py b/matchzoo/preprocessors/char_man_elmo_preprocessor.py deleted file mode 100644 index ec4fda0..0000000 --- a/matchzoo/preprocessors/char_man_elmo_preprocessor.py +++ /dev/null @@ -1,269 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor -from matchzoo.preprocessors.units import Unit -from .units import Vocabulary -tqdm.pandas() - - -class CharManElmoPreprocessor(BasicPreprocessor): - """ - Baisc preprocessor helper for fact-checking with external evidences for my - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... ) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... 
verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - fixed_length_left_src: int = 30, - fixed_length_right_src: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._fixed_length_left_src = fixed_length_left_src - self._fixed_length_right_src = fixed_length_right_src - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - # for padding character level of left_source and right_source - self._left_char_src_fixedlength_unit = units.FixedLength(self._fixed_length_left_src, pad_mode='post') - self._right_char_src_fixedlength_unit = units.FixedLength(self._fixed_length_right_src, pad_mode='post') - - self.char_unit = units.ngram_letter.NgramLetter(ngram=1, reduce_dim=True) - self._units = [SplitTokenize()] - if remove_stop_words: - self._units.append(units.stop_removal.StopRemoval()) - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. - """ - data_pack = data_pack.apply_on_text(chain_transform(self._units), verbose=verbose) - # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, - # data_pack, - # flatten=False, - # mode='right', - # verbose=verbose) - # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, - # mode='right', verbose=verbose) - # self._context['filter_unit'] = fitted_filter_unit - - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - - vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(self._fixed_length_left,), - (self._fixed_length_right,)] - - claim_source_unit = build_entity_unit(column="claim_source", data_pack=data_pack, mode="left") - article_source_unit = build_entity_unit(column="evidence_source", data_pack=data_pack, mode="right") - self._context['claim_source_unit'] = claim_source_unit - self._context['article_source_unit'] = article_source_unit - - char_source_unit = build_ngram_unit(left_column="claim_source", right_column="evidence_source", - data_pack=data_pack, mode="both") - self._context['char_source_unit'] = char_source_unit - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. 
- """ - data_pack = data_pack.copy() - - def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) - - def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) - - def map_src2char(entity: str): - return self._context['char_source_unit'].transform(list(entity)) - - data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) - data_pack.left["char_claim_source"] = data_pack.left["char_claim_source"].progress_apply(map_src2char) - data_pack.left["char_claim_source"] = data_pack.left["char_claim_source"].progress_apply( - self._left_char_src_fixedlength_unit.transform) - - data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) - data_pack.right["char_evidence_source"] = data_pack.right["char_evidence_source"].progress_apply(map_src2char) - data_pack.right["char_evidence_source"] = data_pack.right["char_evidence_source"].progress_apply( - self._right_char_src_fixedlength_unit.transform) - - data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) - - # data_pack.apply_on_text(self._context['filter_unit'].transform, - # mode='right', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._context['vocab_unit'].transform, - mode='both', inplace=True, verbose=verbose) - data_pack.append_text_length(inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - - max_len_left = self._fixed_length_left - max_len_right = self._fixed_length_right - - data_pack.left['length_left'] = \ - data_pack.left['length_left'].apply( - lambda val: min(val, max_len_left)) - - data_pack.right['length_right'] = \ - data_pack.right['length_right'].apply( - lambda val: min(val, max_len_right)) - - return data_pack - - -class SplitTokenize(Unit): - """Process unit for text tokenization.""" - - def transform(self, input_: str) -> list: - """ - Process input data from raw terms to list of tokens. - - :param input_: raw textual input. - - :return tokens: tokenized tokens as a list. - """ - return input_.split() - - -def build_entity_unit( - column: str, - data_pack: DataPack, - mode: str = 'both', - verbose: int = 1 -) -> Vocabulary: - """ - Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. - - The `data_pack` should be preprocessed forehand, and each item in - `text_left` and `text_right` columns of the `data_pack` should be a list - of tokens. - - :param column: `str` the selected column to build units - :param data_pack: The :class:`DataPack` to build vocabulary upon. - :param mode: One of 'left', 'right', and 'both', to determine the source - data for building the :class:`VocabularyUnit`. - :param verbose: Verbosity. - :return: A built vocabulary unit. - - """ - unit = Vocabulary() - corpus = [] - def func(entity: str): corpus.append(entity.strip()) - assert mode in ["left", "right"] - if mode == "left": - data_pack.left[column].progress_apply(func) - elif mode == "right": - data_pack.right[column].progress_apply(func) - else: - raise NotImplemented("Not coded for both columns") - - if verbose: - description = 'Building Entities ' + unit.__class__.__name__ + ' from a datapack.' 
- corpus = tqdm(corpus, desc=description) - unit.fit(corpus) - return unit - - -def build_ngram_unit(left_column: str, right_column: str, data_pack: DataPack, mode: str = 'both', verbose: int = 1): - """ - Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. - - The `data_pack` should be preprocessed forehand, and each item in - `text_left` and `text_right` columns of the `data_pack` should be a list - of tokens. - - :param column: `str` the selected column to build units - :param data_pack: The :class:`DataPack` to build vocabulary upon. - :param mode: One of 'left', 'right', and 'both', to determine the source - data for building the :class:`VocabularyUnit`. - :param verbose: Verbosity. - :return: A built vocabulary unit. - - """ - unit = Vocabulary() - corpus = [] - - def func(entity: str): - assert type(entity) == str - entity = entity.strip() - for c in entity: corpus.append(c) - - assert mode == "both" - data_pack.left[left_column].progress_apply(func) - data_pack.right[right_column].progress_apply(func) - - if verbose: - description = 'Building Characters ' + unit.__class__.__name__ + ' from a datapack.' - corpus = tqdm(corpus, desc=description) - unit.fit(corpus) - return unit diff --git a/matchzoo/preprocessors/char_man_preprocessor.py b/matchzoo/preprocessors/char_man_preprocessor.py index 779fd69..7bc207a 100644 --- a/matchzoo/preprocessors/char_man_preprocessor.py +++ b/matchzoo/preprocessors/char_man_preprocessor.py @@ -8,7 +8,6 @@ from .build_vocab_unit import build_vocab_unit from .build_unit_from_data_pack import build_unit_from_data_pack from .chain_transform import chain_transform -from handlers.output_handler import FileHandler from .units import Vocabulary from .units import StatefulUnit tqdm.pandas() diff --git a/matchzoo/preprocessors/char_ngram_preprocessor.py b/matchzoo/preprocessors/char_ngram_preprocessor.py deleted file mode 100644 index 22087fc..0000000 --- a/matchzoo/preprocessors/char_ngram_preprocessor.py +++ /dev/null @@ -1,95 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . import units -from matchzoo import DataPack -from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler - -tqdm.pandas() - - -class CharNGramPreprocessor(BasicPreprocessor): - """ - Baisc preprocessor helper. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... 
) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - # super().__init__() - super(BasicPreprocessor, self).__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - self._filter_unit = units.FrequencyFilter( - low=filter_low_freq, - high=filter_high_freq, - mode=filter_mode - ) - self._units = self._default_units() - # if remove_stop_words: - # self._units.append(units.stop_removal.StopRemoval()) - - def _default_units(cls) -> list: - return [ - units.Tokenize(), - units.Lowercase(), - units.PuncRemoval(), - units.StopRemoval(), - units.NgramLetter(), - ] diff --git a/matchzoo/preprocessors/declare_preprocessor.py b/matchzoo/preprocessors/declare_preprocessor.py deleted file mode 100644 index 3bca49f..0000000 --- a/matchzoo/preprocessors/declare_preprocessor.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from .units import Vocabulary -from .units import StatefulUnit -tqdm.pandas() - - -class DeClarePreprocessor(BasePreprocessor): - """ - Declare preprocessor helper which has source embeddings. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... ) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... 
verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - # self._filter_unit = units.FrequencyFilter( - # low=filter_low_freq, - # high=filter_high_freq, - # mode=filter_mode - # ) - self._units = self._default_units() - if remove_stop_words: - self._units.append(units.stop_removal.StopRemoval()) - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. - """ - data_pack = data_pack.apply_on_text(chain_transform(self._units), - verbose=verbose) - # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, - # data_pack, - # flatten=False, - # mode='right', - # verbose=verbose) - # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, - # mode='right', verbose=verbose) - # self._context['filter_unit'] = fitted_filter_unit - - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - - vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(self._fixed_length_left,), - (self._fixed_length_right,)] - - claim_source_unit = build_entity_unit(column = "claim_source", data_pack = data_pack, mode = "left") - article_source_unit = build_entity_unit(column = "evidence_source", data_pack = data_pack, mode = "right") - self._context['claim_source_unit'] = claim_source_unit - self._context['article_source_unit'] = article_source_unit - - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. 
- """ - data_pack = data_pack.copy() - - def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) - - def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) - - data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) - data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) - data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) - - # data_pack.apply_on_text(self._context['filter_unit'].transform, - # mode='right', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._context['vocab_unit'].transform, - mode='both', inplace=True, verbose=verbose) - data_pack.append_text_length(inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - - max_len_left = self._fixed_length_left - max_len_right = self._fixed_length_right - - data_pack.left['length_left'] = \ - data_pack.left['length_left'].apply( - lambda val: min(val, max_len_left)) - - data_pack.right['length_right'] = \ - data_pack.right['length_right'].apply( - lambda val: min(val, max_len_right)) - - return data_pack - - -def build_entity_unit( - column: str, - data_pack: DataPack, - mode: str = 'both', - verbose: int = 1 -) -> Vocabulary: - """ - Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. - - The `data_pack` should be preprocessed forehand, and each item in - `text_left` and `text_right` columns of the `data_pack` should be a list - of tokens. - - :param column: `str` the selected column to build units - :param data_pack: The :class:`DataPack` to build vocabulary upon. - :param mode: One of 'left', 'right', and 'both', to determine the source - data for building the :class:`VocabularyUnit`. - :param verbose: Verbosity. - :return: A built vocabulary unit. - - """ - unit = Vocabulary() - corpus = [] - def func(entity: str): corpus.append(entity.strip()) - assert mode in ["left", "right"] - if mode == "left": - data_pack.left[column].progress_apply(func) - elif mode == "right": - data_pack.right[column].progress_apply(func) - else: - raise NotImplemented("Not coded for both columns") - - if verbose: - description = 'Building ' + unit.__class__.__name__ + ' from a datapack.' - corpus = tqdm(corpus, desc=description) - unit.fit(corpus) - return unit diff --git a/matchzoo/preprocessors/dssm_preprocessor.py b/matchzoo/preprocessors/dssm_preprocessor.py deleted file mode 100644 index fb2ebab..0000000 --- a/matchzoo/preprocessors/dssm_preprocessor.py +++ /dev/null @@ -1,124 +0,0 @@ -"""DSSM Preprocessor.""" - -from tqdm import tqdm - -from matchzoo.data_pack import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .chain_transform import chain_transform -from .build_vocab_unit import build_vocab_unit -from . import units - -tqdm.pandas() - - -class DSSMPreprocessor(BasePreprocessor): - """DSSM Model preprocessor.""" - - def __init__(self, with_word_hashing: bool = True): - """ - DSSM Model preprocessor. - - The word hashing step could eats up a lot of memory. To workaround - this problem, set `with_word_hashing` to `False` and use a - :class:`matchzoo.DynamicDataGenerator` with a - :class:`matchzoo.preprocessor.units.WordHashing`. 
- - :param with_word_hashing: Include a word hashing step if `True`. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data() - >>> test_data = mz.datasets.toy.load_data(stage='test') - >>> dssm_preprocessor = mz.preprocessors.DSSMPreprocessor() - >>> train_data_processed = dssm_preprocessor.fit_transform( - ... train_data, verbose=0 - ... ) - >>> type(train_data_processed) - - >>> test_data_transformed = dssm_preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - super().__init__() - self._with_word_hashing = with_word_hashing - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param verbose: Verbosity. - :param data_pack: data_pack to be preprocessed. - :return: class:`DSSMPreprocessor` instance. - """ - DEBUG = False - if DEBUG: - func2 = chain_transform(self.old_units()) - data_packx = data_pack.apply_on_text(func2, verbose = verbose) - # transform text, after tokenizing, remove stop words and blah blah - vocab_unit2 = build_vocab_unit(data_packx, verbose = verbose) - vocab_size_without_using_letter_ngram = len(vocab_unit2.state['term_index']) + 1 - print("Vocab size without using letter_ngram", vocab_size_without_using_letter_ngram) - - func = chain_transform(self._default_units()) - data_pack = data_pack.apply_on_text(func, verbose=verbose) - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - - self._context['vocab_unit'] = vocab_unit - vocab_size = len(vocab_unit.state['term_index']) + 1 - if DEBUG: - print("Vocab size using letter_ngram", vocab_size) - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(vocab_size,), (vocab_size,)] - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create `tri-letter` representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. - """ - data_pack = data_pack.copy() - units_ = self._default_units() - assert len(units_) == 5, "Must have 5 pre-processing step in DSSM " - if self._with_word_hashing: - term_index = self._context['vocab_unit'].state['term_index'] - units_.append(units.WordHashing(term_index)) - func = chain_transform(units_) - data_pack.apply_on_text(func, inplace=True, verbose=verbose) - return data_pack - - @classmethod - def _default_units(cls) -> list: - """Prepare needed process units.""" - return [ - units.Tokenize(), - units.Lowercase(), - units.PuncRemoval(), - units.StopRemoval(), - units.NgramLetter(), - ] - - @property - def with_word_hashing(self): - """`with_word_hashing` getter.""" - return self._with_word_hashing - - @with_word_hashing.setter - def with_word_hashing(self, value): - """`with_word_hashing` setter.""" - self._with_word_hashing = value - - def old_units(self) -> list: - """Prepare needed process units.""" - return [ - units.Tokenize(), - units.Lowercase(), - units.PuncRemoval(), - units.StopRemoval() - ] \ No newline at end of file diff --git a/matchzoo/preprocessors/elmo_basic_preprocessor.py b/matchzoo/preprocessors/elmo_basic_preprocessor.py deleted file mode 100644 index aae3f01..0000000 --- a/matchzoo/preprocessors/elmo_basic_preprocessor.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . 
import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor -from matchzoo.preprocessors.units import Unit -tqdm.pandas() - - -class ElmoPreprocessor(BasicPreprocessor): - """ - Baisc preprocessor helper. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... ) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - # self._filter_unit = units.FrequencyFilter( - # low=filter_low_freq, - # high=filter_high_freq, - # mode=filter_mode - # ) - self._units = [SplitTokenize()] - # self._char_left = units.AllenCharUnit(self._fixed_length_left) - # self._char_right = units.AllenCharUnit(self._fixed_length_right) - if remove_stop_words: - self._units.append(units.stop_removal.StopRemoval()) - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. 
- """ - data_pack = data_pack.apply_on_text(chain_transform(self._units), verbose=verbose) - # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, - # data_pack, - # flatten=False, - # mode='right', - # verbose=verbose) - # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, - # mode='right', verbose=verbose) - # self._context['filter_unit'] = fitted_filter_unit - - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - - vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(self._fixed_length_left,), - (self._fixed_length_right,)] - - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. - """ - data_pack = data_pack.copy() - data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) - - # data_pack.apply_on_text(self._context['filter_unit'].transform, - # mode='right', inplace=True, verbose=verbose) - # data_pack.apply_on_text(self._char_left.transform, mode='left', inplace=True, verbose=verbose, rename="char_left") - # data_pack.apply_on_text(self._char_right.transform, mode='right', inplace=True, verbose=verbose, rename="char_right") - - data_pack.apply_on_text(self._context['vocab_unit'].transform, - mode='both', inplace=True, verbose=verbose) - data_pack.append_text_length(inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - - max_len_left = self._fixed_length_left - max_len_right = self._fixed_length_right - - data_pack.left['length_left'] = \ - data_pack.left['length_left'].apply( - lambda val: min(val, max_len_left)) - - data_pack.right['length_right'] = \ - data_pack.right['length_right'].apply( - lambda val: min(val, max_len_right)) - return data_pack - - -class SplitTokenize(Unit): - """Process unit for text tokenization.""" - - def transform(self, input_: str) -> list: - """ - Process input data from raw terms to list of tokens. - - :param input_: raw textual input. - - :return tokens: tokenized tokens as a list. - """ - return input_.split() diff --git a/matchzoo/preprocessors/fact_checking_elmo_preprocessor.py b/matchzoo/preprocessors/fact_checking_elmo_preprocessor.py deleted file mode 100644 index bf4b6e5..0000000 --- a/matchzoo/preprocessors/fact_checking_elmo_preprocessor.py +++ /dev/null @@ -1,173 +0,0 @@ -"""Basic Preprocessor.""" - -from tqdm import tqdm - -from . 
import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor -from matchzoo.preprocessors.units import Unit -from matchzoo.preprocessors.declare_preprocessor import build_entity_unit -tqdm.pandas() - - -class FactCheckingElmoPreprocessor(BasicPreprocessor): - """ - Basic preprocessor helper for fact-checking with external evidence. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data('train') - >>> test_data = mz.datasets.toy.load_data('test') - >>> preprocessor = mz.preprocessors.BasicPreprocessor( - ... fixed_length_left=10, - ... fixed_length_right=20, - ... filter_mode='df', - ... filter_low_freq=2, - ... filter_high_freq=1000, - ... remove_stop_words=True - ... ) - >>> preprocessor = preprocessor.fit(train_data, verbose=0) - >>> preprocessor.context['input_shapes'] - [(10,), (20,)] - >>> preprocessor.context['vocab_size'] - 225 - >>> processed_train_data = preprocessor.transform(train_data, - ... verbose=0) - >>> type(processed_train_data) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - - def __init__(self, fixed_length_left: int = 30, - fixed_length_right: int = 30, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False): - """Initialization.""" - super().__init__() - self._fixed_length_left = fixed_length_left - self._fixed_length_right = fixed_length_right - self._left_fixedlength_unit = units.FixedLength( - self._fixed_length_left, - pad_mode='post' - ) - self._right_fixedlength_unit = units.FixedLength( - self._fixed_length_right, - pad_mode='post' - ) - - self._units = [SplitTokenize()] - if remove_stop_words: - self._units.append(units.stop_removal.StopRemoval()) - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. 
- """ - data_pack = data_pack.apply_on_text(chain_transform(self._units), verbose=verbose) - # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, - # data_pack, - # flatten=False, - # mode='right', - # verbose=verbose) - # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, - # mode='right', verbose=verbose) - # self._context['filter_unit'] = fitted_filter_unit - - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - - vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding - self._context['vocab_size'] = vocab_size - self._context['embedding_input_dim'] = vocab_size - self._context['input_shapes'] = [(self._fixed_length_left,), - (self._fixed_length_right,)] - - claim_source_unit = build_entity_unit(column="claim_source", data_pack=data_pack, mode="left") - article_source_unit = build_entity_unit(column="evidence_source", data_pack=data_pack, mode="right") - self._context['claim_source_unit'] = claim_source_unit - self._context['article_source_unit'] = article_source_unit - - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. - """ - data_pack = data_pack.copy() - - def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) - - def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) - - data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) - data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) - data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) - - # data_pack.apply_on_text(self._context['filter_unit'].transform, - # mode='right', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._context['vocab_unit'].transform, - mode='both', inplace=True, verbose=verbose) - data_pack.append_text_length(inplace=True, verbose=verbose) - data_pack.apply_on_text(self._left_fixedlength_unit.transform, - mode='left', inplace=True, verbose=verbose) - data_pack.apply_on_text(self._right_fixedlength_unit.transform, - mode='right', inplace=True, verbose=verbose) - - max_len_left = self._fixed_length_left - max_len_right = self._fixed_length_right - - data_pack.left['length_left'] = \ - data_pack.left['length_left'].apply( - lambda val: min(val, max_len_left)) - - data_pack.right['length_right'] = \ - data_pack.right['length_right'].apply( - lambda val: min(val, max_len_right)) - - return data_pack - - -class SplitTokenize(Unit): - """Process unit for text tokenization.""" - - def transform(self, input_: str) -> list: - """ - Process input data from raw terms to list of tokens. - - :param input_: raw textual input. - - :return tokens: tokenized tokens as a list. - """ - return input_.split() diff --git a/matchzoo/preprocessors/mz_pretrained_preprocessor.py b/matchzoo/preprocessors/mz_pretrained_preprocessor.py deleted file mode 100644 index d897a8e..0000000 --- a/matchzoo/preprocessors/mz_pretrained_preprocessor.py +++ /dev/null @@ -1,250 +0,0 @@ - -from tqdm import tqdm - -from . 
import units -from matchzoo import DataPack -from matchzoo.engine.base_preprocessor import BasePreprocessor -from .build_vocab_unit import build_vocab_unit -from .build_unit_from_data_pack import build_unit_from_data_pack -from .chain_transform import chain_transform -from handlers.output_handler import FileHandler -from pytorch_transformers import PreTrainedTokenizer -from pytorch_transformers.utils_glue import _truncate_seq_pair -from typing import List, Tuple -import pandas as pd -tqdm.pandas() - - -class PreTrainedModelsProcessor(PreTrainedTokenizer): - """ - a preprocessor for transform DataPack. - - :param fixed_length_left: Integer, maximize length of :attr:`left` in the - data_pack. - :param fixed_length_right: Integer, maximize length of :attr:`right` in the - data_pack. - :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can - be 'df', 'cf', and 'idf'. - :param filter_low_freq: Float, lower bound value used by - :class:`FrequenceFilterUnit`. - :param filter_high_freq: Float, upper bound value used by - :class:`FrequenceFilterUnit`. - :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. - - - """ - - def __init__(self, max_seq_length: int, fixed_length_left: int = -1, - fixed_length_right: int = -1, - filter_mode: str = 'df', - filter_low_freq: float = 2, - filter_high_freq: float = float('inf'), - remove_stop_words: bool = False, - - tokenizer: PreTrainedTokenizer = None): - """Initialization. We may need to store vocab path file, number of tokens, blah blah. - """ - FileHandler.myprint("Query truncated to " + str(fixed_length_left) + - " Doc truncated to " + str(fixed_length_right)) - super().__init__() - self.tokenizer = tokenizer - self.max_seq_length = max_seq_length - assert fixed_length_left > 0 and fixed_length_right > 0 - self.fixed_length_left = fixed_length_left - self.fixed_length_right = fixed_length_right - assert self.fixed_length_left + self.fixed_length_right < self.max_seq_length, \ - "Left + right should be smaller than max length" - - - def fit(self, data_pack: pd.DataFrame, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`BasicPreprocessor` instance. - """ - raise NotImplementedError("Not coded yet") - - def transform(self, data_pack: pd.DataFrame, verbose: int = 1) -> Tuple[pd.DataFrame, List]: - """ - Apply transformation on data, create fixed length representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. 
- """ - - - # data_pack.append_text_length(inplace = True, verbose = verbose) - # we need to split each text_left to an array of tokens, then we can convert them to ids - converted_features = self._convert_examples_to_features(data_pack, label_list = [0, 1], max_seq_length = self.max_seq_length, - tokenizer = self.tokenizer, output_mode = "classification") - - # data_pack.apply_on_text(str.split, mode = 'left', inplace = True, verbose = verbose) - # data_pack.apply_on_text(self.tokenizer.convert_tokens_to_ids, - # mode = 'left', inplace = True, verbose = verbose) - - # data_pack.apply_on_text(str.split, mode = 'right', inplace = True, verbose = verbose) - # data_pack.apply_on_text(self.tokenizer.convert_tokens_to_ids, - # mode = 'right', inplace = True, verbose = verbose) - - # max_len_left = self._fixed_length_left - # max_len_right = self._fixed_length_right - # - # data_pack.left['length_left'] = \ - # data_pack.left['length_left'].apply( - # lambda val: min(val, max_len_left)) - # - # data_pack.right['length_right'] = \ - # data_pack.right['length_right'].apply( - # lambda val: min(val, max_len_right)) - return data_pack, converted_features - - - - def _convert_examples_to_features(self, examples: pd.DataFrame, label_list, max_seq_length, - tokenizer, output_mode, - cls_token_at_end = False, - cls_token = '[CLS]', - cls_token_segment_id = 1, - sep_token = '[SEP]', - sep_token_extra = False, - pad_on_left = False, - pad_token = 0, - pad_token_segment_id = 0, - sequence_a_segment_id = 0, - sequence_b_segment_id = 1, - mask_padding_with_zero = True): - """ Loads a data file into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - """ - - label_map = {label: i for i, label in enumerate(label_list)} - # from tqdm import tqdm - features = [] - ex_index = -1 - FileHandler.myprint("Processing text_left and text_right to make it a full sequence for BERT........") - assert type(examples) == pd.DataFrame - for q_id, text_a, doc_id, text_b, label in zip(examples["id_left"], examples["text_left"], - examples["id_right"], examples["text_right"], examples["label"]): - ex_index += 1 - if ex_index % 10000 == 0: FileHandler.myprint("Processed xample %d of %d" % (ex_index, len(examples))) - tokens_a = text_a.split() - tokens_a = tokens_a[:self.fixed_length_left] - tokens_b = None - assert len(text_b) != 0, "Length of documents must be not zero!" - if text_b: - tokens_b = text_b.split() - tokens_b = tokens_b[: self.fixed_length_right] - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa. - special_tokens_count = 4 if sep_token_extra else 3 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. - special_tokens_count = 3 if sep_token_extra else 2 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . 
[SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = tokens_a + [sep_token] - if sep_token_extra: - # roberta uses an extra separator b/w pairs of sentences - tokens += [sep_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - if tokens_b: - tokens += tokens_b + [sep_token] - segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) - - if cls_token_at_end: - tokens = tokens + [cls_token] - segment_ids = segment_ids + [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. - padding_length = max_seq_length - len(input_ids) - if pad_on_left: - input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - if output_mode == "classification": - label_id = label_map[label] - elif output_mode == "regression": - label_id = float(label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - FileHandler.myprint("*** Example ***") - # FileHandler.myprint("guid: %s" % (example.guid)) - FileHandler.myprint("tokens: %s" % " ".join( - [str(x) for x in tokens])) - FileHandler.myprint("input_ids: %s" % " ".join([str(x) for x in input_ids])) - FileHandler.myprint("input_mask: %s" % " ".join([str(x) for x in input_mask])) - FileHandler.myprint("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - FileHandler.myprint("label: %s (id = %d)" % (label, label_id)) - - features.append( - InputFeatures(left_id = q_id, right_id = doc_id, - text_left = text_a, text_right = text_b, - input_ids = input_ids, - input_mask = input_mask, - segment_ids = segment_ids, - label_id = label_id)) - return features - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, left_id: int, right_id: int, - text_left: str, text_right: str, - input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - self.left_id = left_id - self.right_id = right_id - self.text_left = text_left - self.text_right = text_right \ No newline at end of file diff --git a/matchzoo/preprocessors/naive_preprocessor.py 
b/matchzoo/preprocessors/naive_preprocessor.py deleted file mode 100644 index 139da4e..0000000 --- a/matchzoo/preprocessors/naive_preprocessor.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Naive Preprocessor.""" - -from tqdm import tqdm - -from matchzoo.engine.base_preprocessor import BasePreprocessor -from matchzoo import DataPack -from .chain_transform import chain_transform -from .build_vocab_unit import build_vocab_unit -from . import units - -tqdm.pandas() - - -class NaivePreprocessor(BasePreprocessor): - """ - Naive preprocessor. - - Example: - >>> import matchzoo as mz - >>> train_data = mz.datasets.toy.load_data() - >>> test_data = mz.datasets.toy.load_data(stage='test') - >>> preprocessor = mz.preprocessors.NaivePreprocessor() - >>> train_data_processed = preprocessor.fit_transform(train_data, - ... verbose=0) - >>> type(train_data_processed) - - >>> test_data_transformed = preprocessor.transform(test_data, - ... verbose=0) - >>> type(test_data_transformed) - - - """ - - def fit(self, data_pack: DataPack, verbose: int = 1): - """ - Fit pre-processing context for transformation. - - :param data_pack: data_pack to be preprocessed. - :param verbose: Verbosity. - :return: class:`NaivePreprocessor` instance. - """ - func = chain_transform(self._default_units()) - data_pack = data_pack.apply_on_text(func, verbose=verbose) - vocab_unit = build_vocab_unit(data_pack, verbose=verbose) - self._context['vocab_unit'] = vocab_unit - return self - - def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: - """ - Apply transformation on data, create `tri-letter` representation. - - :param data_pack: Inputs to be preprocessed. - :param verbose: Verbosity. - - :return: Transformed data as :class:`DataPack` object. - """ - units_ = self._default_units() - units_.append(self._context['vocab_unit']) - units_.append(units.FixedLength(text_length=30, pad_mode='post')) - func = chain_transform(units_) - return data_pack.apply_on_text(func, verbose=verbose) diff --git a/matchzoo/preprocessors/tfidf_preprocessor.py b/matchzoo/preprocessors/tfidf_preprocessor.py deleted file mode 100644 index 896aa77..0000000 --- a/matchzoo/preprocessors/tfidf_preprocessor.py +++ /dev/null @@ -1,41 +0,0 @@ -import matchzoo -from typing import List, Dict, Tuple -import torch_utils -import collections -import numpy as np - -class TFIDF: - - idf_dict = {} - idf_char_ngram = {} - - @classmethod - def init(cls, corpus: List[List[str]], char_ngram_copus: List[List[str]] = []): - """docid, value: list of words """ - stats = cls._idf(corpus) - cls.idf_dict = stats - if char_ngram_copus: - cls.idf_char_ngram = cls._idf(char_ngram_copus) - - @classmethod - def get_term_idf(cls): - return cls.idf_dict - - @classmethod - def get_char_ngram_idf(cls): - return cls.idf_char_ngram - - @classmethod - def _df(cls, list_of_tokens: list) -> dict: - stats = collections.Counter() - for tokens in list_of_tokens: - stats.update(set(tokens)) - return stats - - @classmethod - def _idf(cls, list_of_tokens: list) -> dict: - num_docs = len(list_of_tokens) - stats = cls._df(list_of_tokens) - for key, val in stats.most_common(): - stats[key] = np.log((1.0 + num_docs) / (1.0 + val)) + 1.0 - return stats \ No newline at end of file diff --git a/setting_keywords.py b/setting_keywords.py new file mode 100644 index 0000000..a8cd122 --- /dev/null +++ b/setting_keywords.py @@ -0,0 +1,78 @@ + + +class KeyWordSettings(object): + + Doc_cID = "doc_cid" + Doc_URL = "doc_ciurl" + Doc_cLabel = "doc_clabel" + Doc_wImages = "doc_wimages" + Doc_wContent = 
"doc_wcontent" + Relevant_Score = "relevant_score" + + Query_id = "qid" + Query_TweetID = "qtweetid" + Query_Images = "query_images" + Ranked_Docs = "ranked_docs" + Query_Content = "query_content" + + Query_lens = "query_lens" + Doc_lens = "docs_lens" + + # for lstm keywords + QueryLensIndices = "query_lens_indices" + DocLensIndices = "doc_lens_indices" + + QueryIDs = "query_ids" + DocIDs = "doc_ids" + UseVisual = "use_visual" + + OutputRankingKey = "output_ranking" + + QueryCountVal = [1116, 1000, 187, 1159] + QueryCountTest = [1001, 1164, 1118, 187, 156, 1160, 1500] + + UseCuda = "use_cuda" + QuerySources = "query_sources" + DocSources = "doc_sources" + TempLabel = "fc_labels" + DocContentNoPaddingEvidence = "doc_content_without_padding_evidences" # to avoid empty sequences to lstm + QueryContentNoPaddingEvidence = "query_content_without_padding_evidences" + ClaimEmbeddings = "claim_embeddings" + EvidenceEmbeddings = "evidences_embeddings" + + EvidenceCountPerQuery = "evd_cnt_each_query" + FIXED_NUM_EVIDENCES = "fixed_num_evidences" + + LOSS_FUNCTIONS = ("cross_entropy") + + ClaimCountVal = [433, 356] + ClaimCountTest = [782, 781, 391, 390, 644, 642, 323, 321] + + AUC_metric = "auc" + F1_macro = "f1_macro" + F1_micro = "f1_micro" + F1 = "f1" + PrecisionTrueCls = "precision_true_cls" + RecallTrueCls = "recall_true_cls" + F1TrueCls = "f1_true_cls" + + PrecisionFalseCls = "precision_false_cls" + RecallFalseCls = "recall_false_cls" + F1FalseCls = "f1_false_cls" + + # for fact-checking error analysis + class FCClass: + DocAttentionScore = "doc_attention_score" + WordAttentionScore = "word_attention_score" + ClaimLabel = "claim_label" + PredictedProb = "predicted_prob" + AttentionWeightsInfo = "attention_weights_info" + CharSourceKey = "char_source" + QueryCharSource = "query_char_source" # characters of claims' source (i.e. chars of speakers' names) + DocCharSource = "doc_char_source" + + CLS_METRICS = [AUC_metric, F1_macro, F1_micro, F1, + PrecisionTrueCls, RecallTrueCls, F1TrueCls, + PrecisionFalseCls, RecallFalseCls, F1FalseCls] + + OutputHandlerFactChecking = "output_handler_fact_checking"