diff --git a/matchzoo/__init__.py b/matchzoo/__init__.py new file mode 100644 index 0000000..1e2479e --- /dev/null +++ b/matchzoo/__init__.py @@ -0,0 +1,60 @@ +import os + +# USER_DIR = Path.expanduser(Path('~')).joinpath('.matchzoo') +USER_DIR = os.path.expanduser("~") +USER_DIR = os.path.join(USER_DIR, ".matchzoo") +if not os.path.exists(USER_DIR): + os.mkdir(USER_DIR) + # USER_DIR.mkdir() +USER_DATA_DIR = os.path.join(USER_DIR, 'datasets') +if not os.path.exists(USER_DATA_DIR): + os.mkdir(USER_DATA_DIR) + # USER_DATA_DIR.mkdir() +USER_TUNED_MODELS_DIR = os.path.join(USER_DIR, 'tuned_models') + +from .version import __version__ + +from .data_pack import DataPack +from .data_pack import pack +from .data_pack import load_data_pack + +# from . import metrics +from . import tasks + +from . import preprocessors +# from . import data_generator +# from .data_generator import DataGenerator +# from .data_generator import DataGeneratorBuilder + +from .preprocessors.chain_transform import chain_transform +from .datasets import embeddings +# from . import metrics +# from . import losses +from . import engine +# from . import models +# from . import embedding +# from . import datasets +# from . import layers +# from . import auto +# from . import contrib + +# from .engine import hyper_spaces +# from .engine.base_model import load_model +# from .engine.base_preprocessor import load_preprocessor +# from .engine import callbacks +# from .engine.param import Param +# from .engine.param_table import ParamTable + +# from .embedding.embedding import Embedding + +from .utils import one_hot +from .preprocessors.build_unit_from_data_pack import build_unit_from_data_pack +from .preprocessors.build_vocab_unit import build_vocab_unit + +# deprecated, should be removed in v2.2 +# from .contrib.legacy_data_generator import DPoolDataGenerator +# from .contrib.legacy_data_generator import DPoolPairDataGenerator +# from .contrib.legacy_data_generator import HistogramDataGenerator +# from .contrib.legacy_data_generator import HistogramPairDataGenerator +# from .contrib.legacy_data_generator import DynamicDataGenerator +# from .contrib.legacy_data_generator import PairDataGenerator diff --git a/matchzoo/data_generator/__init__.py b/matchzoo/data_generator/__init__.py new file mode 100644 index 0000000..3feb6a0 --- /dev/null +++ b/matchzoo/data_generator/__init__.py @@ -0,0 +1,3 @@ +from . import callbacks +from .data_generator import DataGenerator +from .data_generator_builder import DataGeneratorBuilder diff --git a/matchzoo/data_generator/callbacks/__init__.py b/matchzoo/data_generator/callbacks/__init__.py new file mode 100644 index 0000000..93e6066 --- /dev/null +++ b/matchzoo/data_generator/callbacks/__init__.py @@ -0,0 +1,4 @@ +from .callback import Callback +from .lambda_callback import LambdaCallback +from .dynamic_pooling import DynamicPooling +from .histogram import Histogram diff --git a/matchzoo/data_generator/callbacks/callback.py b/matchzoo/data_generator/callbacks/callback.py new file mode 100644 index 0000000..af02369 --- /dev/null +++ b/matchzoo/data_generator/callbacks/callback.py @@ -0,0 +1,36 @@ +import numpy as np + +import matchzoo as mz + + +class Callback(object): + """ + DataGenerator callback base class. + + To build your own callbacks, inherit `mz.data_generator.callbacks.Callback` + and overrides corresponding methods. 
+ + A batch is processed in the following way: + + - slice data pack based on batch index + - handle `on_batch_data_pack` callbacks + - unpack data pack into x, y + - handle `on_batch_x_y` callbacks + - return x, y + + """ + + def on_batch_data_pack(self, data_pack: mz.DataPack): + """ + `on_batch_data_pack`. + + :param data_pack: a sliced DataPack before unpacking. + """ + + def on_batch_unpacked(self, x: dict, y: np.ndarray): + """ + `on_batch_unpacked`. + + :param x: unpacked x. + :param y: unpacked y. + """ diff --git a/matchzoo/data_generator/callbacks/dynamic_pooling.py b/matchzoo/data_generator/callbacks/dynamic_pooling.py new file mode 100644 index 0000000..4a1a1f4 --- /dev/null +++ b/matchzoo/data_generator/callbacks/dynamic_pooling.py @@ -0,0 +1,92 @@ +import numpy as np + +from matchzoo.data_generator.callbacks import Callback + + +class DynamicPooling(Callback): + """:class:`DPoolPairDataGenerator` constructor. + + :param fixed_length_left: max length of left text. + :param fixed_length_right: max length of right text. + :param compress_ratio_left: the length change ratio, + especially after normal pooling layers. + :param compress_ratio_right: the length change ratio, + especially after normal pooling layers. + """ + + def __init__( + self, + fixed_length_left: int, + fixed_length_right: int, + compress_ratio_left: float = 1, + compress_ratio_right: float = 1, + ): + """Init.""" + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._compress_ratio_left = compress_ratio_left + self._compress_ratio_right = compress_ratio_right + + def on_batch_unpacked(self, x, y): + """ + Insert `dpool_index` into `x`. + + :param x: unpacked x. + :param y: unpacked y. + """ + x['dpool_index'] = _dynamic_pooling_index( + x['length_left'], + x['length_right'], + self._fixed_length_left, + self._fixed_length_right, + self._compress_ratio_left, + self._compress_ratio_right + ) + + +def _dynamic_pooling_index(length_left: np.array, + length_right: np.array, + fixed_length_left: int, + fixed_length_right: int, + compress_ratio_left: float, + compress_ratio_right: float) -> np.array: + def _dpool_index(one_length_left: int, + one_length_right: int, + fixed_length_left: int, + fixed_length_right: int): + if one_length_left == 0: + stride_left = fixed_length_left + else: + stride_left = 1.0 * fixed_length_left / one_length_left + + if one_length_right == 0: + stride_right = fixed_length_right + else: + stride_right = 1.0 * fixed_length_right / one_length_right + + one_idx_left = [int(i / stride_left) + for i in range(fixed_length_left)] + one_idx_right = [int(i / stride_right) + for i in range(fixed_length_right)] + mesh1, mesh2 = np.meshgrid(one_idx_left, one_idx_right) + index_one = np.transpose( + np.stack([mesh1, mesh2]), (2, 1, 0)) + return index_one + + index = [] + dpool_bias_left = dpool_bias_right = 0 + if fixed_length_left % compress_ratio_left != 0: + dpool_bias_left = 1 + if fixed_length_right % compress_ratio_right != 0: + dpool_bias_right = 1 + cur_fixed_length_left = int( + fixed_length_left // compress_ratio_left) + dpool_bias_left + cur_fixed_length_right = int( + fixed_length_right // compress_ratio_right) + dpool_bias_right + for i in range(len(length_left)): + index.append(_dpool_index( + length_left[i] // compress_ratio_left, + length_right[i] // compress_ratio_right, + cur_fixed_length_left, + cur_fixed_length_right)) + return np.array(index) diff --git a/matchzoo/data_generator/callbacks/histogram.py 
b/matchzoo/data_generator/callbacks/histogram.py new file mode 100644 index 0000000..c5d56d3 --- /dev/null +++ b/matchzoo/data_generator/callbacks/histogram.py @@ -0,0 +1,65 @@ +import numpy as np + +import matchzoo as mz +from matchzoo.data_generator.callbacks import Callback + + +class Histogram(Callback): + """ + Generate data with matching histogram. + + :param embedding_matrix: The embedding matrix used to generator match + histogram. + :param bin_size: The number of bin size of the histogram. + :param hist_mode: The mode of the :class:`MatchingHistogramUnit`, one of + `CH`, `NH`, and `LCH`. + """ + + def __init__( + self, + embedding_matrix: np.ndarray, + bin_size: int = 30, + hist_mode: str = 'CH', + ): + """Init.""" + self._match_hist_unit = mz.preprocessors.units.MatchingHistogram( + bin_size=bin_size, + embedding_matrix=embedding_matrix, + normalize=True, + mode=hist_mode + ) + + def on_batch_unpacked(self, x, y): + """Insert `match_histogram` to `x`.""" + x['match_histogram'] = _build_match_histogram(x, self._match_hist_unit) + + +def _trunc_text(input_text: list, length: list) -> list: + """ + Truncating the input text according to the input length. + + :param input_text: The input text need to be truncated. + :param length: The length used to truncated the text. + :return: The truncated text. + """ + return [row[:length[idx]] for idx, row in enumerate(input_text)] + + +def _build_match_histogram( + x: dict, + match_hist_unit: mz.preprocessors.units.MatchingHistogram +) -> np.ndarray: + """ + Generate the matching hisogram for input. + + :param x: The input `dict`. + :param match_hist_unit: The histogram unit :class:`MatchingHistogramUnit`. + :return: The matching histogram. + """ + match_hist = [] + text_left = x['text_left'].tolist() + text_right = _trunc_text(x['text_right'].tolist(), + x['length_right'].tolist()) + for pair in zip(text_left, text_right): + match_hist.append(match_hist_unit.transform(list(pair))) + return np.asarray(match_hist) diff --git a/matchzoo/data_generator/callbacks/lambda_callback.py b/matchzoo/data_generator/callbacks/lambda_callback.py new file mode 100644 index 0000000..2779715 --- /dev/null +++ b/matchzoo/data_generator/callbacks/lambda_callback.py @@ -0,0 +1,31 @@ +from matchzoo.data_generator.callbacks.callback import Callback + + +class LambdaCallback(Callback): + """ + LambdaCallback. Just a shorthand for creating a callback class. + + See :class:`matchzoo.data_generator.callbacks.Callback` for more details. 
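# Illustrative sketch (not part of this diff): the callbacks above hook into
# DataGenerator's batch flow (slice -> on_batch_data_pack -> unpack ->
# on_batch_unpacked -> return x, y), so a batch-wise feature can be added by
# mutating `x` inside `on_batch_unpacked`. The `length_left`/`length_right`
# keys are an assumption; they exist only if `DataPack.append_text_length`
# was applied during preprocessing.
import numpy as np

from matchzoo.data_generator.callbacks import Callback, LambdaCallback


class LengthDiff(Callback):
    """Hypothetical callback adding the left/right length difference."""

    def on_batch_unpacked(self, x: dict, y: np.ndarray):
        # Mutating `x` in place is enough; DataGenerator returns this dict.
        x['length_diff'] = x['length_left'] - x['length_right']


# The same behaviour without defining a class, via LambdaCallback:
length_diff_callback = LambdaCallback(
    on_batch_unpacked=lambda x, y: x.update(
        length_diff=x['length_left'] - x['length_right']))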
+
+    Example:
+        >>> from matchzoo.data_generator.callbacks import LambdaCallback
+        >>> callback = LambdaCallback(on_batch_unpacked=print)
+        >>> callback.on_batch_unpacked('x', 'y')
+        x y
+
+    """
+
+    def __init__(self, on_batch_data_pack=None, on_batch_unpacked=None):
+        """Init."""
+        self._on_batch_unpacked = on_batch_unpacked
+        self._on_batch_data_pack = on_batch_data_pack
+
+    def on_batch_data_pack(self, data_pack):
+        """`on_batch_data_pack`."""
+        if self._on_batch_data_pack:
+            self._on_batch_data_pack(data_pack)
+
+    def on_batch_unpacked(self, x, y):
+        """`on_batch_unpacked`."""
+        if self._on_batch_unpacked:
+            self._on_batch_unpacked(x, y)
diff --git a/matchzoo/data_generator/data_generator.py b/matchzoo/data_generator/data_generator.py
new file mode 100644
index 0000000..bd4984e
--- /dev/null
+++ b/matchzoo/data_generator/data_generator.py
@@ -0,0 +1,292 @@
+"""Base generator."""
+
+import math
+import typing
+
+import keras
+import numpy as np
+import pandas as pd
+
+import matchzoo as mz
+from matchzoo.data_generator.callbacks import Callback
+
+
+class DataGenerator(keras.utils.Sequence):
+    """
+    Data Generator.
+
+    Used to divide a :class:`matchzoo.DataPack` into batches. This is helpful
+    for generating batch-wise features and delaying data preprocessing to the
+    `fit` time.
+
+    See `tutorials/data_handling.ipynb` for a walkthrough.
+
+    :param data_pack: DataPack to generate data from.
+    :param mode: One of "point", "pair", and "list". (default: "point")
+    :param num_dup: Number of duplications per instance, only effective when
+        `mode` is "pair". (default: 1)
+    :param num_neg: Number of negative samples per instance, only effective
+        when `mode` is "pair". (default: 1)
+    :param resample: Whether to resample for each epoch, only effective when
+        `mode` is "pair". (default: `True`)
+    :param batch_size: Batch size. (default: 128)
+    :param shuffle: Whether to shuffle the samples/instances. (default: `True`)
+    :param callbacks: Callbacks. See `matchzoo.data_generator.callbacks` for
+        more details.
+
+    Examples::
+        >>> import numpy as np
+        >>> import matchzoo as mz
+        >>> np.random.seed(0)
+        >>> data_pack = mz.datasets.toy.load_data()
+        >>> batch_size = 8
+
+        To generate data points:
+        >>> point_gen = mz.DataGenerator(
+        ...     data_pack=data_pack,
+        ...     batch_size=batch_size
+        ... )
+        >>> len(point_gen)
+        13
+        >>> x, y = point_gen[0]
+        >>> for key, value in sorted(x.items()):
+        ...     print(key, str(value)[:30])
+        id_left ['Q6' 'Q17' 'Q1' 'Q13' 'Q16' '
+        id_right ['D6-6' 'D17-1' 'D1-2' 'D13-3'
+        text_left ['how long is the term for fed
+        text_right ['See Article I and Article II
+
+        To generate data pairs:
+        >>> pair_gen = mz.DataGenerator(
+        ...     data_pack=data_pack,
+        ...     mode='pair',
+        ...     num_dup=4,
+        ...     num_neg=4,
+        ...     batch_size=batch_size,
+        ...     shuffle=False
+        ... )
+        >>> len(pair_gen)
+        3
+        >>> x, y = pair_gen[0]
+        >>> for key, value in sorted(x.items()):
+        ...     print(key, str(value)[:30])
+        id_left ['Q1' 'Q1' 'Q1' 'Q1' 'Q1' 'Q1'
+        id_right ['D1-3' 'D1-4' 'D1-0' 'D1-1' '
+        text_left ['how are glacier caves formed
+        text_right ['A glacier cave is a cave for
+
+        To generate data lists:
+        # TODO:
+
+    """
+
+    def __init__(
+        self,
+        data_pack: mz.DataPack,
+        mode='point',
+        num_dup: int = 1,
+        num_neg: int = 1,
+        resample: bool = True,
+        batch_size: int = 128,
+        shuffle: bool = True,
+        callbacks: typing.List[Callback] = None
+    ):
+        """Init."""
+        if callbacks is None:
+            callbacks = []
+
+        if mode not in ('point', 'pair', 'list'):
+            raise ValueError(f"{mode} is not a valid mode type. "
+ f"Must be one of `point`, `pair` or `list`.") + + self._mode = mode + self._num_dup = num_dup + self._num_neg = num_neg + self._batch_size = batch_size + self._shuffle = shuffle + self._resample = resample + self._orig_relation = data_pack.relation + self._callbacks = callbacks + + if mode == 'pair': + data_pack.relation = self._reorganize_pair_wise( + data_pack.relation, + num_dup=num_dup, + num_neg=num_neg + ) + + self._data_pack = data_pack + self._batch_indices = None + + self.reset_index() + + def __getitem__(self, item: int) -> typing.Tuple[dict, np.ndarray]: + """Get a batch from index idx. + + :param item: the index of the batch. + """ + if isinstance(item, slice): + indices = sum(self._batch_indices[item], []) + else: + indices = self._batch_indices[item] + batch_data_pack = self._data_pack[indices] + self._handle_callbacks_on_batch_data_pack(batch_data_pack) + x, y = batch_data_pack.unpack() + self._handle_callbacks_on_batch_unpacked(x, y) + return x, y + + def __len__(self) -> int: + """Get the total number of batches.""" + return len(self._batch_indices) + + def on_epoch_end(self): + """Reorganize the index array while epoch is ended.""" + if self._mode == 'pair' and self._resample: + self._data_pack.relation = self._reorganize_pair_wise( + relation=self._orig_relation, + num_dup=self._num_dup, + num_neg=self._num_neg + ) + self.reset_index() + + def reset_index(self): + """ + Set the :attr:`index_array`. + + Here the :attr:`index_array` records the index of all the instances. + """ + # index pool: index -> instance index + if self._mode == 'point': + num_instances = len(self._data_pack) + index_pool = list(range(num_instances)) + elif self._mode == 'pair': + index_pool = [] + step_size = self._num_neg + 1 + num_instances = int(len(self._data_pack) / step_size) + for i in range(num_instances): + lower = i * step_size + upper = (i + 1) * step_size + indices = list(range(lower, upper)) + if indices: + index_pool.append(indices) + elif self._mode == 'list': + raise NotImplementedError( + f'{self._mode} data generator not implemented.') + else: + raise ValueError(f"{self._mode} is not a valid mode type" + f"Must be one of `point`, `pair` or `list`.") + + if self._shuffle: + np.random.shuffle(index_pool) + + # batch_indices: index -> batch of indices + self._batch_indices = [] + for i in range(math.ceil(num_instances / self._batch_size)): + lower = self._batch_size * i + upper = self._batch_size * (i + 1) + candidates = index_pool[lower:upper] + if self._mode == 'pair': + candidates = sum(candidates, []) + if candidates: + self._batch_indices.append(candidates) + + def _handle_callbacks_on_batch_data_pack(self, batch_data_pack): + for callback in self._callbacks: + callback.on_batch_data_pack(batch_data_pack) + + def _handle_callbacks_on_batch_unpacked(self, x, y): + for callback in self._callbacks: + callback.on_batch_unpacked(x, y) + + @property + def callbacks(self): + """`callbacks` getter.""" + return self._callbacks + + @callbacks.setter + def callbacks(self, value): + """`callbacks` setter.""" + self._callbacks = value + + @property + def num_neg(self): + """`num_neg` getter.""" + return self._num_neg + + @num_neg.setter + def num_neg(self, value): + """`num_neg` setter.""" + self._num_neg = value + self.reset_index() + + @property + def num_dup(self): + """`num_dup` getter.""" + return self._num_dup + + @num_dup.setter + def num_dup(self, value): + """`num_dup` setter.""" + self._num_dup = value + self.reset_index() + + @property + def mode(self): + """`mode` getter.""" + 
        return self._mode
+
+    @mode.setter
+    def mode(self, value):
+        """`mode` setter."""
+        self._mode = value
+        self.reset_index()
+
+    @property
+    def batch_size(self):
+        """`batch_size` getter."""
+        return self._batch_size
+
+    @batch_size.setter
+    def batch_size(self, value):
+        """`batch_size` setter."""
+        self._batch_size = value
+        self.reset_index()
+
+    @property
+    def shuffle(self):
+        """`shuffle` getter."""
+        return self._shuffle
+
+    @shuffle.setter
+    def shuffle(self, value):
+        """`shuffle` setter."""
+        self._shuffle = value
+        self.reset_index()
+
+    @property
+    def batch_indices(self):
+        """`batch_indices` getter."""
+        return self._batch_indices
+
+    @classmethod
+    def _reorganize_pair_wise(
+        cls,
+        relation: pd.DataFrame,
+        num_dup: int = 1,
+        num_neg: int = 1
+    ):
+        """Re-organize the data pack as pair-wise format."""
+        pairs = []
+        groups = relation.sort_values(
+            'label', ascending=False).groupby('id_left')
+        for idx, group in groups:
+            labels = group.label.unique()
+            for label in labels[:-1]:
+                pos_samples = group[group.label == label]
+                pos_samples = pd.concat([pos_samples] * num_dup)
+                neg_samples = group[group.label < label]
+                for _, pos_sample in pos_samples.iterrows():
+                    pos_sample = pd.DataFrame([pos_sample])
+                    neg_sample = neg_samples.sample(num_neg, replace=True)
+                    pairs.extend((pos_sample, neg_sample))
+        new_relation = pd.concat(pairs, ignore_index=True)
+        return new_relation
diff --git a/matchzoo/data_generator/data_generator_builder.py b/matchzoo/data_generator/data_generator_builder.py
new file mode 100644
index 0000000..d13e320
--- /dev/null
+++ b/matchzoo/data_generator/data_generator_builder.py
@@ -0,0 +1,36 @@
+import matchzoo as mz
+from matchzoo.data_generator.data_generator import DataGenerator
+
+
+class DataGeneratorBuilder(object):
+    """
+    Data Generator Builder. In essence, a wrapped partial function.
+
+    Example:
+        >>> import matchzoo as mz
+        >>> builder = mz.DataGeneratorBuilder(mode='pair', batch_size=32)
+        >>> data = mz.datasets.toy.load_data()
+        >>> gen = builder.build(data)
+        >>> type(gen)
+        <class 'matchzoo.data_generator.data_generator.DataGenerator'>
+        >>> gen.batch_size
+        32
+        >>> gen_64 = builder.build(data, batch_size=64)
+        >>> gen_64.batch_size
+        64
+
+    """
+
+    def __init__(self, **kwargs):
+        """Init."""
+        self._kwargs = kwargs
+
+    def build(self, data_pack, **kwargs) -> DataGenerator:
+        """
+        Build a DataGenerator.
+
+        :param data_pack: DataPack to build upon.
+        :param kwargs: Additional keyword arguments to override the keyword
+            arguments passed in `__init__`.
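# Illustrative sketch (not in this module): the builder simply merges the
# keyword arguments stored at construction time with those given to `build`;
# in a dict merge the later (call-time) keys win, which is how `batch_size=64`
# overrides the stored `batch_size=32` in the doctest above.
stored_kwargs = {'mode': 'pair', 'batch_size': 32}
call_kwargs = {'batch_size': 64}
merged = {**stored_kwargs, **call_kwargs}
assert merged == {'mode': 'pair', 'batch_size': 64}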
+ """ + return mz.DataGenerator(data_pack, **{**self._kwargs, **kwargs}) diff --git a/matchzoo/data_pack/__init__.py b/matchzoo/data_pack/__init__.py new file mode 100644 index 0000000..c685f73 --- /dev/null +++ b/matchzoo/data_pack/__init__.py @@ -0,0 +1,2 @@ +from .data_pack import DataPack, load_data_pack +from .pack import pack diff --git a/matchzoo/data_pack/data_pack.py b/matchzoo/data_pack/data_pack.py new file mode 100644 index 0000000..e5a8bad --- /dev/null +++ b/matchzoo/data_pack/data_pack.py @@ -0,0 +1,497 @@ +"""Matchzoo DataPack, pair-wise tuple (feature) and context as input.""" + +import typing +import inspect +from pathlib import Path +import functools + +import dill +from tqdm import tqdm +import numpy as np +import pandas as pd + +import matchzoo + +tqdm.pandas() + + +def _convert_to_list_index( + index: typing.Union[int, slice, np.array], + length: int +): + if isinstance(index, int): + index = [index] + elif isinstance(index, slice): + index = list(range(*index.indices(length))) + return index + + +class DataPack(object): + """ + Matchzoo :class:`DataPack` data structure, store dataframe and context. + + `DataPack` is a MatchZoo native data structure that most MatchZoo data + handling processes build upon. A `DataPack` consists of three parts: + `left`, `right` and `relation`, each one of is a `pandas.DataFrame`. + + :param relation: Store the relation between left document + and right document use ids. + :param left: Store the content or features for id_left. + :param right: Store the content or features for + id_right. + + Example: + >>> left = [ + ... ['qid1', 'query 1'], + ... ['qid2', 'query 2'] + ... ] + >>> right = [ + ... ['did1', 'document 1'], + ... ['did2', 'document 2'] + ... ] + >>> relation = [['qid1', 'did1', 1], ['qid2', 'did2', 1]] + >>> relation_df = pd.DataFrame(relation) + >>> left = pd.DataFrame(left) + >>> right = pd.DataFrame(right) + >>> dp = DataPack( + ... relation=relation_df, + ... left=left, + ... right=right, + ... ) + >>> len(dp) + 2 + """ + + DATA_FILENAME = 'data.dill' + + def __init__( + self, + relation: pd.DataFrame, + left: pd.DataFrame, + right: pd.DataFrame + ): + """:class:`DataPack` initializer.""" + self._relation = relation + self._left = left + self._right = right + + @property + def has_label(self) -> bool: + """:return: `True` if `label` column exists, `False` other wise.""" + return 'label' in self._relation.columns + + def __len__(self) -> int: + """Get numer of rows in the class:`DataPack` object.""" + return self._relation.shape[0] + + @property + def frame(self) -> 'DataPack.FrameView': + """ + View the data pack as a :class:`pandas.DataFrame`. + + Returned data frame is created by merging the left data frame, + the right dataframe and the relation data frame. Use `[]` to access + an item or a slice of items. + + :return: A :class:`matchzoo.DataPack.FrameView` instance. + + Example: + >>> import matchzoo as mz + >>> data_pack = mz.datasets.toy.load_data() + >>> type(data_pack.frame) + + >>> frame_slice = data_pack.frame[0:5] + >>> type(frame_slice) + + >>> list(frame_slice.columns) + ['id_left', 'text_left', 'id_right', 'text_right', 'label'] + >>> full_frame = data_pack.frame() + >>> len(full_frame) == len(data_pack) + True + + """ + return DataPack.FrameView(self) + + def unpack(self) -> typing.Tuple[typing.Dict[str, np.array], + typing.Optional[np.array]]: + """ + Unpack the data for training. + + The return value can be directly feed to `model.fit` or + `model.fit_generator`. + + :return: A tuple of (X, y). 
            `y` is `None` if `self` has no label.
+
+        Example:
+            >>> import matchzoo as mz
+            >>> data_pack = mz.datasets.toy.load_data()
+            >>> X, y = data_pack.unpack()
+            >>> type(X)
+            <class 'dict'>
+            >>> sorted(X.keys())
+            ['id_left', 'id_right', 'text_left', 'text_right']
+            >>> type(y)
+            <class 'numpy.ndarray'>
+            >>> X, y = data_pack.drop_label().unpack()
+            >>> type(y)
+            <class 'NoneType'>
+
+        """
+        frame = self.frame()
+
+        columns = list(frame.columns)
+        if self.has_label:
+            columns.remove('label')
+            y = np.vstack(np.asarray(frame['label']))
+        else:
+            y = None
+
+        x = frame[columns].to_dict(orient='list')
+        for key, val in x.items():
+            x[key] = np.array(val)
+
+        return x, y
+
+    def __getitem__(self, index: typing.Union[int, slice, np.array]
+                    ) -> 'DataPack':
+        """
+        Get specific item(s) as a new :class:`DataPack`.
+
+        The returned :class:`DataPack` will be a copy of the subset of the
+        original :class:`DataPack`.
+
+        :param index: Index of the item(s) to get.
+        :return: An instance of :class:`DataPack`.
+        """
+        index = _convert_to_list_index(index, len(self))
+        if len(index) == 0:
+            raise ValueError("Cannot slice a DataPack with an empty index.")
+        # print(index)
+        relation = self._relation.loc[index].reset_index(drop=True)
+        left = self._left.loc[relation['id_left'].unique()]
+        right = self._right.loc[relation['id_right'].unique()]
+        return DataPack(left=left.copy(),
+                        right=right.copy(),
+                        relation=relation.copy())
+
+    @property
+    def relation(self):
+        """`relation` getter."""
+        return self._relation
+
+    @relation.setter
+    def relation(self, value):
+        """`relation` setter."""
+        self._relation = value
+
+    @property
+    def left(self) -> pd.DataFrame:
+        """Get :meth:`left` of :class:`DataPack`."""
+        return self._left
+
+    @property
+    def right(self) -> pd.DataFrame:
+        """Get :meth:`right` of :class:`DataPack`."""
+        return self._right
+
+    def copy(self) -> 'DataPack':
+        """:return: A deep copy."""
+        return DataPack(left=self._left.copy(),
+                        right=self._right.copy(),
+                        relation=self._relation.copy())
+
+    def save(self, dirpath: typing.Union[str, Path]):
+        """
+        Save the :class:`DataPack` object.
+
+        A saved :class:`DataPack` is represented as a directory with a
+        :class:`DataPack` object (transformed user input as features and
+        context); it will be serialized by `dill`.
+
+        :param dirpath: directory path of the saved :class:`DataPack`.
+        """
+        dirpath = Path(dirpath)
+        data_file_path = dirpath.joinpath(self.DATA_FILENAME)
+
+        if data_file_path.exists():
+            raise FileExistsError(
+                '%s already exists, failed to save' % data_file_path)
+        elif not dirpath.exists():
+            dirpath.mkdir()
+
+        dill.dump(self, open(data_file_path, mode='wb'))
+
+    def _optional_inplace(func):
+        """
+        Decorator that adds `inplace` keyword argument to a method.
+
+        Decorate any method that modifies inplace to make that inplace change
+        optional.
+        """
+        doc = ":param inplace: `True` to modify inplace, `False` to return " \
+              "a modified copy.
(default: `False`)" + + def _clean(s): + return s.replace(' ', '').replace('\n', '') + + if _clean(doc) not in _clean(inspect.getdoc(func)): + raise NotImplementedError( + "`inplace` parameter of {func} not documented.\n" + "Please add the following line to its documentation:\n{doc}".format(func = func,doc = doc)) + + @functools.wraps(func) + def wrapper( + self, *args, inplace: bool = False, **kwargs + ) -> typing.Optional['DataPack']: + + if inplace: + target = self + else: + target = self.copy() + + func(target, *args, **kwargs) + + if not inplace: + return target + + return wrapper + + @_optional_inplace + def shuffle(self): + """ + Shuffle the data pack by shuffling the relation column. + + :param inplace: `True` to modify inplace, `False` to return a modified + copy. (default: `False`) + + Example: + >>> import matchzoo as mz + >>> import numpy.random + >>> numpy.random.seed(0) + >>> data_pack = mz.datasets.toy.load_data() + >>> orig_ids = data_pack.relation['id_left'] + >>> shuffled = data_pack.shuffle() + >>> (shuffled.relation['id_left'] != orig_ids).any() + True + + """ + self._relation = self._relation.sample(frac=1) + self._relation.reset_index(drop=True, inplace=True) + + @_optional_inplace + def drop_label(self): + """ + Remove `label` column from the data pack. + + :param inplace: `True` to modify inplace, `False` to return a modified + copy. (default: `False`) + + Example: + >>> import matchzoo as mz + >>> data_pack = mz.datasets.toy.load_data() + >>> data_pack.has_label + True + >>> data_pack.drop_label(inplace=True) + >>> data_pack.has_label + False + """ + self._relation = self._relation.drop(columns='label') + + @_optional_inplace + def append_text_length(self, verbose=1): + """ + Append `length_left` and `length_right` columns. + + :param inplace: `True` to modify inplace, `False` to return a modified + copy. (default: `False`) + :param verbose: Verbosity. + + Example: + >>> import matchzoo as mz + >>> data_pack = mz.datasets.toy.load_data() + >>> 'length_left' in data_pack.frame[0].columns + False + >>> new_data_pack = data_pack.append_text_length(verbose=0) + >>> 'length_left' in new_data_pack.frame[0].columns + True + >>> 'length_left' in data_pack.frame[0].columns + False + >>> data_pack.append_text_length(inplace=True, verbose=0) + >>> 'length_left' in data_pack.frame[0].columns + True + + """ + self.apply_on_text(len, rename=('length_left', 'length_right'), + inplace=True, verbose=verbose) + + @_optional_inplace + def apply_on_text( + self, func: typing.Callable, + mode: str = 'both', + rename: typing.Optional[str] = None, + verbose: int = 1 + ): + """ + Apply `func` to text columns based on `mode`. + + :param func: The function to apply. + :param mode: One of "both", "left" and "right". + :param rename: If set, use new names for results instead of replacing + the original columns. To set `rename` in "both" mode, use a tuple + of `str`, e.g. ("text_left_new_name", "text_right_new_name"). + :param inplace: `True` to modify inplace, `False` to return a modified + copy. (default: `False`) + :param verbose: Verbosity. + + Examples:: + >>> import matchzoo as mz + >>> data_pack = mz.datasets.toy.load_data() + >>> frame = data_pack.frame + + To apply `len` on the left text and add the result as 'length_left': + >>> data_pack.apply_on_text(len, mode='left', + ... rename='length_left', + ... inplace=True, + ... 
verbose=0) + >>> list(frame[0].columns) + ['id_left', 'text_left', 'length_left', 'id_right', 'text_right', \ +'label'] + + To do the same to the right text: + >>> data_pack.apply_on_text(len, mode='right', + ... rename='length_right', + ... inplace=True, + ... verbose=0) + >>> list(frame[0].columns) + ['id_left', 'text_left', 'length_left', 'id_right', 'text_right', \ +'length_right', 'label'] + + To do the same to the both texts at the same time: + >>> data_pack.apply_on_text(len, mode='both', + ... rename=('extra_left', 'extra_right'), + ... inplace=True, + ... verbose=0) + >>> list(frame[0].columns) + ['id_left', 'text_left', 'length_left', 'extra_left', 'id_right', \ +'text_right', 'length_right', 'extra_right', 'label'] + + To suppress outputs: + >>> data_pack.apply_on_text(len, mode='both', verbose=0, + ... inplace=True) + + """ + if mode == 'both': + self._apply_on_text_both(func, rename, verbose=verbose) + elif mode == 'left': + self._apply_on_text_left(func, rename, verbose=verbose) + elif mode == 'right': + self._apply_on_text_right(func, rename, verbose=verbose) + else: + raise ValueError("{mode} is not a valid mode type." + "Must be one of `left` `right` `both`.".format(mode = mode)) + + def _apply_on_text_right(self, func, rename, verbose=1): + name = rename or 'text_right' + if verbose: + tqdm.pandas(desc="Processing " + name + " with " + func.__name__) + self._right[name] = self._right['text_right'].progress_apply(func) + else: + self._right[name] = self._right['text_right'].apply(func) + + def _apply_on_text_left(self, func, rename, verbose=1): + name = rename or 'text_left' + if verbose: + tqdm.pandas(desc="Processing " + name + " with " + func.__name__) + self._left[name] = self._left['text_left'].progress_apply(func) + else: + self._left[name] = self._left['text_left'].apply(func) + + def _apply_on_text_both(self, func, rename, verbose=1): + left_name, right_name = rename or ('text_left', 'text_right') + self._apply_on_text_left(func, rename=left_name, verbose=verbose) + self._apply_on_text_right(func, rename=right_name, verbose=verbose) + + @_optional_inplace + def one_hot_encode_label(self, num_classes=2): + """ + One-hot encode `label` column of `relation`. + + :param num_classes: Number of classes. + :param inplace: `True` to modify inplace, `False` to return a modified + copy. (default: `False`) + :return: + """ + self._relation['label'] = self._relation['label'].apply( + lambda idx: matchzoo.one_hot(idx, num_classes)) + + class FrameView(object): + """FrameView.""" + + def __init__(self, data_pack: 'DataPack'): + """ + View a data pack as a frame. + + A slice of the view is genereated by merging three parts of the + data pack being viewed into a big table. + + :param data_pack: :class:`DataPack` to view. 
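# Illustrative sketch (not in this module): the join performed when slicing a
# frame view. Left and right rows are looked up through the relation table,
# then every relation column other than the ids is appended. The tiny frames
# below are made up; real column names follow the DataPack convention.
import pandas as pd

left = pd.DataFrame({'id_left': ['L-0'],
                     'text_left': ['query']}).set_index('id_left')
right = pd.DataFrame({'id_right': ['R-0'],
                      'text_right': ['doc']}).set_index('id_right')
relation = pd.DataFrame({'id_left': ['L-0'], 'id_right': ['R-0'], 'label': [1]})

joined = (left.loc[relation['id_left']].reset_index()
          .join(right.loc[relation['id_right']].reset_index()))
joined = joined.join(relation['label'].to_frame().reset_index(drop=True))
# joined columns: id_left, text_left, id_right, text_right, label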
+ + Examples:: + >>> import matchzoo as mz + >>> data_pack = mz.datasets.toy.load_data() + >>> frame = data_pack.frame + + Use `()` to get a full copy of the frame: + >>> list(frame().columns) + ['id_left', 'text_left', 'id_right', 'text_right', 'label'] + >>> len(frame()) == len(data_pack) + True + + Notice that a view is binded to the original data pack, so changing + contents of the data pack will affect a view previously created: + >>> data_pack.drop_label(inplace=True) + >>> list(frame().columns) + ['id_left', 'text_left', 'id_right', 'text_right'] + + To slice the view: + >>> frame_slice = frame[3:5] + >>> len(frame_slice) + 2 + + """ + self._data_pack = data_pack + + def __getitem__(self, index: typing.Union[int, slice, np.array] + ) -> pd.DataFrame: + """Slicer.""" + dp = self._data_pack + index = _convert_to_list_index(index, len(dp)) + left_df = dp.left.loc[dp.relation['id_left'][index]].reset_index() + right_df = dp.right.loc[ + dp.relation['id_right'][index]].reset_index() + joined_table = left_df.join(right_df) + for column in dp.relation.columns: + if column not in ['id_left', 'id_right']: + labels = dp.relation[column][index].to_frame() + labels = labels.reset_index(drop=True) + joined_table = joined_table.join(labels) + return joined_table + + def __call__(self): + """:return: A full copy. Equivalant to `frame[:]`.""" + return self[:] + + +def load_data_pack(dirpath: typing.Union[str, Path]) -> DataPack: + """ + Load a :class:`DataPack`. The reverse function of :meth:`save`. + + :param dirpath: directory path of the saved model. + :return: a :class:`DataPack` instance. + """ + dirpath = Path(dirpath) + + data_file_path = dirpath.joinpath(DataPack.DATA_FILENAME) + dp = dill.load(open(data_file_path, 'rb')) + + return dp diff --git a/matchzoo/data_pack/pack.py b/matchzoo/data_pack/pack.py new file mode 100644 index 0000000..24424f0 --- /dev/null +++ b/matchzoo/data_pack/pack.py @@ -0,0 +1,77 @@ +"""Convert list of input into class:`DataPack` expected format.""" + +import typing + +import pandas as pd +import numpy as np + +import matchzoo + + +def pack(df: pd.DataFrame) -> 'matchzoo.DataPack': + """ + Pack a :class:`DataPack` using `df`. + + The `df` must have `text_left` and `text_right` columns. Optionally, + the `df` can have `id_left`, `id_right` to index `text_left` and + `text_right` respectively. `id_left`, `id_right` will be automatically + generated if not specified. + + :param df: Input :class:`pandas.DataFrame` to use. + + Examples:: + >>> import matchzoo as mz + >>> import pandas as pd + >>> df = pd.DataFrame(data={'text_left': list('AABC'), + ... 'text_right': list('abbc'), + ... 
'label': [0, 1, 1, 0]}) + >>> mz.pack(df).frame() + id_left text_left id_right text_right label + 0 L-0 A R-0 a 0 + 1 L-0 A R-1 b 1 + 2 L-1 B R-1 b 1 + 3 L-2 C R-2 c 0 + + """ + if 'text_left' not in df or 'text_right' not in df: + raise ValueError( + 'Input data frame must have `text_left` and `text_right`.') + + # Gather IDs + if 'id_left' not in df: + id_left = _gen_ids(df, 'text_left', 'L-') + else: + id_left = df['id_left'] + if 'id_right' not in df: + id_right = _gen_ids(df, 'text_right', 'R-') + else: + id_right = df['id_right'] + + # Build Relation + relation = pd.DataFrame(data={'id_left': id_left, 'id_right': id_right}) + for col in df: + if col not in ['id_left', 'id_right', 'text_left', 'text_right']: + relation[col] = df[col] + + # Build Left and Right + left = _merge(df, id_left, 'text_left', 'id_left', 'raw_text_left', 'images_left') + right = _merge(df, id_right, 'text_right', 'id_right', 'raw_text_right', 'images_right') + return matchzoo.DataPack(relation, left, right) + + +def _merge(data: pd.DataFrame, ids: typing.Union[list, np.array], + text_label: str, id_label: str, raw_text_label: str, images_path_label: str): + left = pd.DataFrame(data={ + text_label: data[text_label], id_label: ids, + raw_text_label: data[raw_text_label], images_path_label: data[images_path_label] # added by Ben + }) + left.drop_duplicates(id_label, inplace=True) + left.set_index(id_label, inplace=True) + return left + + +def _gen_ids(data: pd.DataFrame, col: str, prefix: str): + lookup = {} + for text in data[col].unique(): + lookup[text] = prefix + str(len(lookup)) + return data[col].map(lookup) diff --git a/matchzoo/datasets/__init__.py b/matchzoo/datasets/__init__.py new file mode 100644 index 0000000..b538783 --- /dev/null +++ b/matchzoo/datasets/__init__.py @@ -0,0 +1,11 @@ +# from . import toy +# from . import wiki_qa +from . import embeddings +# from . import snli +# from . 
import quora_qp +from pathlib import Path + + +def list_available(): + return [p.name for p in Path(__file__).parent.iterdir() + if p.is_dir() and not p.name.startswith('_')] diff --git a/matchzoo/datasets/embeddings/__init__.py b/matchzoo/datasets/embeddings/__init__.py new file mode 100644 index 0000000..c2d6288 --- /dev/null +++ b/matchzoo/datasets/embeddings/__init__.py @@ -0,0 +1,10 @@ +from pathlib import Path +from .load_glove_embedding import load_glove_embedding +from .load_fasttext_embedding import load_fasttext_embedding +from .load_glove_embedding import load_glove_embedding_FC +from .load_glove_embedding import load_glove_embedding_matching + +DATA_ROOT = Path(__file__).parent +EMBED_RANK = DATA_ROOT.joinpath('embed_rank.txt') +EMBED_10 = DATA_ROOT.joinpath('embed_10_word2vec.txt') +EMBED_10_GLOVE = DATA_ROOT.joinpath('embed_10_glove.txt') diff --git a/matchzoo/datasets/embeddings/embed_10_glove.txt b/matchzoo/datasets/embeddings/embed_10_glove.txt new file mode 100644 index 0000000..9bb2be5 --- /dev/null +++ b/matchzoo/datasets/embeddings/embed_10_glove.txt @@ -0,0 +1,5 @@ +A 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +B 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +C 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +D 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +E 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 diff --git a/matchzoo/datasets/embeddings/embed_10_word2vec.txt b/matchzoo/datasets/embeddings/embed_10_word2vec.txt new file mode 100644 index 0000000..0c514eb --- /dev/null +++ b/matchzoo/datasets/embeddings/embed_10_word2vec.txt @@ -0,0 +1,6 @@ +5 10 +A 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +B 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +C 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +D 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +E 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 diff --git a/matchzoo/datasets/embeddings/embed_err.txt.gb2312 b/matchzoo/datasets/embeddings/embed_err.txt.gb2312 new file mode 100644 index 0000000..dbd1abe --- /dev/null +++ b/matchzoo/datasets/embeddings/embed_err.txt.gb2312 @@ -0,0 +1,6 @@ +5 10 +B 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 + 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +D 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 +A 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 +E 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 diff --git a/matchzoo/datasets/embeddings/embed_rank.txt b/matchzoo/datasets/embeddings/embed_rank.txt new file mode 100644 index 0000000..1286e32 --- /dev/null +++ b/matchzoo/datasets/embeddings/embed_rank.txt @@ -0,0 +1,501 @@ +501 50 +the 0.481192 0.114944 0.146258 0.0401524 -1.10979 -0.147589 -0.0632034 0.240782 0.708124 0.177662 0.385847 -0.553977 -0.287812 -0.0337654 -0.380266 0.0722159 -0.813727 0.236374 -0.0853274 0.150912 0.538448 0.207157 0.656751 0.350483 0.03201 0.252512 0.358818 -0.713966 0.281348 -0.181327 -0.404944 -0.0396742 0.0236526 -0.471879 -0.410541 0.0128125 0.10784 0.521248 0.0801202 -0.417603 -0.308933 0.0870871 0.606664 0.138819 0.130256 0.114381 -0.997155 -0.229526 0.460283 -0.0746122 +of 0.422419 -0.31498 0.289009 0.251502 -0.664988 -0.00388272 -0.120225 1.00474 0.751213 0.204627 0.105957 -0.191011 -0.520845 0.379055 -0.463837 -0.286444 -1.1002 -0.0300834 -0.0602586 -0.152382 0.389427 0.0221524 0.276443 -0.213152 -0.566351 0.448351 0.314629 -0.674973 0.0402766 -0.78957 -0.728583 -0.0351845 0.221431 -0.376866 -0.283492 0.006311 -0.166525 -0.02155 0.15057 0.073431 -0.467883 0.106812 0.590167 -0.147108 -0.250329 0.189407 -1.03214 0.128791 0.132738 0.189068 +and 0.414205 -0.0585991 0.341188 0.307435 -0.938015 -0.163284 -0.00489662 0.717564 0.676849 0.198116 
0.243216 -0.344182 -0.329224 0.267194 -0.428923 -0.266595 -0.897164 -0.0738648 -0.0378047 0.0276701 0.556949 0.14016 0.407714 0.163641 -0.459422 0.235771 0.26932 -0.369004 -0.249506 -0.56535 -0.398916 -0.160479 0.283353 -0.0887171 -0.20105 0.146284 -0.134439 0.253019 0.261753 -0.123803 -0.491314 0.0466079 0.434152 0.320111 -0.0344912 0.191271 -0.941325 -0.0918534 0.366077 0.1307 +in 0.218001 -0.18912 0.504318 0.280393 -0.775002 0.11091 -0.236861 0.80575 0.657629 0.302427 0.433355 -0.059612 -0.533702 0.420355 -0.392482 -0.365949 -0.975239 -0.156092 -0.184766 -0.147236 0.328548 0.142733 0.328588 0.219087 -0.559219 0.29521 0.399618 -0.436524 -0.0428307 -0.620783 -0.454802 0.19574 0.0295431 -0.3088 -0.415296 -0.158645 -0.136125 0.308207 0.217258 -0.0045154 -0.455841 -0.0690503 0.453701 0.1836 -0.120332 0.269079 -0.933471 0.113543 0.284837 0.403468 +to 0.097482 -0.0247346 0.455669 0.0713241 -0.432324 -0.428366 -0.0519544 0.765518 0.711727 0.110414 -0.138872 -0.356299 -0.253427 0.259594 -0.231505 0.116964 -0.766074 -0.039603 -0.281872 -0.0746024 0.518813 0.770758 0.270969 0.649161 -0.42849 0.0977476 -0.012218 -0.292233 -0.385778 -0.846938 -0.380041 0.0495779 0.284051 -0.29755 0.0136957 0.141674 -0.287226 0.286373 0.220797 -0.239113 -0.731136 -0.116539 0.69585 0.16209 -0.171808 0.274095 -1.17902 0.344541 0.226595 0.509018 +was 0.743503 -0.219687 0.630859 0.189543 -0.734124 0.281838 -0.256248 0.932596 0.425846 -0.307763 0.568447 0.155871 -0.575594 0.703042 -0.0490383 -0.0620161 -0.801524 -0.251176 -0.301118 -0.375997 0.337905 0.653957 0.855469 0.123425 -0.334748 0.236197 1.05479 -0.451971 0.0817836 -0.149519 -0.595272 0.149396 -0.124963 -0.284149 0.0751628 -0.10068 0.268971 0.390084 0.208225 0.214898 -0.0168618 0.220242 0.201806 0.171113 -0.0456709 -0.169592 -1.10004 -0.017665 0.211219 0.23398 +is 0.459386 -0.294196 0.576932 0.415239 -0.753269 0.148147 -0.691806 0.554182 0.606315 0.268711 0.611888 -0.235741 -0.140472 0.372451 -0.498448 -0.305131 -0.733118 -0.279021 0.0870202 -0.504272 0.852962 0.461728 0.603495 -0.530359 -0.49192 0.919233 0.72612 -0.495833 -0.0916867 -0.233348 -0.832931 -0.0718616 0.02688 -0.544653 -0.902341 0.0400422 -0.485333 -0.137943 0.481405 0.0690371 -0.927495 -0.0382945 0.150568 -0.372555 -0.257146 -0.057132 -0.437076 -0.63018 0.0677571 0.119527 +for 0.240618 0.166711 0.382984 0.361427 -0.840945 -0.152259 -0.0497632 0.69922 0.748511 0.337517 -0.0393315 -0.200059 -0.495781 0.352012 -0.38444 -0.0505624 -0.528893 -0.0211197 -0.340248 -0.499523 0.335665 0.335719 0.266335 0.18704 -0.614594 0.191106 0.611558 -0.451507 -0.0583181 -0.902317 -0.588862 0.088819 0.411641 -0.157597 -0.127485 0.0343373 -0.29291 0.14304 0.358493 0.130073 -0.409847 -0.202126 0.345995 0.41354 -0.333838 -0.0422347 -0.960556 0.123897 0.00153163 0.368889 +as 0.178929 0.0311233 0.383683 0.285478 -0.938057 -0.0407058 0.103355 0.664167 0.591581 0.0336315 0.301046 -0.134275 -0.447313 0.394273 -0.784662 -0.0829763 -0.394875 -0.199912 -0.0197685 -0.321068 0.54858 0.357356 0.150532 0.145926 -0.288452 0.244956 0.439825 -0.304926 -0.233311 -0.580267 -0.7138 0.00219208 0.0873197 -0.0466839 -0.26586 -0.218687 -0.204121 0.35253 0.140335 0.209495 -0.800932 -0.0468413 0.2368 0.105298 -0.108704 0.292249 -0.617625 -0.112399 0.261603 0.297329 +on 0.361393 0.194677 0.386321 0.101013 -0.6303 0.021094 -0.094212 0.673848 0.643189 0.144393 0.0196457 -0.190431 -0.653567 0.410097 -0.271139 -0.192813 -0.891066 -0.423621 -0.363702 0.0170993 0.165249 0.352704 0.594466 -0.148003 -0.270243 0.33983 0.0414032 -0.384019 -0.0758417 -0.530481 
-0.16999 0.369073 0.232679 -0.408866 -0.0986386 0.0342333 0.0314466 0.334493 0.500842 -0.0390505 -0.532535 -0.248381 0.108413 0.372316 -0.215145 0.392357 -1.26173 0.105443 0.0306274 0.642751 +with 0.414189 -0.0201856 0.596477 0.431318 -0.699933 -0.233491 0.0829651 0.773067 0.448824 0.232143 0.181464 -0.119701 -0.304451 0.419556 -0.236413 -0.136404 -0.971148 -0.136031 -0.159345 -0.323072 0.482769 0.209675 0.431341 0.117836 -0.346871 0.253549 0.212466 -0.220648 -0.293341 -0.584324 -0.325732 -0.201978 0.761816 -0.0935512 0.0944946 0.0589293 -0.244274 0.142954 0.303255 -0.053 -0.566408 0.0399402 0.332823 0.315135 -0.491303 0.122719 -0.877587 -0.0616097 0.286452 0.174432 +by 0.164598 0.117743 0.447771 0.586842 -0.781262 -0.0285669 0.147976 0.806706 0.507283 0.00115935 -0.381605 -0.0896638 -0.603885 0.196833 -0.376773 -0.0546903 -0.964904 -0.316372 -0.300429 0.140248 0.13765 0.279148 0.130351 0.135054 -0.217016 0.362413 -0.0317792 -0.278805 -0.16142 -0.408079 -0.749136 -0.113397 -0.0902098 -0.345095 -0.111432 0.266522 -0.0734709 0.514639 0.278386 0.15478 -0.727428 0.0983311 0.590012 0.38901 0.0129612 0.202999 -1.05376 0.188324 0.25339 0.271067 +he -0.00215639 -0.348425 0.227953 0.540406 -1.29868 -0.468757 -0.182312 0.225502 0.198551 0.00280781 1.28092 -0.271463 -0.211642 -0.106504 -0.212531 0.246637 0.0315608 -0.116296 -0.271059 0.354953 0.336552 0.513029 0.827706 0.864021 -0.191875 0.0742888 0.83226 -0.172331 -0.13244 -0.474771 -0.354279 0.572046 -0.411989 0.0477242 0.424785 -0.509935 -0.18299 0.71168 -0.3204 -0.0663534 -0.356052 -0.463358 1.07169 0.688653 0.654073 0.237653 -0.919869 0.356708 0.274691 0.620945 +that -0.0636978 0.0305305 0.603934 0.266724 -0.568214 -0.46313 -0.0176214 0.0367927 0.309121 -0.145396 -0.343789 -0.535671 -0.195677 0.00810845 -0.600446 0.47427 -0.459608 -0.0801537 -0.30156 -0.00529369 0.47216 0.685532 0.345063 0.519714 0.12533 0.158459 0.271266 -0.10173 -0.0384262 -0.706795 -0.50702 0.0556162 -0.243177 -0.551525 -0.175341 0.255024 -0.3177 0.386972 -0.00500236 -0.232892 -0.626938 0.0240547 0.620032 -0.0516308 -0.229234 0.65433 -1.0744 0.319074 0.177489 0.330443 +from 0.545828 -0.182198 0.420776 0.277229 -0.621808 -0.106732 -0.00616282 0.659348 0.672152 0.188202 0.373023 -0.0877687 -0.284524 0.590145 -0.170934 -0.299052 -1.0906 -0.111158 -0.185043 0.0211288 0.504501 0.254885 0.426567 0.104803 -0.316893 0.267371 0.45389 -0.127518 -0.160256 -0.43257 -0.423472 0.0778388 0.0911174 -0.187355 0.00354515 -0.117082 -0.0185801 0.149666 0.119748 -0.222826 -0.440618 -0.119505 0.040647 -0.0631326 -0.10337 -0.184534 -0.970628 -0.0331648 -0.0862167 0.0904932 +at 0.329125 -0.445077 0.470047 0.232406 -0.462517 -0.124786 -0.385009 0.779465 0.528197 0.259924 0.284657 -0.3738 -0.467329 0.53591 -0.452265 -0.245269 -1.03961 -0.387408 -0.253999 -0.0263969 -0.0469276 0.435928 0.55072 0.0151536 -0.449506 0.0641795 0.352112 -0.463714 0.269793 -0.59694 -0.151035 0.350948 0.197643 -0.0631544 -0.184065 -0.301646 -0.211673 0.0963206 0.476683 -0.281493 -0.249371 -0.287159 0.533484 0.222331 -0.257611 -0.191521 -0.99521 0.250043 0.215126 0.805286 +his 0.399311 -0.11455 0.182618 0.391222 -1.38577 -0.836881 0.0683948 0.212949 0.14031 -0.305645 1.1147 -0.543351 -0.0765271 0.0887827 -0.0498956 0.278891 0.11061 -0.0402854 0.0901623 0.0473649 0.646213 0.0482608 1.18749 0.890662 -0.0203007 0.0510798 0.719535 -0.526675 -0.0613403 -0.519035 -0.299473 0.134028 0.0568395 0.228009 0.973674 -0.299101 0.0913128 0.358576 -0.0290992 -0.134082 0.00284892 -0.265241 1.21392 0.656945 0.411714 0.425909 -0.93552 -0.093997 
0.204234 0.111327 +it -0.084556 0.326117 0.284061 0.36073 -0.436328 -0.205005 -0.366607 0.0301945 0.260852 0.286393 -0.0282026 -0.499147 -0.282631 -0.216273 -0.380151 0.368874 -0.726648 0.000957822 -0.269952 0.547038 0.552131 0.756692 0.62617 0.623924 -0.220279 0.139214 0.309936 -0.373758 0.261497 0.0705929 -0.691468 0.110236 -0.505099 -0.635808 -0.834653 0.207433 -0.285344 0.777932 0.0548777 -0.328381 -0.461198 0.0843667 0.310122 -0.392204 0.00201131 0.599579 -0.592596 -0.0292739 0.400259 0.304472 +an 0.483782 0.235291 0.0263575 0.385075 -1.05886 -0.308849 -0.0115418 0.369159 0.695994 0.459544 0.316152 -0.611351 -0.0887699 -0.19684 -0.695032 0.131454 -0.102384 -0.129733 -0.21768 0.11078 0.599457 0.15929 0.727238 0.361606 -0.0150142 0.263774 0.587862 -0.35202 0.390385 -0.0959102 -0.479527 0.170982 -0.202502 -0.221149 -0.334234 0.318643 -0.128748 0.522958 0.222153 -0.21743 -0.203237 -0.133904 0.451675 0.166252 0.22904 0.293301 -0.500455 -0.173757 0.487114 0.09427 +are 0.366673 0.103601 0.846736 -0.420805 -0.912456 -0.246012 -0.0552818 0.0593654 0.392212 -0.333661 0.105397 -0.702711 -0.138733 0.121925 -0.745552 -0.565972 -1.03052 0.271044 0.399679 -0.582794 0.885406 0.500963 0.363753 -0.362677 -0.460116 0.34899 0.459616 0.0736139 -0.444187 -0.615631 -0.690071 -0.58814 0.497567 -0.156204 -0.778628 -0.122807 -0.182225 -0.0933879 0.167186 -0.134256 -1.1949 0.276532 -0.157254 -0.225095 -0.731362 -0.33991 -0.939736 -0.623315 0.149308 -0.412065 +were 0.840957 0.105139 0.861012 -0.330634 -0.711901 -0.173337 0.451832 0.678777 0.129506 -0.951322 -0.171984 -0.123183 -0.485573 0.460478 -0.199897 -0.27218 -1.38525 0.320327 0.0250825 -0.433046 0.223563 0.646032 0.549658 0.296498 -0.289681 0.00227862 0.953287 0.15998 -0.103964 -0.395408 -0.529628 -0.437679 0.503697 -0.158398 0.0155813 -0.101234 0.526122 0.515628 0.140935 -0.0380044 0.0148411 0.658474 -0.0802261 0.299616 -0.457511 -0.564663 -1.52646 0.0884447 0.370672 -0.513596 +which -0.0308938 0.427428 0.236848 0.188871 -0.822977 -0.277875 0.0461739 0.220514 0.408007 0.259788 -0.136725 -0.564335 -0.358749 -0.241445 -0.396207 0.111499 -0.718946 0.0211391 -0.0887908 0.367177 0.339535 0.316353 0.397141 0.468351 -0.0857437 0.181777 0.000663481 -0.381664 0.248886 -0.266597 -0.427537 0.0528625 -0.166818 -0.446 -0.419508 0.175837 -0.182984 0.594403 -0.00418527 -0.362404 -0.373804 0.126156 0.494845 0.0332208 -0.045882 0.162113 -0.645121 0.211181 0.364624 0.0827299 +this 0.121805 0.583971 0.196161 0.036571 -0.832267 -0.3718 -0.0225544 -0.0540604 0.357722 0.175991 -0.081612 -0.713298 -0.234863 -0.00761619 -0.596301 0.359407 -0.433697 0.116133 -0.168708 0.00202869 0.482463 0.384224 0.609998 0.635127 0.324477 0.140734 0.0516987 -0.358923 0.387948 -0.372281 -0.515654 0.0410327 -0.231868 -0.50299 -0.370285 0.0425439 -0.092505 0.507837 0.158584 -0.281709 -0.304212 0.140062 0.639773 -0.250451 -0.18679 0.513726 -0.657437 -0.0214616 0.0675287 0.0994675 +be 0.160188 0.538497 1.05828 0.124406 -0.919728 -0.182331 -0.0148059 -0.257866 0.404362 -0.15517 -0.104514 -0.630551 -0.367752 0.157929 -0.585499 0.11645 -0.229083 0.612074 -0.237913 -0.173878 0.839758 1.1917 0.684893 0.142468 0.43403 0.0635464 0.859538 0.567257 -0.0035431 -0.0508324 -0.817688 -0.373177 -0.496663 -0.556389 -0.39784 -0.400874 -0.00534964 0.496525 -0.0228899 -0.207959 -0.636355 0.088959 -0.0973592 -0.46596 -0.139211 0.0366355 -1.07109 -0.331162 -0.155493 0.137701 +or 0.422606 0.127367 0.434548 -0.0651944 -0.80775 -0.534074 0.342007 0.445345 0.926771 0.115725 0.153505 -0.566136 -0.111956 -0.427414 -0.541772 
[word-embedding data hunk, original line breaks lost in extraction: each added line has the form "+<token> <50 floating-point components>", one row per token; roughly 150 rows of common English words ("has", "also", "had", "first", "one", ... "british") fall in this span]
0.483994 +place -0.106477 -0.0304777 0.255932 -0.0523483 -0.305409 -0.0736271 -0.652554 -0.404721 0.168357 -0.480057 0.0635029 -0.861197 -0.393238 0.105337 0.203973 -0.265877 -0.479027 0.795027 -0.255948 -0.246336 0.127961 0.951626 0.506582 0.454297 0.138926 0.448893 0.0634471 -0.189809 0.726021 -0.343683 0.14593 0.231763 -0.0640034 -0.532785 -0.58595 -0.903301 0.26587 0.701713 0.30525 -0.640544 0.0813972 -0.227876 0.85365 0.109875 -0.0873254 -0.114554 -0.422661 0.227614 -0.0816897 0.375635 +show -0.618313 1.30834 0.978182 0.384728 -0.106058 -0.509672 -0.560159 -0.734684 0.363191 -1.25067 0.161633 0.0521545 -1.26837 0.54001 -0.68685 0.00847557 -0.442064 -0.737471 -0.324356 -0.389605 0.353232 0.898893 0.100859 -0.734733 0.543921 0.607769 0.441729 -0.19689 -0.128876 -0.262216 0.408121 0.887723 0.616847 0.170697 -0.238888 -0.513894 -0.504551 0.0547449 -0.0378145 -0.951566 -0.220553 0.388711 0.169071 0.593689 0.0424381 0.642944 -0.891946 0.058691 -0.323256 -0.0068401 +very -0.71853 0.26433 0.164524 0.302591 -0.0992506 -0.572674 0.00159516 0.086921 -0.101856 -0.163993 0.474704 -0.457101 -0.103671 0.312651 -0.901031 0.178324 -0.441001 0.145822 0.456944 0.00307359 0.452744 0.837453 0.614304 0.36343 -0.191227 -0.442423 0.345298 0.0917379 0.20393 -0.403841 -0.766927 0.0132042 0.281784 0.30321 -0.20758 -0.0195773 -0.741453 0.490601 0.030526 -0.139084 -0.396618 0.187972 0.317116 -0.193564 -0.574094 0.90374 -0.0259519 0.207666 0.173616 0.198019 +party -0.80955 -0.661923 -0.788182 0.450159 -0.928414 -0.0288047 -0.0328539 -0.204844 -0.105321 -0.237961 0.592644 0.395193 -0.743878 -0.413345 -0.193971 -0.993453 -0.378143 0.739475 -1.08865 -0.795508 0.342859 1.48514 0.389083 -0.625106 -0.210501 1.22314 0.0996249 -0.960518 -0.72721 -0.65487 0.141704 0.203411 0.457481 0.0266484 0.785074 0.190409 0.286862 1.93159 -0.636516 -0.0234027 -0.538782 0.288823 0.0983969 -0.955029 0.379544 0.722725 -1.23174 1.01255 -0.403967 0.150635 +named 0.481954 -0.306495 -0.69871 0.875801 -0.102815 0.581237 -0.366502 0.34405 0.156675 -0.280565 0.313279 0.0137753 -0.696213 -0.185779 -0.178331 0.446725 -1.00539 -0.184497 0.0614024 0.372158 0.349125 -0.115249 -0.229542 0.333548 0.0531651 0.173656 0.822926 -0.522174 0.820176 -0.819147 -0.125105 -0.133565 0.0596852 0.0313726 -0.253337 0.206504 -0.118867 -0.267238 -0.282293 -0.249611 -0.441163 -0.33932 0.96534 -0.10718 0.737088 0.535553 -0.143242 -0.386106 0.159945 0.703753 +another 0.305776 0.274499 0.131408 0.200857 -0.743908 -0.413621 0.12895 -0.152955 -0.0355202 -0.0257279 0.0251383 -0.266201 -0.131443 -0.0279375 -0.241782 0.317502 -0.274553 0.0278147 -0.0164125 -0.06715 0.434641 0.279486 0.503716 0.513726 0.28861 0.144778 0.0957959 -0.0953978 0.354591 -0.457631 -0.149986 -0.151359 0.0135644 -0.401288 -0.0411907 0.00127376 -0.211769 0.427294 0.20756 -0.407922 -0.153946 -0.0160041 0.500522 0.0413817 -0.0171068 0.28374 -0.581375 -0.159113 0.0598237 0.0695672 +major -0.00647032 0.147792 -0.11418 -0.201498 -1.04433 0.28018 -0.0280328 0.197416 0.396146 0.150252 0.00335282 -0.189503 -0.390534 0.303718 -1.02228 0.022694 -0.35833 0.315255 0.62186 -0.173501 -0.0419268 -0.450105 -0.376977 0.239383 0.166665 -0.0971478 0.125936 -0.493213 0.0470624 -0.479917 -0.0445039 0.177435 0.445299 -0.152335 -0.112007 -0.437707 0.00103912 0.809343 0.0736685 0.372226 0.0800135 0.0157522 0.606475 0.202145 -0.132258 0.707785 -0.719376 -0.729223 0.405829 0.304971 +best -0.856509 0.564926 0.0458431 1.05305 -0.984948 -0.314477 -0.179344 -0.556946 0.525531 -0.0960192 0.162977 -0.180992 -0.467189 0.331788 
-0.0224064 -0.0892398 -0.481112 0.485928 -0.0438034 -0.393966 0.313369 -0.00472278 -0.365217 -0.490262 -0.496166 -0.381266 0.904127 -0.280767 0.301901 -0.189558 -0.725194 0.755958 -0.0549614 0.435715 -0.167113 -0.883045 -0.967718 0.254186 0.298298 0.302275 -0.184734 -0.667455 1.06593 0.512505 -0.0591275 0.66028 -0.473046 0.0541512 -0.0939675 0.540099 +club -0.49843 0.405749 0.513629 0.973988 0.330904 0.737178 0.56833 -0.58612 -0.518663 0.472705 0.324055 -0.198753 -0.115667 -0.424516 -0.0169607 -0.430788 -1.31047 0.0414126 0.411448 -1.072 -0.368314 1.32644 0.673333 0.418951 -0.325269 1.27387 1.27564 -1.02887 -0.0396466 -0.531348 0.417854 1.10434 1.07599 0.268265 -0.315836 -0.893825 -0.881686 0.699633 -0.351028 0.25445 -0.157987 -0.00820652 0.496201 0.785085 0.461513 -0.0218327 -0.601266 -0.50801 0.755813 1.46658 +small 0.0358642 0.251121 -0.287537 -0.0455803 -0.399681 -0.147961 0.24556 0.0336052 -0.0320036 0.292938 0.241035 -0.475724 -0.415641 0.125596 -0.707211 -0.375352 -0.700662 0.204393 0.78447 0.573853 0.714306 0.412643 0.668996 0.174981 0.0132646 0.0961456 0.136264 -0.496902 0.450512 -0.40094 -0.319947 -0.530083 0.211478 0.374191 -0.807167 0.675667 -0.76452 0.641319 -0.350801 -0.631555 -0.170944 -0.0276999 -0.0761163 -0.380113 -0.184539 0.213411 -0.548368 -0.0740308 0.438656 -0.121554 +within 0.144313 -0.439482 0.145105 0.270446 -0.0380534 -0.146534 0.0299659 0.77521 -0.033429 0.730031 -0.700235 -0.334833 -0.230925 0.115862 -0.650124 -0.377721 -1.00569 0.358282 -0.0131279 -0.160072 0.599782 -0.122068 0.270852 -0.185998 -0.478048 0.549746 -0.12694 -0.405613 -0.522421 -0.558537 -0.803086 0.183983 0.195228 -0.416433 -0.739068 -0.190718 -0.10674 0.188234 -0.228228 -0.0948603 -0.663738 -0.0100134 0.130071 -0.607039 -0.850143 0.0694788 -0.97406 0.251473 -0.0326767 0.245996 +former 0.190257 -0.668636 -0.188094 0.943036 -0.574272 0.35969 -0.101416 0.664688 -0.0259874 0.066538 0.467063 0.315413 0.00260277 -0.118993 -0.471844 0.0109108 -0.847922 0.207072 -0.157051 -0.104601 0.0811061 -0.0335203 -0.125848 0.405846 0.326335 0.95819 0.570806 -0.994364 -0.106054 -0.141676 -0.252406 0.0762544 0.24472 0.211246 -0.108369 0.0911561 0.314081 0.181647 -0.0303991 0.210027 -0.792904 -0.315049 0.0141733 0.416105 0.780764 0.431702 -0.691476 -0.554414 0.435538 0.440815 +church 0.0011713 -1.4516 0.690013 -1.10362 -0.519345 0.165776 0.223083 0.696659 -0.458958 -0.661746 -0.996492 -0.89019 -1.06974 0.468207 0.447907 -0.127916 -1.1209 0.582979 0.152989 1.74883 0.712006 1.35321 0.781673 0.326662 0.423578 1.33664 1.39055 -1.51018 -0.590806 -0.416798 -0.292477 1.31707 0.58232 0.686529 -0.669002 -0.979384 0.781627 0.400862 -0.148965 0.700953 -1.07931 0.835685 0.0964941 -0.267062 0.316212 1.29623 0.406102 1.05121 -0.631547 0.296585 +local 0.422123 0.0879162 -0.445542 -0.545922 -0.329414 -0.277088 0.379689 0.128464 0.477841 0.00318699 -0.332037 -0.481886 -0.271 0.201877 -0.903167 -0.587729 -0.729175 0.329275 0.352878 -0.0675098 0.28079 0.502185 -0.0802324 0.598196 0.0983253 1.16025 0.341131 -0.982545 -0.382748 -0.668771 -0.361609 0.296947 0.321534 0.353447 -0.14392 0.0327255 -0.439783 0.527565 -0.34079 -0.41323 0.0806618 0.192469 -0.44288 0.328398 0.331278 0.268583 -0.585566 -0.272997 -0.175828 0.00899139 +could 0.13333 0.239243 0.575791 -0.666928 0.338189 -1.01798 0.171844 0.388272 0.294372 -0.614163 -0.471823 -0.222885 -0.339585 0.386473 -0.519469 0.942127 -0.537674 0.0431748 0.0324238 -0.620779 0.525492 1.47246 0.218145 0.721907 -0.727252 -0.702221 0.20294 -0.0972983 -0.0420262 -1.19335 -0.167511 -0.481312 0.263203 
-0.47859 0.584353 0.405973 -0.265846 -0.246053 0.139236 0.00906733 -0.506487 0.460477 0.678132 -0.0983776 -0.233435 0.453324 -1.42753 0.948601 0.206244 0.665505 +march 0.663468 0.293576 -0.124351 -0.169543 -0.870987 0.548719 -0.315839 0.287706 0.665413 -0.016359 0.955779 -0.205432 -0.851168 -0.214205 0.109959 -0.488511 -0.333264 -0.248547 -0.838981 0.295533 -0.307084 0.44586 0.837747 0.074637 0.157284 0.247031 0.922088 -0.186973 0.370655 0.443025 0.296977 0.520817 -0.13766 -0.422234 0.109731 -0.0128412 1.18501 0.882152 0.237041 -0.269824 0.311854 -0.058779 0.119082 0.619135 0.573794 0.663891 -1.0432 -0.216829 0.197717 0.713515 +village -0.0400772 -0.467298 -1.28246 0.304469 0.183345 0.525525 -1.19159 0.808065 0.0326899 -0.0382715 -0.17304 0.202733 -0.36077 -0.259026 0.332841 -0.747378 -0.991073 0.640493 0.48935 1.22716 0.906142 1.20638 0.9819 0.528934 -1.23346 2.35519 0.903465 -0.652775 0.141288 -0.309687 -0.631633 0.242364 -0.446754 0.145539 -1.78374 -0.844529 -0.545871 1.00668 -0.174691 -0.742266 0.693481 0.0377602 0.623394 -0.525222 -0.373582 0.565475 -0.163166 0.111886 0.0347387 0.215334 +large -0.276892 0.233132 -0.0104331 -0.290153 -0.433637 -0.325147 0.351335 -0.0410674 0.10557 0.0747034 -0.161667 -0.68794 -0.473565 0.246608 -1.01306 -0.379942 -0.627344 0.429998 0.884855 0.485348 0.464381 0.420969 0.461027 0.0662431 0.253417 -0.187876 0.101184 -0.726004 0.525365 -0.405953 -0.37147 -0.573976 0.289686 0.243977 -0.403379 0.705809 -0.220195 0.62086 -0.483781 -0.458721 -0.0717068 -0.0996529 0.195033 -0.213267 -0.39341 -0.0054294 -0.808316 -0.104513 0.490922 -0.438474 +often -0.719341 0.513913 0.446009 -0.0522382 -0.840071 -0.777951 0.799704 -0.337108 0.578572 -0.104486 0.028913 -0.397832 -0.234517 0.00658359 -1.28747 0.0307134 -0.0895888 -0.0490601 0.444896 -0.213161 0.578729 0.341474 0.287275 0.576321 0.107784 0.150335 -0.00749902 0.148416 -0.0423441 -0.844549 -0.628812 -0.209625 0.0705622 0.378814 -0.101255 -0.171387 -0.270284 0.665883 0.229228 -0.492022 -1.18653 -0.00176274 0.0938817 -0.0791824 -0.212815 0.545787 -0.168773 0.217009 0.293546 -0.0237201 +service 0.13341 0.837541 -0.647089 -0.672498 -0.291748 -0.0836068 0.0491456 1.49878 0.846124 0.113734 -0.427088 -0.672273 -0.306617 -0.103521 -0.0183382 0.105532 0.482342 0.515946 0.220044 0.187867 0.28783 0.390963 0.0505239 0.12183 -0.125643 -0.0817499 1.28022 -0.960433 -0.457528 -0.493005 0.5966 0.788172 0.232346 0.705484 0.189744 -0.270387 0.307851 1.11198 0.0987167 -0.80635 -0.855142 0.0479373 -0.103258 -0.129755 -0.26089 0.212302 -0.968617 -0.546697 -0.378515 1.00321 +those 0.278177 -0.000674918 0.302565 -0.536085 -1.06761 -0.756492 0.536978 0.0208911 0.280149 -0.776955 -0.186574 -0.872701 0.246339 -0.690867 -0.345447 0.0399886 -0.455779 0.617438 0.242083 -0.0536556 0.583323 0.299138 0.0468261 0.676668 -0.134937 -0.054914 0.925354 0.231559 -0.0623661 -0.146055 -0.603081 -0.327168 0.648345 -0.0418732 0.092187 -0.0532478 0.393988 0.200611 0.146306 -0.561848 -0.295076 0.532339 0.364591 -0.133443 -0.253403 -0.353694 -0.747603 0.0341329 0.430499 -0.849126 +old 0.15862 -0.099552 -0.0847549 0.665311 -0.242425 0.128909 -0.0693327 0.112415 -0.443104 -0.315449 -0.298414 0.0434229 -0.257548 0.387316 0.133185 0.17334 -1.19056 -0.179729 0.475258 0.0718892 0.409806 0.00822375 0.763494 0.752655 -0.0415929 0.547021 0.601347 -0.744838 0.49902 -1.1424 -0.538725 -0.00598998 0.199658 0.345388 0.032294 -0.535923 0.0582254 0.105039 -0.0477687 -0.40608 -0.495036 -0.124917 0.227599 -0.164013 -0.233349 0.139809 -0.0508064 0.0969408 -0.10614 0.416462 +did 
0.450221 0.0676201 0.855594 -0.57646 0.151895 -0.592717 -0.0619676 0.229124 0.159542 -0.905962 0.286739 0.0425798 -0.426141 0.764965 -0.380229 0.853873 -0.503547 0.0808351 -0.381064 -1.12338 0.332453 1.10041 0.334066 0.648664 -0.867533 -0.57232 0.732801 0.216365 -0.297977 -0.937264 -0.386354 -0.0790886 0.337014 -0.246143 0.600791 -0.41654 0.186235 -0.182266 0.00669899 0.434851 -0.12447 0.560473 0.0880775 -0.0835097 0.0284253 0.929024 -1.45836 0.485013 0.158116 0.720981 +september 0.646156 0.39214 -0.161225 -0.211412 -0.66776 0.721422 -0.433564 0.37271 0.610089 -0.144006 0.84979 -0.224675 -0.742867 -0.226992 -0.0228572 -0.485616 -0.4056 -0.283288 -0.758663 0.329867 -0.21562 0.415935 0.949471 0.0906837 0.272447 0.0840729 0.975606 -0.273836 0.368199 0.48933 0.398253 0.765882 0.00705455 -0.429406 -0.00924841 0.0571101 1.16923 0.810352 0.366786 -0.3107 0.29858 -0.136165 0.117461 0.8008 0.600063 0.752154 -1.16768 -0.162928 0.336926 0.675836 +song -1.06092 0.318848 -0.67036 1.16632 -0.915386 -0.725194 0.402829 -0.849696 0.794401 -1.10603 -0.438699 0.91198 -1.34421 -0.25751 0.428806 0.828624 -1.74455 -0.907256 -0.384713 -0.00232378 -0.199448 1.88126 0.765992 -0.568799 -0.939789 0.695371 0.243166 -0.625026 -0.138893 0.626142 0.0653221 0.540864 -0.166758 0.0393594 0.24817 -1.56275 -0.217637 0.333243 1.30517 -0.13617 -0.313038 0.718253 0.614139 -0.223357 -0.758895 1.80072 -0.783583 -0.188807 -0.679384 -0.111655 +still -0.411338 0.186762 0.329262 -0.0133485 -0.0550418 -0.475232 0.0604668 0.14181 -0.365407 -0.249493 -0.109591 -0.3485 -0.30259 0.0697776 -0.601888 0.473922 -0.528109 0.0706581 0.369399 0.0375983 0.300483 0.683367 0.335116 1.08594 -0.109287 -0.0901098 0.251707 -0.417656 0.184812 -0.549964 -0.270836 0.0668823 -0.210133 -0.0694828 -0.187958 -0.0928696 -0.371041 0.416249 0.0308658 -0.188797 -0.55833 -0.03682 0.20148 -0.156621 -0.27349 0.616462 -0.384805 0.273744 0.315636 0.415815 +along 0.243582 -0.195343 -0.254614 0.353931 -0.121293 -0.218172 0.172452 0.453029 0.217265 0.124939 -0.190466 -0.0870037 -0.211817 0.269914 -0.133554 0.0995974 -1.36676 0.229162 0.457295 0.61706 0.410674 -0.164987 -0.0690386 0.189593 -0.295642 0.645667 -0.50057 0.140603 -0.186198 -0.155798 0.353449 -0.280596 1.19763 0.0838931 0.281642 -0.04958 -0.0304469 0.440021 0.0510957 -0.661107 -0.679863 -0.0304742 0.334402 0.0932365 -0.198361 0.338047 -0.659666 -0.592718 0.341406 0.28309 +january 0.639385 0.322364 -0.226481 -0.280205 -0.816367 0.752826 -0.460716 0.359243 0.581999 0.0343487 1.0517 -0.155253 -0.749826 -0.264548 0.062182 -0.502239 -0.278241 -0.29767 -0.886232 0.271405 -0.224835 0.508194 0.782401 0.223041 0.213784 0.225414 1.06158 -0.23721 0.507068 0.477713 0.140061 0.720547 -0.0859829 -0.39609 0.11229 0.105422 1.07139 0.721387 0.141357 -0.357654 0.192036 -0.0903767 0.0466531 0.766269 0.507894 0.745763 -1.01968 -0.334309 0.316471 0.691753 +built 0.31587 0.421077 -0.189627 -0.0596738 0.297929 0.228409 -0.393601 1.00703 -0.452125 -0.0613365 -0.477331 -0.248604 -1.30808 0.493548 -0.561701 0.66099 -1.26818 1.05292 1.06617 1.02719 0.669085 1.23072 0.189441 0.554057 -0.624623 0.00725846 0.753299 -0.980623 1.44714 -0.496803 0.821132 -0.376493 -0.144435 0.333865 -0.252613 0.340879 0.779533 0.221923 0.307718 0.0208412 -0.0779495 0.677444 0.644017 -0.3887 0.600727 0.232878 -0.231803 -0.375493 0.164693 0.170509 +took 1.15471 -0.314494 0.127811 -0.300598 -0.0548079 -0.22345 -0.180523 0.247831 -0.0403594 -0.660078 1.05343 -0.0119762 -0.49271 1.25641 0.0492947 -0.209013 -0.763542 0.372192 0.177567 -0.667237 0.069744 0.419 
0.796329 -0.213307 -0.268479 0.0318228 0.5289 -0.737696 0.219234 -0.692791 0.475895 -0.138173 0.797309 -0.205503 0.724434 -0.363996 0.696759 -0.356869 0.0850512 -0.334491 0.634034 0.76131 0.618554 0.365588 0.149496 0.0360459 -1.36069 0.0416809 -0.313257 0.695232 +own 0.088094 0.569884 0.542572 0.297444 -0.288371 -0.265988 0.517041 0.435265 -0.209247 -0.510511 0.0531193 -0.367655 -0.157894 0.242588 0.357698 0.161907 -0.461481 0.469573 0.65465 -0.437399 0.648784 0.310766 0.550501 0.400214 -0.108145 0.246849 0.193845 -0.948684 -0.981083 -0.227749 0.161997 0.149389 0.352873 0.20245 1.00703 -0.0201593 -0.640142 -0.267594 -0.328806 -0.253659 -0.310368 0.17765 0.577329 -0.104407 -0.525279 0.544872 -0.890691 0.166555 -0.264638 0.238956 +members -0.0810116 -0.307308 -0.271797 0.267801 -1.14011 0.157482 1.08159 -0.390878 -0.571911 -0.662406 -0.395044 -0.47023 -0.719131 -1.15405 -0.320832 -0.547723 -0.929393 0.578824 -0.0940939 -0.358884 -0.041523 0.776528 -0.371079 0.063752 0.658424 0.827698 0.711974 -0.630937 -1.03697 0.31026 0.19204 -0.156086 1.07338 0.432981 0.0199862 -0.0946219 0.221896 -0.154839 -1.31297 -0.15671 -0.774226 0.555015 0.400098 -0.257145 -0.468568 -0.209019 -1.40431 0.314619 0.150055 0.0737331 +left 0.235061 -0.298327 0.163099 -0.101867 -0.0758314 -0.0607762 0.0427575 0.133746 -0.353725 0.0600622 0.951172 0.675198 -0.333058 0.131271 -0.114863 0.164442 -0.775015 0.418721 0.280057 0.206582 0.409221 0.522556 0.940089 0.224794 0.148042 0.354171 0.0787047 -0.193379 0.15381 -0.408138 0.477217 -0.00274088 0.967939 0.0051611 0.696677 -0.0479134 -0.198274 -0.0700913 -0.283491 -0.0910331 -0.202436 -0.0696847 0.224197 -0.128322 0.291164 0.819872 -1.08991 0.432275 -0.135422 0.610443 +due 0.106966 0.585611 0.575071 0.507455 0.128631 -0.120576 -0.163222 0.045997 0.365749 0.202006 0.346601 -0.297258 -0.339659 0.856222 -0.690872 -0.0772675 -0.485463 0.884035 0.113451 -0.397779 0.752114 0.501712 0.649798 0.380456 1.31443 0.0820137 1.05313 0.265036 0.0389414 -0.230101 -0.731625 0.0657201 -0.0275787 -0.510919 0.171815 -0.403683 0.524008 -0.0300107 0.0779012 -0.200566 0.532425 0.100666 -0.453018 -0.442932 -0.145918 0.788923 -0.885171 -0.73643 -0.237871 -0.120807 +october 0.62248 0.377227 -0.173024 -0.225963 -0.790322 0.706086 -0.478207 0.362855 0.628232 -0.089243 0.908207 -0.239172 -0.854683 -0.178863 0.0724866 -0.468477 -0.407798 -0.292882 -0.793871 0.333765 -0.31049 0.449222 0.929673 0.0825746 0.255926 0.137734 0.981073 -0.266707 0.357294 0.502722 0.367282 0.633492 -0.0703111 -0.52017 0.137383 0.137285 1.11737 0.80831 0.304181 -0.239053 0.33673 -0.143364 0.130629 0.737033 0.597708 0.751264 -1.09168 -0.234947 0.297442 0.678153 +june 0.629039 0.358435 -0.116939 -0.304914 -0.795767 0.654342 -0.336173 0.408524 0.656093 -0.0240468 0.902042 -0.314347 -0.794443 -0.259825 0.122382 -0.48969 -0.317865 -0.2887 -0.797075 0.275747 -0.230298 0.453801 0.90513 0.0959616 0.21287 0.156331 0.993191 -0.147219 0.421653 0.518538 0.3718 0.613174 0.00355713 -0.390525 0.0737146 0.0503623 1.12835 0.776601 0.278648 -0.224185 0.297806 -0.0789119 0.0880698 0.698888 0.584037 0.735213 -1.10483 -0.257615 0.323891 0.737752 +off -0.121774 0.227718 0.115439 0.356342 0.0660266 -0.57048 0.235312 0.00801725 0.238753 0.112259 -0.0784705 0.180016 -0.258194 0.538146 0.338539 0.291317 -0.763676 -0.000109288 0.170378 0.242251 0.608201 0.883257 0.609548 0.129404 0.117356 -0.0663172 -0.31636 0.00529051 -0.0680322 -0.636065 0.118996 0.278533 0.655226 -0.581245 0.0424126 0.167267 -0.414535 0.716367 0.401557 -0.709156 -0.357843 -0.303398 
0.0131106 0.662258 -0.156512 0.0333397 -1.20985 -0.103009 0.127607 0.770049 +single -0.221017 1.09209 -0.0509482 0.394502 -1.14394 -0.202866 0.230204 -0.279968 -0.212004 0.511797 -0.0773785 0.194606 -0.717148 -0.621776 0.16318 0.272103 -1.07473 0.0907657 -0.299641 -0.0826864 0.651825 0.561161 0.448448 -0.0432175 0.0351118 -0.115386 0.0474281 -0.440101 0.682846 0.266573 0.0316545 0.361592 0.450234 -0.450196 -0.306222 -0.147782 -0.282515 0.194922 0.827793 0.360394 -0.197123 0.25279 0.0119724 -0.666557 -0.865331 0.437977 -1.11022 0.148406 0.0295175 0.180825 +held 0.173992 -0.361595 -0.0692503 -0.428386 -0.137738 -0.581431 -0.0891498 -0.287843 0.358531 -0.183925 0.673889 -0.882479 -1.01647 -0.425866 -0.71322 -0.365663 -0.801721 0.157506 -0.474046 -0.343625 -0.45177 0.465338 -0.128795 -0.194181 -0.347976 0.179534 0.339277 -1.08629 1.13394 -1.08284 0.759691 -0.0702629 0.136894 -0.187788 -0.0287598 -0.121166 0.979252 -0.135692 -0.428972 -0.254254 0.179891 0.591929 0.672691 0.0355525 0.67749 0.31266 -0.866151 0.132054 -0.0660172 0.905715 +july 0.671236 0.351176 -0.18259 -0.241962 -0.768539 0.645214 -0.391063 0.461277 0.695254 -0.0407085 0.945226 -0.352072 -0.697998 -0.25384 0.14017 -0.517241 -0.367081 -0.30699 -0.746449 0.292385 -0.312091 0.46342 0.894083 0.101999 0.225003 0.202889 1.02908 -0.155986 0.47691 0.530526 0.310993 0.650972 0.00466221 -0.447716 0.0803433 0.0970703 1.15824 0.831005 0.324321 -0.262725 0.316494 -0.129991 0.0889754 0.681119 0.550229 0.733019 -1.06705 -0.294549 0.39179 0.72483 +football -0.233591 0.102152 0.630206 0.730152 1.08939 0.588483 0.355229 -0.493588 0.719575 0.248498 1.25917 0.57421 0.458633 -1.28394 -1.10202 0.307926 -1.23232 0.278262 0.224568 -0.922613 -0.0256407 1.43131 0.577883 1.02652 -0.456694 1.40863 2.41363 -1.62128 -0.0107364 -0.712268 -0.365414 1.59345 0.89543 0.251322 -1.46135 -1.1676 -0.112154 0.955052 -0.291861 0.539755 -0.200478 0.072514 0.671737 2.26991 1.10271 -0.210669 -0.711713 -0.813345 0.113908 0.271498 +death -0.462066 -0.657674 -0.163366 0.16597 -0.874037 -0.562257 -0.229125 0.622117 -0.0856054 -0.835258 0.8245 0.00808855 -1.08621 0.0117878 0.490758 -0.265264 0.224231 -0.437573 0.383462 0.273929 0.380183 -0.00771622 0.978254 0.525827 0.944978 0.936392 0.888308 -0.134172 -0.0355411 -0.101255 -0.570978 0.10643 -0.192007 -0.302953 1.24455 -0.655 0.890317 -0.0611417 -0.0313096 -0.33916 0.211427 0.0351558 1.27218 -0.173236 -0.478808 0.484978 -0.858183 0.423368 -0.152006 0.443031 +main 0.208039 0.273473 -0.221937 0.352814 0.0656047 0.0920926 -0.423249 0.58353 -0.103032 0.191252 -0.48172 -0.018516 -0.392339 0.283937 -0.810084 -0.272227 -1.22025 0.458087 0.564644 0.197496 0.162071 -0.14332 0.194128 -0.452768 0.275174 0.569249 -0.445991 -0.8604 0.237487 -0.79689 0.0375331 0.0608406 0.274956 -0.256827 -0.57403 0.120763 -0.0117046 0.558532 0.279254 -0.639472 -0.722332 0.185498 0.722966 0.0545779 0.0295346 0.207506 -0.291313 -0.246923 -0.105554 0.283708 +august 0.667551 0.311513 -0.145087 -0.157284 -0.74025 0.703594 -0.379237 0.44527 0.632115 -0.0559498 0.998819 -0.330621 -0.71098 -0.222116 0.136277 -0.497971 -0.476124 -0.324814 -0.720202 0.339398 -0.358702 0.433007 0.970128 0.0989911 0.241216 0.192342 1.03347 -0.0304458 0.439912 0.535544 0.321688 0.664014 -0.0275433 -0.397308 0.0362706 0.124199 1.22443 0.812954 0.356058 -0.271272 0.368038 -0.153338 0.185415 0.75932 0.567346 0.844462 -1.04925 -0.230488 0.357876 0.665838 +last -0.179717 0.251862 -0.0604128 0.347476 -0.32575 0.0763305 -0.20089 -0.0239998 -0.3378 -0.456038 0.172846 0.0658387 -0.79218 0.443899 
0.224596 0.332061 -0.786659 -0.0214419 -0.0516228 -0.271274 -0.243364 0.169844 0.377199 0.280181 0.272029 0.0306182 0.241756 -0.494243 0.479103 -0.554599 -0.4582 0.256805 0.449997 -0.49904 0.0405701 -0.78751 0.175254 0.264647 -0.0114305 -0.35284 -0.237032 -0.00315586 0.648354 0.0760625 -0.609888 0.222045 -0.851648 0.157436 -0.563743 0.564905 +president -0.221 -1.33806 -0.162575 0.960428 -0.674926 0.766803 -0.904516 -0.138397 0.960858 -0.693011 0.37682 -0.148309 -0.488811 -0.302831 -0.0957883 0.285134 0.559902 0.257179 -0.549401 0.338152 -0.786887 1.04318 -0.316951 0.094431 0.231707 0.424493 1.29818 -1.43884 -0.284072 -0.0320234 0.00149518 0.356426 0.622165 0.292492 0.847497 0.488757 0.673164 0.717623 -1.50078 -0.0177023 -1.4146 0.584888 1.0092 -0.10331 0.694828 0.0647301 -1.17769 -0.238068 -0.0464783 0.641892 +set 0.334734 0.966061 -0.0722186 -0.0442072 -0.46836 -0.352627 0.0740641 -0.68466 -0.100182 -0.0746741 0.0437243 -0.381382 -0.486613 -0.102442 0.118852 0.541868 -0.541816 1.14111 0.116677 0.239436 0.309814 0.177422 0.462083 -0.17403 -0.226638 0.32956 0.0445854 -0.340113 0.548171 -0.430924 0.270927 0.209054 0.228329 -0.451693 0.332637 0.15654 -0.106054 -0.0796111 0.343585 -0.293978 -0.0787916 0.195042 0.842678 0.0417553 -0.0388885 -0.027036 -0.842587 0.0372653 -0.218249 0.143367 +great -0.761134 -0.15775 0.0916806 -0.0735566 -0.576101 -0.556193 0.113618 0.543322 0.509947 -0.445657 0.0939925 -0.738327 0.283826 0.925315 0.092247 0.675568 -0.753195 -0.236813 0.860709 0.102236 0.0913276 0.0797538 0.165383 0.0342509 -0.122362 -0.0812639 0.167137 -1.00044 0.0351083 -0.33497 -0.494137 -0.267636 0.212589 -0.126387 0.142092 -0.385426 0.0462194 0.391455 -0.408485 0.0225564 0.205379 0.0452291 0.977967 -0.0662391 0.242241 0.418905 -0.270667 -0.11492 0.206884 0.357605 +much -0.68044 0.0278876 0.107844 0.133631 -0.108913 -0.484837 0.156812 -0.191337 -0.254662 -0.203358 0.0555928 -0.535024 0.0255961 0.542636 -0.787006 0.567823 -0.155648 0.360188 0.443442 0.223908 0.78446 0.654882 0.619412 0.57409 0.0578831 -0.459471 0.425565 -0.321366 -0.0426972 0.130568 -1.08732 0.0566487 0.226701 -0.0222583 0.320713 -0.0942253 -0.409715 0.665475 -0.268453 -0.470009 -0.103426 0.239935 0.312622 -0.295078 -0.695216 0.594902 -0.378921 -0.0387911 0.234341 -0.104268 +even -0.155656 0.219992 0.563614 0.0244091 -0.143947 -0.871769 0.312416 -0.121919 -0.157909 -0.372255 0.208038 -0.277864 -0.0616389 0.212704 -0.652689 0.432453 -0.147923 0.198033 0.294343 -0.323422 0.388939 0.814016 0.412289 0.515384 -0.0185656 -0.244893 0.312359 0.0221947 -0.0394973 -0.817831 -0.673657 -0.117923 0.283854 -0.256441 0.317567 -0.136418 -0.215995 0.376541 -0.116029 -0.309126 -0.478202 0.193278 0.166279 -0.382631 -0.562756 0.625573 -0.579201 0.218753 0.140993 -0.0327454 +april 0.710961 0.374959 -0.186636 -0.253318 -0.810312 0.654879 -0.344914 0.385152 0.579024 -0.093696 0.922419 -0.249799 -0.799457 -0.251121 0.119823 -0.477759 -0.315043 -0.323412 -0.780294 0.288703 -0.281003 0.444495 0.872941 0.0694371 0.229394 0.250484 1.00442 -0.192757 0.388905 0.455922 0.333487 0.587703 -0.0797446 -0.436366 0.0456787 0.122517 1.13446 0.831679 0.264113 -0.301302 0.303322 -0.0928673 0.130848 0.702294 0.553269 0.707319 -1.01222 -0.239338 0.311598 0.756842 +st 0.185075 -1.09922 0.976056 -1.59804 -0.523227 0.99554 -0.31431 1.05606 0.557731 -0.308322 -0.39559 -1.10111 -0.639981 0.396753 0.201198 0.224275 -1.33924 0.445108 -0.244793 1.35605 0.249362 0.397255 0.2578 -0.0955755 0.541657 0.469001 1.70778 -0.614669 0.167867 -0.494845 -0.22081 0.0644699 0.956772 
0.236935 -0.0330655 -1.23592 -0.768728 0.291284 0.901259 0.24745 -0.220525 0.372924 0.647409 1.44474 1.29495 0.612695 0.331897 -0.17404 0.403457 0.428312 +november 0.590458 0.284696 -0.167356 -0.264649 -0.87469 0.636697 -0.43438 0.274044 0.61412 -0.096801 0.945538 -0.197083 -0.921197 -0.269479 0.0458309 -0.464382 -0.336541 -0.275776 -0.995203 0.291573 -0.298586 0.464372 0.855367 0.10976 0.224569 0.13002 0.971093 -0.263623 0.444593 0.530852 0.287703 0.638929 -0.10005 -0.492505 0.152092 0.114392 1.14522 0.849545 0.193405 -0.309163 0.287528 -0.103954 0.100112 0.731996 0.575088 0.692993 -1.1061 -0.295274 0.301788 0.712228 +december 0.6454 0.297429 -0.16238 -0.234087 -0.83897 0.61855 -0.462 0.413621 0.632849 -0.0769015 0.932462 -0.169496 -0.78563 -0.164895 0.123583 -0.506809 -0.346608 -0.288373 -0.835094 0.351431 -0.409358 0.413932 0.841285 0.164297 0.240167 0.171784 1.10998 -0.205466 0.451705 0.548649 0.189977 0.696515 -0.0974617 -0.492774 0.126395 0.196248 1.05047 0.75737 0.186689 -0.315564 0.304145 -0.0507721 0.0904152 0.713993 0.531392 0.736889 -1.04615 -0.44634 0.298306 0.667113 +what -0.254099 -0.0123843 0.252342 0.183639 -0.389136 -0.481675 -0.182585 -0.955473 0.00181823 -0.686778 -0.366638 -0.721216 0.0872231 0.0611919 -0.541091 0.984046 -0.100661 0.269819 0.0661169 0.313684 0.625685 0.937809 0.394471 0.750503 0.00273789 0.208675 0.2883 -0.0724469 -0.183948 -0.121278 -0.253768 0.209068 -0.708353 -0.5257 -0.0036848 -0.233402 -0.778325 0.64543 0.00294537 -0.409171 -0.436509 -0.00149731 0.537704 -0.482943 0.220869 0.948719 -0.715767 0.653244 0.266317 0.580081 +five 0.12869 -0.052067 -0.258276 0.30528 -1.26742 -0.301085 0.0722838 -0.422529 -0.509892 -0.0605758 -0.141861 -0.261554 -0.512532 -0.439488 -0.0325947 0.0525878 -1.07834 0.280723 0.0245672 0.286808 0.0311583 -0.508371 0.312633 0.280302 -0.157419 -0.672381 0.413906 -0.41462 0.492523 -0.509918 -0.328841 0.147972 0.755553 -0.208316 -0.0403818 -0.377878 0.308296 0.273365 -0.386521 -0.488961 -0.416113 0.273991 0.232433 0.463879 -0.623075 -0.595631 -1.27548 -0.326544 0.0588845 0.129433 +served 0.73367 -0.800672 -1.36541 -0.497939 -0.789053 0.332007 -0.2322 0.452909 0.683693 0.0239203 1.60988 0.398347 -1.00525 0.189368 -0.96085 0.356512 0.211223 0.402932 0.518473 0.496913 0.406885 0.0405074 -0.216667 -0.189053 -0.298875 0.000362797 1.5313 -1.18691 0.607062 -1.01864 -0.0599914 -0.0494926 0.554338 0.856834 0.166186 -0.230672 0.834535 0.0320799 -0.766046 -0.417702 -0.766633 0.681406 -0.028552 -0.248274 0.913917 0.40835 -0.3983 -0.70414 0.0665214 1.03323 +air 0.460254 0.874952 -0.524816 0.205337 0.582211 -0.428443 0.0547861 0.798729 1.06653 0.0248769 0.977496 -0.977605 -0.616364 0.724963 -0.615679 0.730385 0.13114 -0.0992097 0.159557 0.677832 -0.280397 1.66089 0.185853 -0.985217 1.17234 -0.363483 1.02022 -0.658662 -0.582938 -0.108433 0.415443 0.588514 1.08736 0.00597492 -0.825398 0.288089 -0.0770638 0.684135 0.320805 -0.0659321 -1.17141 0.781121 -0.501325 0.257469 -0.31237 -0.192206 -0.953762 -0.0264261 0.699833 0.422499 +book -0.443222 0.405986 0.38369 0.269334 -0.330454 0.221846 -0.272502 -1.03161 -0.0964199 -1.07221 -0.341323 -0.06792 -0.324976 0.208465 0.473111 0.144041 -0.290044 -0.270773 0.315709 0.547524 1.13133 -0.911904 0.510836 -0.844784 -1.1387 0.707567 1.03607 -0.359917 -0.437168 -0.633879 -0.219031 0.87598 -0.692405 -0.266304 0.718716 -0.221056 0.0972804 0.729586 -0.0546358 -0.00161638 -1.09153 0.14223 1.79431 0.0377216 -0.481815 0.73127 -0.620929 0.695079 -0.675839 0.705787 +order -0.0152733 0.751081 0.573759 -0.516269 -0.632036 
0.0772553 0.226041 0.014845 0.558844 -0.255915 0.310943 -0.382836 -0.357907 -0.216737 0.538262 -0.0230279 0.16795 1.62145 0.0188679 0.165396 0.467691 0.489965 0.46326 -0.0420907 0.834858 -0.132811 0.307951 -0.178584 -0.301865 -0.305434 -0.0818606 0.116918 -0.299122 -0.319077 -0.0563553 -0.207756 0.0695571 -0.248394 -0.239912 -0.267273 -0.541009 0.0428309 0.837741 -0.480038 0.213902 0.142083 -1.14157 0.0864255 -0.242551 0.0223702 +children -0.686862 0.281441 -0.256418 0.21769 -1.01305 0.0312252 -0.712782 0.454881 0.195297 -1.40711 -0.561356 -0.227378 -0.438953 -1.02821 1.06514 -0.442451 -0.423961 -0.0649253 0.244487 0.407958 1.12906 0.261803 0.341972 0.492658 -0.844294 0.0900656 1.50883 -0.107744 -0.842189 -0.860195 -0.114341 0.416491 0.492925 0.804197 -0.2007 -0.308273 0.286251 -0.449252 0.144029 -1.07359 0.347376 0.786481 0.406603 0.216216 0.288307 0.0687935 -0.800937 0.662511 0.134169 -0.674479 +law 0.124534 -0.931876 0.5594 -0.283323 -0.632281 -0.33198 -0.278917 0.525125 1.07723 -0.0706733 0.451313 -0.201858 0.340379 -0.649884 0.145734 0.0402203 0.76052 0.733958 -0.0101939 0.305593 0.518226 0.682185 -0.289455 0.265085 -0.644341 1.26072 0.384838 -0.750596 0.0853794 -0.791801 -0.340983 1.486 -0.811103 0.170905 1.04489 -0.298442 0.791865 -0.253873 0.175077 -0.708834 -0.802501 1.41485 0.492944 -0.253046 0.37354 -0.02488 -0.942889 0.319746 0.618071 0.106824 +park -0.926692 -0.226016 -0.367578 0.221418 0.923671 0.153735 -1.25552 -0.214871 0.474461 -0.325249 -0.333526 -1.15868 -0.97576 -0.018044 -0.348353 -0.134317 -1.23755 0.196496 0.515687 -0.166928 0.117181 0.876437 0.733067 0.606293 -0.532628 1.46699 0.196728 -1.1005 -0.0407308 -0.121715 0.0773198 0.249871 0.584708 0.560004 -0.617559 -0.38227 0.46732 0.798892 0.578745 0.37643 0.416595 -1.40976 0.60956 -0.317648 0.818997 -0.182634 -1.09776 -1.29024 1.09132 0.671113 +km 0.503276 -0.571579 -1.42672 0.720064 0.807484 -0.74419 -1.53258 1.05377 0.614634 0.754986 -0.336146 -1.75773 0.241137 -0.145993 -0.125783 -0.199724 -2.82778 0.54002 0.659398 1.3375 0.01035 1.018 0.802132 0.0575006 -0.758288 0.225361 1.77086 0.0763772 1.7326 1.19863 -0.738432 0.403138 0.575387 -0.836617 -0.737912 0.0305518 -0.0316211 1.61445 0.157645 -0.701642 0.0703191 0.411252 0.878007 -1.16436 0.0639902 -0.342174 -1.14967 -0.774929 0.260766 -0.28017 +army 0.700713 -0.284055 -1.03803 0.45289 0.605417 -0.0929869 0.853651 1.40703 0.98464 -0.956726 0.850814 0.355187 -0.558005 0.266723 0.15855 0.257794 0.386184 1.01815 0.265274 0.439123 -0.194346 1.14358 0.229701 -0.339765 -0.182709 0.153212 0.946694 -0.458846 -0.499695 -0.268112 0.0387292 -0.303214 -0.219604 0.690854 -0.405154 -1.39273 0.495108 1.58101 -0.523219 0.309473 -0.473971 0.315787 1.01438 -0.0618652 -0.684547 -0.0767397 -1.77004 0.618862 0.353623 0.578898 +king -0.220201 -0.245536 0.0104624 0.391628 -0.0472301 -0.24195 0.717071 0.749803 0.542743 -1.59111 0.0912896 -0.167928 -0.169699 0.444097 0.608597 0.140879 -0.267255 -0.502787 -0.41677 0.610343 0.382784 1.45974 0.113062 0.141236 -0.120124 0.390756 -0.145611 -1.60893 0.063947 0.0280967 -0.62855 -0.662755 -0.539538 0.0642998 0.196127 -1.45027 1.04979 -0.188331 -0.173709 -0.358217 -0.339652 0.4264 1.48105 0.204756 0.434872 0.0338218 -0.626524 -0.79942 0.236267 0.234474 +include 0.748479 0.141653 0.757062 0.077869 -0.928934 0.240549 0.0192784 0.168287 0.714917 -0.241583 -0.103083 -0.479745 -0.453935 0.933517 -1.02001 -0.893836 -0.673167 -0.00741515 0.491375 -0.857542 1.011 -0.32089 -0.0375009 -0.898605 -0.596272 0.612326 0.733478 -0.248801 -0.6283 -0.239832 -0.525744 
-0.803859 0.561594 0.22822 -0.457737 -0.270546 0.124228 -0.61091 0.50837 0.231787 -0.700717 0.320366 -0.290683 0.254979 -0.439765 -0.315354 -0.645074 -1.2712 -0.0769897 -0.432262 +english -0.0438665 0.901869 -0.13563 0.439895 -0.358596 1.00956 1.60533 -0.0354233 0.361522 -0.216069 0.580157 -0.0714253 0.815508 0.0845301 -0.136506 -0.0193125 -0.640409 0.078316 -0.577768 0.592988 0.18661 0.0546823 1.2292 0.00987852 -1.22696 0.561619 0.985659 -0.883695 0.289 -0.774009 -0.656253 0.670085 -0.323464 0.0323886 -0.210826 -0.872782 -0.299635 0.0215304 0.641067 -0.342254 -0.223959 0.647631 0.401659 -0.0671791 0.362879 0.258944 -0.0272497 0.0161722 -0.0609633 0.210404 +country -0.738424 -0.193502 -0.285876 0.494742 -0.254143 -0.260903 -0.0363624 -0.287071 0.575039 -0.506743 0.287044 -0.798052 0.338555 0.434693 0.487307 -0.153849 -1.00853 0.551446 0.503311 -0.189222 -0.52552 0.579291 -0.0440922 0.279521 -0.750348 0.338843 0.366935 -1.25524 -0.0481346 0.0742317 0.203909 0.705654 0.240133 0.191382 -0.270054 -0.746372 -0.108577 0.442919 0.053505 0.189412 -0.322309 0.431231 -0.617666 -0.378485 -0.122907 0.873735 -1.1645 0.151074 -0.33138 0.058643 +form 0.0191147 0.526696 0.504771 0.0921988 -0.59597 0.366982 0.422626 -0.339612 0.238795 0.351441 0.339461 -0.122149 -0.234917 -0.0123131 -0.309805 -0.0865803 -0.686238 0.756263 0.0899549 -0.119775 0.88844 0.368252 0.601999 0.0196074 0.612009 0.622833 0.179661 -0.0997811 -0.151374 0.108978 -0.427777 -0.00593935 0.0894307 -0.534135 -0.0734958 -0.311531 -0.13844 0.181342 -0.474396 -0.152107 -0.42566 0.661632 0.537399 -0.791203 -0.425967 -0.196625 0.0762599 -0.0928295 -0.0196596 0.00253238 +games -1.06396 0.73558 0.776218 0.0272615 -0.044122 0.183016 0.180908 -0.465056 0.0322977 -0.636179 0.876728 0.30342 -0.331763 -1.13835 -1.12836 0.507203 -1.14329 0.482749 0.406765 -1.25583 0.761832 1.37679 0.157322 0.128776 -1.13604 -0.314044 1.42254 -0.195151 0.195051 -0.985073 0.863096 1.15688 0.692385 -1.05896 0.0417548 -1.17719 0.621898 0.902173 0.259587 -0.145415 0.284972 -1.18365 1.43805 1.95394 0.0703616 -0.742918 -0.856398 -0.949399 0.379677 -0.614312 +road 0.075198 0.10775 -0.389763 0.0579799 0.879231 -0.738117 -1.19938 0.388083 0.320333 -0.0739185 -0.562556 -0.268781 -0.059866 0.41855 -0.0963762 -0.197234 -1.40945 0.671334 0.447666 0.711398 0.953864 0.548514 0.465697 0.788164 -0.490408 1.40193 0.270207 -0.169588 0.585525 -0.0140215 0.424261 0.306182 0.346331 0.259111 0.109422 -1.35036 0.107107 1.55177 1.41388 0.186673 -0.28405 0.253913 0.14693 -0.342765 0.486882 0.0121484 -0.589494 -1.0662 0.063132 1.11249 +building -0.0221922 -0.200681 0.313993 -0.085293 -0.00948985 0.518852 -0.961626 0.204862 -0.611497 -0.00292809 -1.48911 -0.298266 -1.0916 0.591211 -0.726867 0.402592 -0.413644 1.39671 0.80675 0.834857 0.517988 1.61593 0.682475 0.00312004 0.0665424 0.640835 0.494283 -1.39123 0.249585 -0.610393 0.553586 0.648532 -0.325272 0.610875 0.119977 0.200648 0.528907 0.320126 -0.0715451 0.114452 -0.591136 0.272591 0.539694 0.00390565 -0.333145 -0.327882 -0.532736 0.132502 0.568746 0.338422 +died 0.187256 -1.04224 -0.311234 -0.458093 -0.88236 -0.127761 -0.681673 0.979206 -0.0394917 -1.24374 1.79195 0.135674 -1.07224 0.140233 0.270031 -0.572596 -0.254771 -0.788003 0.526228 0.379062 0.247824 0.132361 0.983808 0.837428 0.0508058 0.177703 1.86722 -0.0057359 0.586927 -0.300941 -0.0823667 -0.192757 0.17839 0.137765 0.787286 -0.430232 0.944091 -0.140336 -0.112256 -0.40587 0.857036 0.320144 1.14994 0.0199246 0.502474 0.669329 -0.862083 0.187754 0.324677 0.716302 +third -0.143762 
0.126403 -0.000733391 0.509428 -0.825452 0.360056 -0.280487 -0.184063 -0.0525972 0.285242 0.129739 0.296672 -0.599398 -0.0824695 0.443081 0.0116133 -0.868893 0.591237 -0.310931 0.0196882 0.327984 -0.11551 0.134776 -0.220569 0.317834 -0.248844 -0.0811406 -0.816132 0.600283 -0.525056 -0.30407 0.406029 0.318838 -0.59015 0.0436154 -0.763965 0.226245 0.268684 0.330772 0.0583854 -0.25236 -0.167118 0.48853 0.0160487 -0.208469 0.390667 -0.986794 0.275408 -0.550649 0.324113 +down 0.189726 -0.069809 0.261708 0.20033 0.181369 -0.637791 0.0501493 -0.219993 0.0423941 0.0478204 -0.0883664 0.295001 -0.0851379 0.398609 0.2856 0.435764 -0.905174 0.203926 0.256287 0.453482 0.729377 1.14297 1.02973 0.240701 0.277189 0.0457102 -0.370739 -0.158728 -0.0283283 -0.279585 -0.108733 0.0900153 0.196011 -0.166639 0.262369 0.280168 0.00421755 0.557238 0.345262 -0.491329 -0.285067 0.134115 0.0121402 0.409066 -0.195569 -0.0361249 -1.32073 0.455371 -0.0475766 0.571835 +power -0.198734 -0.104175 -0.214606 0.150742 -0.634284 -1.00659 -0.180672 0.966296 0.24344 0.139751 0.574099 -0.258813 -0.317501 0.163294 -0.180299 0.912448 -0.253074 0.525952 0.545989 0.0656669 0.852587 1.41983 -0.422898 -0.3243 0.656354 -0.368111 0.0521585 -1.31348 -0.676835 0.305351 -0.430467 -0.0603067 0.0433416 -0.930194 0.991537 0.232821 0.286896 0.717171 -0.439864 -0.0600138 -0.0988499 0.723726 0.447142 -0.509578 -0.0796347 -0.169744 -0.587077 -0.113795 -0.185851 -0.620717 +just 0.0648702 -0.218346 -0.0184632 0.0568361 -0.117881 -0.521982 -0.44588 -0.583213 -0.49564 -0.110705 0.0948967 -0.201407 0.113548 0.105461 -0.196032 0.669125 -0.68336 0.117676 0.0403816 0.272707 0.410066 0.651953 0.684903 0.604554 -0.129642 -0.0441983 0.246708 -0.193253 0.330633 -0.201391 -0.33558 0.176539 0.509291 -0.388638 -0.11672 -0.322957 -0.481067 0.60815 0.222135 -0.39628 -0.321398 -0.121872 0.0919066 -0.242758 -0.23965 0.202961 -0.716809 -0.124021 0.149391 0.524144 +water -0.674431 -0.275086 -0.688121 0.116662 0.0196181 -0.360897 -0.73209 -0.138954 1.33386 0.890767 -0.169816 -1.41707 -0.544346 0.338746 0.602818 0.527386 -1.07504 0.00450792 0.897999 1.52588 0.735054 1.77731 -0.0376203 -0.658705 -0.149312 0.237225 0.495093 0.434461 0.189765 -0.889178 -0.884751 -0.250123 -0.170253 -0.754486 -0.510624 0.374309 0.394131 0.0759536 0.366807 -0.169591 -0.192156 0.992208 0.0361337 0.513392 -0.0873202 -0.664519 -0.297864 -0.131217 0.765944 -0.479279 +final -0.551194 0.356189 0.44222 0.28273 0.029392 -0.118927 -0.140526 -0.505675 -0.0063031 0.22495 0.112741 0.0630634 -0.96421 0.176336 0.347283 0.512756 -0.955897 0.792628 -0.96692 -0.669386 -0.00365268 0.113102 0.611922 -0.140476 0.153438 -0.0277996 -0.124159 -0.160993 0.287846 -0.511618 0.0359066 0.515321 0.480715 -1.00013 -0.0481966 -1.05602 0.430528 0.224464 0.297109 -0.117665 -0.252941 0.267785 1.35056 0.722708 -0.505473 0.306425 -1.09101 -0.253672 -0.345216 0.85075 +original 0.220898 0.977431 0.466096 0.57831 0.0197024 0.528626 0.299725 0.0268198 -0.290263 -0.516013 -0.925546 0.231025 -1.35314 0.474662 -0.210811 0.905157 -1.11457 0.2685 0.0764111 0.0544559 0.411602 0.0745964 0.20639 -0.00864767 -0.06793 0.633564 0.131399 -0.853675 -0.0085396 -0.0660988 -0.411768 0.200365 -0.160285 -0.212161 0.0818274 -0.0862516 -0.0751349 -0.145762 0.123767 -0.0620928 -0.615135 0.265588 0.546479 -0.357783 -0.651077 0.0912663 -0.517734 -0.191697 -0.0104479 0.0561 +way -0.203752 0.153701 0.402274 0.0188066 0.116276 -0.418236 -0.173256 -0.424165 0.134325 -0.349252 -0.141827 -0.270244 0.0907981 0.433864 0.142971 0.414275 -0.482106 0.947785 
0.428062 0.125729 0.701898 0.867239 0.346087 0.163681 -0.0685109 0.44896 0.098208 0.218669 -0.260687 0.0739116 0.0900433 0.192684 0.344548 -0.424222 0.344136 -0.504058 -0.423751 0.369163 0.360543 -0.515419 -0.403859 0.0798688 0.591954 -0.537679 -0.171176 0.33008 -0.537783 0.0396572 -0.371121 0.58104 +published 0.133463 0.683337 -0.250451 -0.443227 -0.115216 0.519858 0.0289886 -0.821157 -0.247528 -0.391264 0.425572 -0.17688 -0.814615 -0.420698 -0.628223 -0.115239 -0.729617 -0.456359 0.00846863 0.596252 0.658061 -1.45092 0.497415 -0.915486 -1.68834 0.104306 1.6671 -0.210108 0.0180122 -0.739271 -0.000260211 0.758878 -0.555822 -0.61714 1.09031 0.509763 0.530883 0.402174 -0.294098 -0.0921448 -0.51418 0.849389 1.65803 -0.200472 0.0103265 1.27351 -0.779592 0.397806 -0.306824 0.672413 +near 0.242937 -0.747962 -0.000557805 -0.0191802 0.354501 -0.0661487 -0.709524 1.14831 0.407566 0.035787 0.403817 -0.531771 -0.397294 0.718054 -0.367301 -0.328837 -1.1208 -0.26554 0.658469 0.893185 0.216012 0.484464 0.521033 0.166351 -0.15122 0.710832 0.0991917 0.215335 0.318681 -0.249406 0.0306702 -0.171995 0.0552881 -0.0479557 -0.635267 -0.216786 -0.127433 0.511834 0.172982 -0.322822 -0.153143 -0.548602 0.25306 -0.306203 0.0305125 0.185638 -0.613846 -0.0386448 0.163443 0.8361 +late -0.141995 0.356898 0.377843 0.205187 -0.139954 0.555383 -0.00480215 0.0626881 0.0102339 -0.5695 0.536623 0.397983 -0.759443 1.19861 -0.589287 -0.247138 -0.717995 -0.292611 0.15395 0.0633923 0.309353 0.249912 0.325666 0.677743 0.332801 0.238504 0.345943 -0.204982 0.476672 0.0169766 0.188074 0.426317 0.140274 0.301758 0.179528 -0.373999 0.783403 0.467346 0.0772366 -0.39142 0.225494 0.668482 0.179397 0.253333 -0.383805 0.646598 -0.619044 0.0765966 -0.18206 0.18541 +february 0.602718 0.356628 -0.13847 -0.191935 -0.844772 0.662813 -0.360237 0.400648 0.660241 -0.0950662 1.01788 -0.224041 -0.812134 -0.23871 0.0999417 -0.524783 -0.314125 -0.296911 -0.951165 0.272431 -0.336446 0.476764 0.896818 0.0866578 0.251173 0.207927 1.06423 -0.14334 0.444273 0.494938 0.344384 0.667948 -0.169238 -0.477657 0.147348 0.153288 1.13447 0.722309 0.269592 -0.271656 0.314476 -0.0899872 0.120459 0.862768 0.602313 0.769038 -1.04445 -0.280256 0.350737 0.7071 +among -0.230246 -0.467633 0.506096 0.50753 -0.701076 -0.220825 0.54976 0.459123 0.257244 -0.610305 -0.483753 -0.232682 -0.42752 0.319787 -0.849612 -0.650851 -0.944758 -0.0131302 0.536718 -0.272899 -0.193558 -0.152277 -0.770538 -0.168001 -0.608677 -0.131301 0.31696 -0.202773 -0.225831 -0.980813 -0.396068 -0.244779 0.477854 0.328627 0.0168933 -0.226115 -0.242705 -0.0301078 -0.228002 0.256471 -0.547182 0.0112654 0.594114 0.112815 -0.345778 0.745138 -0.796164 0.555098 -0.134237 -0.107779 +son -0.600947 -0.890738 -0.00159931 1.3137 -1.20478 0.242358 -0.464734 1.44295 -0.0414959 -1.50476 1.09179 0.382938 -0.247609 -0.119463 1.24907 -0.279354 -0.117838 -0.0119077 0.173784 0.71574 0.808411 0.532318 0.0887768 0.583752 0.0475089 0.773141 1.00558 -0.292496 0.269738 0.123597 -0.665002 -0.0862416 -0.169411 0.6012 1.08236 -0.493408 0.170098 -0.111512 -0.17281 -0.496608 -0.279412 0.107079 1.71889 0.105241 0.563084 0.686279 -0.445377 -0.208055 -0.274219 0.576461 +received 0.654327 -0.414934 -0.326089 -0.403419 0.0609785 -0.600247 -0.0978444 0.59531 0.821075 -0.0913382 0.511323 -0.196915 -0.729432 0.0873772 -0.0631417 -0.162581 -0.538503 0.0915229 -0.720444 -0.183546 0.0563003 0.0342779 0.073361 -0.698743 -1.00045 -0.884018 1.123 -0.917105 0.170461 -0.283607 -0.135125 0.608845 0.472654 0.396423 1.03566 0.0549744 -0.0279595 
-1.02053 -0.0392875 0.100838 0.232191 0.15391 0.738186 0.312397 -0.370231 0.907762 -1.27201 0.231502 0.13286 0.381268 +community 0.274515 -0.621792 -0.256216 0.261923 -0.0130761 0.564417 -0.162634 0.301628 0.656969 -0.0484266 -0.97677 -1.03444 -0.160478 -0.143889 -0.849309 -0.774667 -0.518576 0.390423 0.189812 0.32457 0.927817 0.887187 -0.365603 0.600726 -0.438364 1.76871 0.600394 -0.747921 -0.81051 -0.196501 0.232268 1.01205 0.375839 0.456417 -0.565311 -0.586 -0.304787 0.38675 -0.534849 -0.42369 0.244482 0.0191265 -0.0665417 -0.627467 -0.125754 0.646037 -0.295072 0.366882 0.0537461 0.174962 +different 0.0475201 1.02718 0.119807 -0.375884 -0.84439 -0.618754 0.317018 -0.769489 0.0617174 -0.28549 0.0750823 -0.729641 -0.17826 -0.388833 -0.997687 0.115135 -0.802323 0.631746 0.211177 -0.250392 0.712316 -0.00602754 0.183768 0.476553 -0.161632 -0.251996 -0.148522 -0.186269 0.109368 -0.404823 0.0126143 -0.213826 0.294753 -0.126971 -0.136725 -0.119217 -0.0682884 -0.112055 0.205766 -0.751468 -0.894152 0.617389 0.594684 -0.232609 -0.602981 0.0836326 -0.632575 -0.0296699 0.325314 -0.772518 +central -0.0741607 -0.369516 0.14352 -0.0321202 -0.0310106 0.341852 -0.802805 0.244851 0.966311 0.558156 0.305521 0.405152 -0.040509 0.242816 -0.86794 -0.176775 -0.937151 0.821147 0.363797 0.70583 0.200072 -0.382179 0.376102 0.0946446 -0.364644 0.444071 -0.00107284 -0.752578 -0.338553 0.0318837 -0.19321 0.267979 0.250066 -0.309054 -0.966818 0.0399428 0.196009 0.588498 -0.581171 -0.488269 -0.607449 0.0306431 0.0997931 -0.345233 0.477679 0.264493 -0.655536 -0.533197 0.297386 0.0675069 +man -0.23223 -0.053192 -0.260515 1.27443 -0.59799 -0.903558 0.191117 -0.373887 -0.221823 -0.899592 0.162934 -0.0278034 -0.0237552 -0.0961325 0.670829 0.557016 -0.0815288 -0.386356 0.711656 -0.138215 0.583689 0.39618 0.649509 -0.268984 0.208577 0.679561 1.0019 -0.119687 0.277104 -0.390069 -0.508208 0.0825131 0.165241 -0.377535 0.232956 -0.44562 -0.103498 0.404026 -0.100677 0.156231 -0.569485 0.229885 0.790906 0.0408535 0.254863 -0.194677 -0.395209 0.402424 0.446413 0.648007 +times -0.350691 -0.454598 0.299452 -0.657377 -0.292183 -0.579842 -0.27623 -0.467373 -0.110572 -0.571457 0.150769 0.21836 -0.0412879 0.308414 -0.677063 -0.173235 -0.500606 -0.360475 0.118059 -0.238568 0.366535 0.282334 0.0745848 -0.0257586 -0.777008 -0.292206 0.992324 -0.0456044 0.548067 -0.0919455 -0.420788 0.610041 0.547132 -0.367172 0.343546 -0.588652 0.269012 0.625028 -0.32093 -0.434635 -0.629665 0.122608 0.513987 0.544714 -0.743389 -0.0680894 -0.397482 -0.0386594 0.253728 0.62053 +top -0.501637 0.363275 0.249845 0.59585 -0.305996 -0.160212 -0.095823 -0.828023 -0.242284 -0.023558 0.537749 -0.0312178 -0.165141 -0.229549 -0.00959731 -0.394257 -1.4687 0.918268 0.346025 0.0859231 0.241058 0.740969 0.0187867 -0.956341 0.231737 -0.595523 0.0184896 -0.952454 1.00211 0.356119 0.157737 0.884065 0.461405 0.433783 -0.0628175 -0.644751 -0.3968 0.279098 0.140988 0.170489 -0.890873 -0.192059 -0.160112 0.560204 -0.494917 0.391104 -1.53199 -0.189477 -0.105201 0.506743 +french -0.225778 0.277634 -0.0213018 0.904022 0.412593 1.21557 1.37988 0.289812 0.619797 0.149154 0.446452 -0.790524 0.056095 0.838426 0.20279 -0.141829 -0.368405 0.240857 -0.646344 0.703967 -1.12399 -0.155144 1.1604 -0.358369 -0.55926 0.0299968 1.20122 -0.698659 0.546174 -1.30305 0.304581 -0.636972 -0.134726 -0.578673 0.555107 -1.44954 -0.884371 0.958003 0.596297 -0.462358 -0.972688 1.34571 0.198223 -0.0183852 0.508333 0.461994 -0.990513 0.600209 1.08617 -1.44015 +play -0.456441 0.898277 0.877796 0.473394 
-0.449944 0.37064 0.598901 -1.20353 0.185789 0.143752 0.614867 0.535529 -0.547885 -0.360744 -0.530094 0.954889 -0.28402 0.607859 0.226269 -0.542929 0.215272 1.2802 0.61018 -0.278016 -0.688552 0.798583 0.735153 0.0868441 -0.428104 -0.48438 -0.119528 0.255463 0.620861 -0.570771 -0.411986 -1.53892 -0.606106 -0.0739985 0.220525 -0.421955 1.06091 0.00985934 0.760265 1.04671 0.419603 0.266598 -0.679185 -0.724455 -0.0570091 0.252734 +again 0.150918 0.0402853 0.234742 -0.119291 -0.241548 -0.33428 -0.298793 -0.206603 -0.298393 0.00547536 0.701647 0.130601 -0.650725 0.396927 0.342657 0.6026 -0.412707 0.000978031 -0.516628 -0.0657825 0.320901 0.939145 0.782681 0.773554 -0.032602 -0.289464 -0.2198 -0.0267983 -0.131163 -0.423124 -0.0489132 0.229232 0.238557 -0.517671 0.30529 -0.540026 0.134591 0.746714 -0.327146 -0.325414 -0.152404 0.13104 0.462413 0.221556 0.121124 0.347716 -0.873866 0.143406 0.238565 0.762017 +students 0.610102 -0.668789 0.142477 0.412398 0.0653879 -0.0305821 -0.372836 -0.753778 0.336837 -0.23562 -0.90896 -0.795644 0.266902 -1.10656 -1.20907 -0.418701 -0.957911 1.17139 -0.411453 0.541135 0.171067 1.62886 0.493996 0.350434 -0.659501 -0.261612 1.39266 -0.351847 -1.1885 -1.05176 0.140449 1.40894 0.559015 1.86703 0.165198 -1.00696 0.11506 -0.0984711 -0.102482 -1.13822 -0.258664 0.950317 1.29535 0.0214904 -0.297051 -0.770508 -0.564142 0.445688 -0.0883134 -0.350906 +you 0.440933 0.537812 -0.111206 -0.417252 -1.01539 -1.28799 -0.490789 -1.80056 0.440561 -1.25386 -0.462228 -0.664393 -0.0593108 -0.744641 0.0707304 0.92327 -0.261295 0.083064 0.0407213 0.211577 0.999076 1.97999 0.598328 -0.0916892 -0.705514 -0.462758 0.52167 -0.336874 -0.297531 0.193814 -0.049735 -0.347989 -0.0223204 -0.468625 0.459761 -0.896355 -1.31049 0.477673 0.639104 -0.270889 -0.901119 0.16125 0.310147 -0.163905 0.162144 1.19317 -1.1162 0.196676 0.183048 0.873719 +according 1.21952 -0.255991 0.720583 0.673748 -0.390802 0.0980437 -0.202244 -0.143188 0.59629 -0.776594 -0.0176232 -0.46675 0.0304399 0.473328 -0.273589 -0.258458 -0.584391 0.693836 -0.202545 0.126615 0.536676 0.382299 0.120754 -0.658412 0.160873 0.694289 2.02566 0.163472 0.414852 0.192736 -0.863443 -0.430016 -0.783007 -0.534693 -0.334217 -0.314649 0.521394 -0.376411 0.101876 -0.252168 0.162031 0.138128 -0.346991 -0.838729 0.0648548 0.265847 -0.92564 -0.497093 -0.671722 -0.63769 +white 0.128301 0.0629177 -0.157896 0.374869 0.389415 0.0172866 0.654502 -0.600583 1.81639 -0.767967 -0.025235 0.249774 -0.273472 -0.499539 0.179251 0.220019 -1.23525 -0.0832608 -0.565071 0.511568 0.624653 0.106823 0.812747 -0.00123428 0.303649 0.365073 0.562303 -0.564762 0.223956 -0.634511 -0.670304 -0.501329 0.997855 0.589159 0.233536 0.438223 0.207302 0.428911 -0.13298 0.114245 -0.538141 -0.782445 0.0974941 -0.365074 -0.129329 -0.0601741 -0.389441 0.128968 1.53461 -0.29777 +side -0.663982 -0.112917 0.386928 0.0595088 0.104412 0.0310816 0.206074 0.192327 0.102473 0.297617 0.313744 0.498884 -0.0362856 -0.216281 -0.0430937 0.0440746 -1.10166 0.919594 0.406124 0.394143 0.328202 0.784057 0.982962 -0.153894 0.101012 0.702609 -0.156464 -0.038874 0.359221 0.722181 0.0237039 -0.053376 0.935588 -0.447878 -0.163995 -0.51549 -0.0450454 0.435898 0.523427 -0.129509 -0.615227 -0.229679 0.142271 0.0711109 -0.119927 0.306637 -0.608304 -0.568042 0.331101 0.711639 +development -0.342267 0.744116 -0.319302 0.744321 -0.0492018 1.08173 -0.86928 0.352176 0.388164 0.754607 -0.375606 -0.480892 0.0036877 0.554689 -1.01736 -0.182723 0.0727203 0.966278 0.488913 0.00856684 1.10219 0.3778 0.126328 0.478356 
-0.0658422 0.474674 0.408392 -0.700974 -0.487336 0.311284 -0.0028068 0.5879 0.304974 -0.217703 -0.0932457 0.376407 0.608548 0.35207 -0.095739 0.0562253 0.163218 0.850114 1.18153 0.0965345 -0.192982 -0.130016 -0.634612 -0.210886 0.241569 -0.421384 +do 0.726189 0.34558 0.630946 -0.325807 -0.286724 -0.756339 -0.248731 -0.79413 0.240958 -0.495763 0.135267 -0.383307 0.0145839 0.0537622 -0.365245 0.284384 -0.133193 0.205957 0.118468 -0.767668 0.847704 1.46007 0.355158 0.198748 -0.667062 -0.33009 0.830914 0.51518 -0.717766 -0.283695 -0.686652 -0.414735 0.284013 -0.397375 -0.305971 -0.405315 -0.48014 -0.444778 0.110864 0.051917 -0.748425 0.425115 -0.121147 -0.840303 0.36668 0.769103 -1.0128 -0.251582 0.0219617 0.248339 +island -0.812348 -1.19742 -1.66478 0.152498 -0.160547 1.24007 1.3352 0.289559 0.116721 -0.391477 -0.467349 -0.541315 -0.553705 0.868991 -0.0841362 0.0875235 -0.885104 -0.851832 0.537789 0.505268 0.723512 1.67546 0.843284 -0.106122 -0.21371 0.516593 0.548761 -0.296709 0.301888 -0.243795 0.232128 0.993479 0.450189 -1.24156 -0.238087 -0.11348 0.55915 0.346 -0.152478 -0.744582 -0.629745 -0.754556 0.0589228 -0.69235 0.522088 0.152478 -0.778426 -1.23348 0.379304 -0.0154869 +land 0.334517 0.140553 -0.821633 0.0900146 0.463881 0.360067 0.0324759 0.525422 0.236196 -0.685604 -0.410916 -1.53602 0.173502 0.324498 0.936309 0.264632 -0.64282 0.49144 0.754711 1.05879 0.685838 1.44189 0.103705 0.200316 -0.598186 0.816362 0.253733 -0.512018 0.235349 -0.134108 -0.931059 0.29959 -0.117046 -0.35497 -0.28766 0.509396 0.579167 0.743014 -0.295361 0.244951 0.137514 0.164852 0.246523 -0.809348 -0.0450956 -0.13141 -1.10994 -0.0605287 0.478923 -0.408028 +said 0.216011 -0.565805 0.0842639 0.0893296 0.060592 -0.946205 -0.432402 -0.669055 0.13322 -1.1415 -0.114035 -0.0714204 0.0106783 0.00971972 -0.740654 0.712837 -0.368206 -0.0916619 -0.28996 0.23635 0.241701 1.18925 0.326775 0.0093816 -0.0658837 0.0594747 1.32152 -0.265284 0.34622 -0.50551 -0.264182 -0.233898 -0.301097 -0.41738 0.36622 0.277901 -0.295137 -0.528342 -0.2553 0.455205 -0.356686 0.385707 0.342175 -0.0961151 0.356345 1.45556 -0.717212 -0.00777933 -0.085659 0.7669 +father -0.552892 -0.810996 0.163102 1.37944 -0.933338 -0.0772196 -0.320784 1.18673 0.0288147 -1.3171 0.895438 0.145457 -0.238832 0.0465831 1.20465 -0.0753713 0.0377682 0.109844 0.28945 0.46165 0.739256 0.442793 0.565762 0.932095 0.0429141 0.661317 1.15683 -0.307022 -0.363118 -0.355174 -0.242747 0.467405 -0.106762 0.826119 1.14986 -0.43598 -0.23327 -0.443883 0.0217077 -0.515 -0.460306 -0.135711 1.46566 0.161951 0.353353 0.87344 -0.276875 0.3171 -0.364301 0.495346 +ii 0.667274 0.450765 0.0257839 -0.0814143 -0.489157 0.466076 -0.11934 1.41899 0.248272 -0.571144 0.145793 -0.0805902 -0.745387 0.650959 0.183888 0.12682 0.406789 0.0454527 -0.0379635 0.0386525 0.22722 0.174454 -0.0770074 0.225611 -0.10549 -0.527355 0.548716 -0.982035 0.135713 -0.382223 -0.580261 -0.63892 0.142354 -0.266956 -0.746674 -0.710639 1.74063 0.489338 0.805138 0.144068 -0.460717 0.533941 1.9553 0.419095 -0.318843 0.381792 -0.349274 -0.0114658 0.855353 0.240974 +men 0.0474068 -0.89098 0.319747 0.640078 -0.12764 -0.472008 1.02858 -0.34704 0.0635755 -1.00069 -0.462903 -0.397214 0.241466 -0.731341 0.804731 -0.0219234 -0.890556 0.373389 0.736555 -0.187828 0.410952 0.349715 0.406467 -0.73294 0.139855 -0.246814 1.00614 0.682293 -0.118057 -0.999738 0.244538 -0.00293858 0.32846 0.168577 -0.729902 -1.32047 0.467684 0.593713 -0.620549 -0.468255 0.0489659 1.10933 0.647839 0.863816 -0.0663148 -0.0451679 -1.41177 0.513489 1.16879 
0.435539 +led 0.648743 -0.324224 -0.0995637 0.459407 0.159169 -0.0261093 0.475265 -0.00666943 0.352146 -0.230329 0.499422 0.13348 -0.440448 0.81819 -0.869969 -0.0622051 -0.864049 0.765067 0.26626 -0.22924 0.318187 0.0135218 -0.18503 -0.296533 0.618969 0.0668961 0.934429 -0.413452 -0.0721868 -0.338469 0.255148 -0.702826 0.736741 -0.0618177 0.885584 -0.203393 0.686311 -0.335704 -0.424905 0.214672 0.839248 0.613536 0.751481 0.0591366 0.566105 0.705201 -1.32699 -0.270872 -0.590344 0.0207145 +london -1.06121 1.1402 0.440949 -1.21971 -1.15389 -0.493122 0.697728 -0.0278128 -0.331139 0.229814 0.0343036 -0.00595365 0.536403 1.37796 -0.375212 -0.460034 -0.394872 -0.309811 -0.890572 1.54675 0.136535 0.10597 0.931609 -0.302733 -0.164682 0.728778 1.45119 -1.42585 -0.202115 -0.297409 0.682727 0.372051 0.11878 0.109414 0.654641 -1.1712 -0.181327 -0.0724221 0.258266 -0.485771 0.339507 0.278912 0.711565 -0.664456 0.734188 -0.785826 -0.981759 -0.356176 1.43279 1.25632 +center 0.530022 -0.852862 0.370451 0.561086 0.199602 0.636311 -1.37206 0.0484873 1.21994 0.576624 -0.573161 -0.303857 -0.712524 -0.0687749 -1.06901 -0.138986 -0.106811 0.339251 0.977918 0.688674 0.151446 0.450206 0.0134839 -0.332878 0.124408 0.376141 0.486343 -0.937463 -0.0482904 -0.127999 0.947448 0.564727 0.505482 0.104812 -0.442329 -0.139897 0.0690854 0.31171 -0.0417454 -0.522872 -0.438173 -0.677989 0.777488 -0.144705 -0.403431 0.128 -0.712972 0.0116051 0.626318 -0.206791 +make 0.467722 0.439223 0.789983 0.462613 -0.292979 -0.413657 0.0171668 -0.752073 0.242714 -0.125712 -0.0114522 -0.41889 -0.174911 0.924305 -0.0487399 0.28744 -0.316171 1.14076 0.0052625 -0.520431 0.927537 0.896527 0.667861 -0.0854651 -0.108439 -0.0063522 0.936686 0.268044 -0.490443 -0.592454 -0.585728 -0.541212 0.162716 -0.294652 0.241546 -0.346819 -0.336809 -0.239591 0.0927309 -0.0433112 -0.0391916 -0.0438811 -0.386941 -0.166081 -0.220964 0.113992 -0.988372 -0.375707 -0.455434 -0.041674 +german 0.408375 0.945445 -0.00870139 -0.279171 0.064629 0.213609 1.25739 -0.0466788 -0.241808 1.23362 0.420125 0.283389 0.90167 0.966914 0.480818 0.0487034 -0.947624 0.133522 0.22532 0.288747 -1.13109 0.148872 0.309432 -0.835039 0.133892 0.258279 1.77657 -1.01971 -0.114421 -0.454169 -0.719396 -0.0193982 -0.486339 0.849469 -0.698701 -0.78071 0.399229 0.618289 1.39685 -0.694799 -1.16547 0.170232 0.949007 -0.423372 0.599983 1.01356 -1.80329 1.59835 1.0912 -0.97725 +currently -0.239729 0.233979 -0.237458 0.495368 -0.691329 0.139715 -0.663044 0.21722 0.00440092 0.966236 0.341958 -0.279864 -0.23341 -0.738372 -0.909157 0.0279488 -0.565946 -0.218063 -0.208608 0.0122194 0.527929 0.0600642 -0.319797 0.865844 -0.494196 0.437186 0.783054 -1.00923 0.0316789 -0.082099 0.168422 0.907307 0.095745 -0.198525 -0.939827 0.199878 -0.402341 -0.348657 0.0205394 0.107848 -1.10629 -0.547108 0.0777139 0.554178 0.306951 0.334352 -0.204278 -0.417278 0.584782 0.363591 +player -1.14074 0.280275 0.235524 1.44398 -0.332453 0.473599 1.03009 -0.484557 -0.203003 0.553155 1.56169 0.665461 -0.419187 -1.58179 -0.827132 0.632779 -0.345127 0.606482 0.143426 -0.777033 0.430782 2.11728 -0.0858874 -0.433948 -0.738922 0.590454 1.08571 -0.0991917 0.336157 -0.732948 0.000199215 0.65195 0.280521 -0.386497 0.039941 -1.51168 -0.503471 -0.0351038 0.557284 0.680649 -0.509084 -1.49185 1.05349 1.29985 0.313608 -0.391335 -1.04017 -0.949483 0.00485096 0.0645085 +become -0.0287848 0.0952269 0.408134 0.580563 -0.415837 0.00611096 -0.0746622 -0.25178 -0.352167 -0.164311 0.500155 -0.199468 -0.211564 0.754507 -0.773851 0.059963 -0.0617751 
0.271076 0.133635 -0.446397 0.876188 0.803654 0.185605 0.519257 0.351096 -0.273444 0.856002 -0.0977847 -0.379144 -0.263058 -0.410173 -0.0528981 -0.053685 0.0557947 0.0309158 -0.525207 -0.0852236 0.052472 -0.865697 -0.118281 0.0356155 0.268483 -0.0897183 -0.664999 0.172274 0.355139 -0.477758 -0.233911 0.218193 0.16446 +western -0.424365 -0.370538 0.107835 -0.0965984 -0.028337 0.106357 -0.014281 0.138502 1.36182 0.108382 0.692917 0.110157 0.214028 1.01391 -0.410759 0.252895 -1.58996 0.702588 0.460922 0.544508 0.120855 -0.749135 0.155208 0.305815 -0.4151 0.364968 0.10106 -0.212489 -0.547405 0.131054 -0.348908 0.405297 -0.325404 -0.0951983 -0.952128 -0.54122 0.507468 0.732034 -0.387407 -0.371931 -0.47702 0.464218 -0.06108 -0.263987 0.390141 0.467417 -0.706159 -0.634005 0.477289 0.163289 +included 0.414171 0.559235 0.0219426 -0.310137 -0.273284 0.141194 0.327344 0.0395066 0.0525048 -0.554912 -0.0830702 0.00527511 -1.19156 0.239412 -0.717544 0.020189 -0.869675 0.0940759 0.0159309 -0.304587 0.105018 -0.450796 -0.110481 -0.286286 -0.773915 0.202074 0.47872 -0.602765 0.247283 -0.300907 0.111767 -0.438875 0.635555 0.125376 0.176846 0.0981757 0.494645 -0.43174 0.308024 0.465285 -0.129839 0.562013 0.227217 0.125939 -0.165682 0.709357 -0.941881 -0.368296 0.063268 0.0588543 +species -2.56268 0.617177 0.0728623 0.115749 0.0312431 0.79051 0.0933506 -0.0108337 0.619398 0.0214863 1.60061 -1.5077 -0.743905 -0.558316 0.207423 -0.356597 -0.992987 -1.27909 0.927514 0.423987 1.01017 1.28632 0.0648097 -0.0666279 -0.645086 0.978109 1.37576 1.02841 -0.257833 -0.594932 -1.02319 0.0420231 1.24755 -0.577236 -1.60962 1.27921 0.586773 0.127447 -0.687983 0.757726 -1.12189 -0.420467 2.11652 -1.99285 -0.692704 -0.0536058 -0.980681 -0.944343 0.138585 -1.15509 +point 0.0561537 -0.669108 0.148936 -0.0804162 0.449864 -0.111005 0.0295527 -0.173471 0.199105 -0.086766 0.194963 -0.606961 0.304066 0.265045 -0.591293 0.614166 -0.454917 0.620002 0.167354 0.0173461 0.455941 0.910852 0.33999 -0.319865 0.206065 0.344105 0.097142 0.175772 1.05418 -0.0176947 -0.0674156 0.466973 0.39251 -0.383299 0.168104 -0.44349 -0.358493 0.534952 0.501598 -0.659963 -0.253021 -0.3365 0.721149 -0.377238 -0.416589 0.0470285 -0.66553 -0.409085 -0.096715 0.583203 +every 0.676018 0.440947 -0.0341171 -0.0925673 -1.07748 -0.754879 -0.37263 -0.608863 0.3347 -0.334996 -0.213444 -1.23574 -0.351547 -0.70707 -0.322455 0.36353 -0.407843 -0.0493707 -0.278513 -0.360339 0.157065 0.590883 0.539033 0.386185 -0.26581 -0.0323374 0.667293 -0.711352 0.535144 -0.898141 -0.423688 0.170174 0.816304 -0.569357 -0.324297 -0.0813805 -0.239479 0.597844 -0.0268432 -0.886344 0.0473085 0.00103422 0.29527 -0.0536895 -0.563335 -0.718492 -0.205662 0.354146 -0.00171502 -0.0544511 +moved 1.07922 -0.0604824 0.314973 0.0479865 0.248196 0.453983 -0.32375 0.109152 -0.146903 -0.178142 1.53579 0.266945 -0.849485 1.02437 -0.358238 -0.116966 -1.14905 0.892574 0.217232 0.545976 0.733274 0.527928 0.672403 0.798421 -0.223532 -0.103188 1.25928 -0.0439717 0.156123 -0.108316 0.923373 0.308722 0.493783 0.4145 0.505773 -0.359982 0.269668 -0.399626 -0.566058 -0.795775 0.751365 0.0617288 -0.26722 -0.375998 0.785831 0.488803 -1.05826 -0.146671 0.113724 0.603953 +education 0.206214 -0.441291 -0.495092 0.157857 -0.462689 0.641794 -1.10418 0.606665 1.18225 0.171373 -0.197866 -1.01165 0.441642 -0.611484 -0.538756 -0.67806 0.047408 0.880961 0.040732 0.544666 0.790702 0.856958 0.557423 0.376356 -1.01052 0.0445024 1.26608 -0.904629 -0.866557 -0.661537 -0.402129 1.79235 0.072716 1.44737 0.513667 -0.603279 0.393156 
-0.0888826 -0.00722861 -0.194399 0.215605 1.0453 0.541567 0.0863256 0.284872 0.132351 -0.417755 0.379751 -0.0858124 -0.342321 +though -0.325885 0.0618121 0.610608 0.150618 0.0541964 -0.487091 0.327257 0.314247 -0.281242 -0.372769 0.168753 -0.0418037 -0.323108 0.378063 -0.68439 0.612961 -0.413673 -0.147815 0.0136677 -0.435804 0.374307 0.476178 0.361949 0.56289 -0.363238 -0.204707 0.292235 -0.0934012 -0.118651 -0.900385 -0.795614 0.0449918 0.108778 -0.320292 -0.0506422 -0.21724 -0.13682 0.179114 -0.0550503 0.0529293 -0.491809 0.0832416 -0.00822101 -0.454292 -0.473541 0.673527 -0.466193 0.134191 0.136559 0.329305 +england -1.6332 0.513073 0.534868 -1.07328 -0.385543 0.0808569 1.3246 -0.134565 -0.136296 -0.0277205 0.76114 -0.670472 0.601673 0.816603 0.479345 0.147829 -0.321775 0.244123 -0.541047 1.55604 1.12793 0.0856769 0.776494 0.797412 -0.665708 0.235318 1.41445 -1.69015 -0.090448 -0.178854 -0.10519 0.398256 0.377609 -0.319754 -0.477112 -1.28115 0.471788 0.0882255 0.635311 0.225978 0.347667 0.287005 0.458931 -0.0553292 0.79035 -0.693007 -1.0365 0.014711 0.487863 0.851865 +region -0.446976 -0.202982 -0.577024 -0.3696 0.713238 0.638871 -0.782398 0.443299 1.29319 -0.134111 0.562623 -0.28791 -0.245761 0.598294 -0.650548 -0.719153 -1.36588 0.852379 0.239203 0.602756 0.0681442 0.396574 -0.600964 0.286242 -0.334684 0.72114 0.110117 0.00930743 -0.344207 0.505417 -0.755733 0.315416 -0.0529851 -0.727691 -1.02072 -0.919507 -0.41689 0.672912 -1.04187 -0.618313 -0.499929 0.226292 0.56776 -0.298266 -0.359123 0.731227 -0.688667 -0.403204 0.123985 -0.2925 +came 0.998751 0.0259391 0.164443 -0.099816 0.0282896 -0.200157 0.0538431 -0.25078 0.0317325 -0.71783 1.11546 0.254177 -0.00482839 0.900947 0.210411 0.102925 -0.967187 0.626132 0.22798 -0.306298 0.560364 0.479927 0.721687 0.281492 -0.089227 0.0313565 0.86324 -0.48028 0.36233 -0.268448 0.342013 -0.0629316 0.546461 -0.26396 0.814755 -0.197416 0.421298 -0.24776 -0.13978 -0.250778 0.658712 0.664298 0.288313 -0.0462959 0.253911 0.320365 -0.79906 -0.0164488 -0.267652 0.19555 +given 0.317735 0.769406 -0.178052 -0.0592065 0.0276818 -0.405226 0.245117 0.0411418 0.372755 -0.354124 -0.2375 -0.436766 -0.385096 -0.368037 -0.383122 0.522835 -0.363313 0.586788 -0.344366 -0.207541 -0.0338728 0.238101 -0.0479625 -0.0194203 0.270013 -0.0470824 0.6441 -0.263031 0.937756 -1.02191 -0.455013 -0.0751387 0.0986908 0.0114705 0.12491 0.0111387 -0.378432 -0.739069 -0.221035 -0.252134 -0.21564 0.178271 1.03011 -0.681939 0.137623 0.714861 -0.655952 -0.0447075 -0.349697 0.461394 +court -0.525862 -1.30773 1.28294 -0.0377553 -0.356281 -0.48019 0.270548 0.323827 0.482982 -0.461335 -0.0670727 0.0266447 -0.537119 -0.714281 0.0457269 -0.0236753 0.285463 1.021 0.00314947 0.479588 -0.479476 1.45319 0.327112 -0.562697 -0.568078 1.50952 0.00391437 -0.70998 0.406614 -0.940877 -0.73997 1.10561 -1.5451 -0.124737 0.565962 -1.12424 1.1876 -0.293864 -0.113085 -0.416268 -0.0519559 1.27836 -0.165214 0.377148 0.158746 0.278296 -1.6503 -0.891531 1.07998 0.565759 +division 0.663784 0.388064 0.196955 -0.139409 0.267239 1.20563 0.116116 0.323084 0.4273 0.617972 1.00263 0.442449 -0.125056 -0.722199 0.0268482 -0.217037 -0.799238 1.46022 0.0620364 0.000821795 0.181981 0.246436 -0.441021 -0.140968 0.372171 -0.125159 0.241425 -0.434688 -0.0872055 -0.0059889 -0.0403895 0.917887 0.239051 -0.211085 -0.824634 -0.928 -0.466686 1.2208 -0.664901 0.32407 0.0199411 0.475595 1.07391 0.468855 -0.222886 -0.0308898 -1.8912 -0.39969 0.674477 1.05937 +see 0.538564 0.0023989 0.544554 -0.660459 -0.49693 -0.219017 -0.321054 
-0.0900174 0.251502 -0.508647 0.26538 -0.292009 0.0892641 0.48262 -0.179592 0.0252719 -0.439607 0.441791 -0.189786 -0.492341 0.622329 0.264389 0.557588 -0.605029 -0.142292 0.424391 0.872276 -0.0182608 -0.286444 -0.00817066 -0.60766 -0.491204 -0.245937 -0.59551 -0.492492 -0.479866 -0.207406 -0.256386 0.508475 0.0287864 -0.575982 -0.254141 0.128296 -0.589432 0.120014 0.369113 -0.577222 -0.433647 -0.323516 -0.0201987 +non 0.510749 0.637405 0.267446 0.203004 -1.18567 -0.320126 0.590044 0.237512 0.783198 0.351317 -0.316614 -0.755875 0.173797 -0.703342 -0.732463 -0.130785 -0.459477 0.340344 -0.314417 -0.239019 0.481207 -0.0972352 -0.426197 0.319594 -0.216728 0.499028 0.70147 -0.20353 -0.168902 -0.535826 -0.31688 0.136038 0.652114 -0.425919 -0.373997 0.414547 0.0243698 0.348238 0.126221 -0.00570602 -0.317654 0.350192 0.12956 -0.216718 -0.328158 -0.160112 -0.32587 0.410391 0.55693 -0.273973 +radio -0.111598 1.25205 -0.352594 0.0838322 -0.610149 -0.961405 -0.00183418 0.725658 0.503842 -0.649328 0.596625 -0.211966 -0.660978 0.0359337 -1.58868 0.808419 -0.914541 -1.31216 -0.00185223 0.621221 -0.440172 1.1716 -0.0544101 -1.04236 -0.258475 0.180259 1.53963 -0.978901 -1.11214 0.770501 0.624841 1.5179 0.59346 0.0840453 0.487376 -0.197772 -1.12295 0.672517 0.109752 -0.821999 -0.238777 0.463905 -0.872826 0.489478 0.74331 0.374432 -1.3619 0.0639721 -0.578408 -0.14282 +television -0.829367 1.9238 0.0278974 0.356154 -0.481671 -1.14976 -0.909614 0.450854 0.744482 -1.49099 0.673063 0.395617 -0.821221 0.44024 -0.899863 0.528845 0.0494665 -0.509924 -0.377544 0.0734361 0.424759 0.303513 -0.224031 -0.798897 -0.416735 0.45703 0.992896 -1.03955 -0.680087 -0.181152 0.222042 1.82606 0.218954 0.0214961 -0.271927 -0.149722 -0.476848 0.209384 -0.226435 -1.34834 -0.130352 0.543696 -0.316639 1.25906 0.691099 0.33789 -1.34661 -0.320569 0.382052 -0.641693 +political -0.13762 -0.912416 -0.617148 -0.0941003 -0.770266 -0.52917 -0.039052 0.352648 0.487689 -0.407792 0.324131 -0.248359 0.377592 0.320517 -1.40977 -0.531768 0.524609 0.411347 -0.0935536 -0.692655 0.552794 0.235685 0.754647 -0.340642 -0.587636 0.952395 0.120772 -1.10519 -0.333553 -0.802841 -0.0878383 0.805997 0.247424 0.143809 1.16058 -0.216333 0.614295 1.49478 -0.984758 -0.00782022 -0.0777059 0.601045 0.392533 -0.513199 0.554547 1.0431 -1.0272 0.783201 0.163616 -0.690288 +few 0.0065628 0.216656 -0.101718 0.111828 -0.541934 -0.542308 0.295677 -0.316079 -0.82489 -0.297888 0.0766497 -0.581825 -0.482959 0.280501 -0.8506 0.0563877 -0.779625 0.012927 0.411748 0.214885 0.0447338 0.0178157 0.427418 0.901048 -0.0823679 -0.360646 0.417707 0.0331312 0.198394 -0.669358 -0.460272 -0.0446051 0.451415 0.0426296 0.0520888 -0.310918 -0.106316 0.558756 -0.375983 -0.698791 -0.30549 0.0627248 -0.0741691 -0.0844098 -0.89232 0.116343 -0.782065 -0.0412667 -0.0124908 0.00564485 +various 0.346307 0.723327 0.0231259 -0.492038 -1.29391 -0.551085 0.636305 -0.274882 0.273039 -0.295777 0.279861 -0.8085 -0.433213 0.0840471 -1.12979 -0.523732 -0.309211 0.376139 0.531678 -0.0237164 0.508078 -0.406807 0.128641 0.334476 -0.16485 0.0609202 -0.0476971 -0.247344 -0.451703 -0.421817 0.251872 -0.176863 0.0558161 0.124002 -0.0349866 -0.0498628 0.326456 0.172577 -0.178421 -0.455886 -0.617641 0.340936 0.257229 0.511497 -0.289908 -0.097243 -0.664272 -0.244211 0.356445 -0.47662 +black 0.211404 0.303924 -0.415354 0.587373 0.362157 -0.430434 0.871309 -0.64798 1.50797 -0.652651 0.146751 0.363 -0.195468 -0.307655 0.080046 0.157797 -1.1469 -0.286558 -0.191629 0.348445 0.840033 -0.0214888 0.729713 -0.318958 0.322515 
0.341039 0.362619 -0.666853 -0.0562995 -0.394152 -0.455982 -0.423752 0.868926 0.178869 0.233195 0.0183821 0.162254 0.246938 -0.387264 0.355891 -0.603775 -0.551374 0.0804971 -0.330273 -0.169174 -0.0493148 -0.513193 0.300249 1.54519 -0.136404 +us 0.31964 -0.147676 -0.0456585 0.187511 -0.0953304 -0.387326 -0.219804 -0.839888 0.499783 0.00493323 -0.406122 -0.462142 0.454278 0.556592 0.484749 0.768414 -0.022413 0.122054 -0.375008 0.479134 0.147746 0.25642 -0.519239 0.0368765 -0.395353 -0.278078 0.781263 -0.747294 -0.104544 0.259424 0.706695 0.0703203 0.0394144 -0.471314 0.0477108 -0.335639 -0.0405768 0.628741 0.644106 -0.0955523 -0.976219 0.442461 0.162375 -0.783765 -0.97514 0.44817 -1.62436 -0.186513 0.581142 0.268233 +having -0.30338 -0.0563092 0.484375 0.341249 -0.531161 -0.459169 0.21867 0.395894 -0.458601 -0.0395225 0.132271 -0.239866 0.00478989 0.106229 -0.256281 0.0147857 -0.22751 -0.0656898 -0.233225 -0.329736 0.306245 0.304748 0.638604 0.475746 0.098165 -0.110007 0.397763 -0.0456216 -0.0841502 -0.556348 -0.639233 0.172623 0.236209 -0.119969 0.0902312 -0.124136 -0.127821 0.263454 -0.236893 -0.00541875 -0.381215 -0.00195825 0.0936543 0.255895 -0.435492 0.02897 -0.663147 0.00662024 0.219278 0.256387 +military 0.770625 -0.297991 -1.14876 0.0397924 0.32574 -0.469348 0.63944 1.44674 0.921925 -0.641257 0.156197 -0.256742 0.0793388 0.495838 -0.767532 0.323606 1.13886 0.83034 0.546762 -0.331831 -0.253446 0.700676 0.58765 0.117649 -0.255031 -0.00840629 1.04324 -0.904669 -0.0546206 -0.693634 0.258993 0.0704788 -0.251777 0.731548 -0.00612711 -0.806757 0.814737 1.18944 -0.271316 0.247808 -0.89955 0.447655 0.740672 -0.147678 -0.509685 0.0173667 -1.57897 0.243625 0.536549 -0.210287 +went 1.35449 0.0701423 0.355328 -0.232056 -0.236632 -0.258057 -0.130139 -0.500671 -0.0627486 -0.565914 1.41128 0.136583 -0.239178 0.911866 0.3062 0.142783 -0.886606 0.57308 0.119083 -0.17139 0.671291 0.675166 0.874472 0.17558 -0.255096 -0.318695 1.14139 -0.224463 -0.0591097 -0.0652416 0.587517 0.201698 0.44732 -0.0494678 1.08682 -0.518842 0.405509 -0.321893 -0.274235 -0.360867 0.799222 0.484735 0.0269371 0.439348 0.680216 0.339723 -1.35509 -0.253526 -0.0836125 0.519637 +street 0.209769 -0.397975 0.538696 -0.452229 -0.314778 -0.265651 -0.606058 -0.434713 0.13961 -0.677805 -1.12474 0.71906 -0.803632 0.42203 -0.661381 -0.415547 -0.820252 0.801558 0.388936 1.10999 0.748479 0.567613 0.688964 -0.0057057 -0.220672 1.14474 0.316926 -1.16497 0.228292 -0.327096 0.800804 0.508928 0.345448 0.396312 0.561085 -0.687512 -0.211685 1.22136 1.26805 -0.786926 -0.00376288 0.0869737 0.0939712 -0.416593 0.733338 0.11858 -0.249042 -0.726042 1.08674 1.05037 +using 0.333479 0.809683 0.411692 0.234672 -0.0189483 -0.565326 0.548588 0.288874 0.520645 0.151953 -0.230674 0.0174035 -0.522678 0.544908 -0.730412 0.111032 -0.439855 0.205981 0.0811784 -0.060749 0.771668 0.51986 0.0512564 0.0839325 0.0194989 0.245541 0.256169 -0.106446 -0.068716 -0.626139 0.126459 -0.377012 0.0213184 -0.156188 0.490661 0.627389 -0.278106 -0.157634 0.739356 -0.257766 -0.970368 0.158439 0.671039 -0.141188 -0.423471 -0.536613 -0.701024 0.158481 -0.490822 -0.169606 +next 0.23397 0.0978943 0.141125 0.506926 -0.232047 0.109235 -0.388234 -0.260927 -0.483954 -0.0502695 -0.129139 0.0397333 -0.931107 0.830242 0.188021 0.0176637 -0.77377 0.345138 -0.306989 0.0330314 -0.000786356 0.557483 0.450907 0.0805994 0.208063 -0.253143 0.0101279 -0.165662 0.117389 -0.908817 -0.0329006 0.273839 0.516556 -0.239989 0.238239 -0.800532 0.032558 0.331787 -0.0298227 -0.595246 -0.16626 -0.247853 
0.273429 -0.0542578 -0.538664 -0.00734092 -1.12851 0.0290896 -0.693183 0.80749 +period -0.277671 0.321352 -0.176746 -0.211896 -0.396677 0.362763 0.202523 -0.0103082 0.106756 -0.442129 0.293241 -0.256705 -0.444675 1.14526 -0.818257 -0.0893569 -0.112499 0.047887 0.582905 -0.418968 -0.134846 0.33368 0.727667 0.905253 0.316568 0.319761 0.391088 -0.231909 0.966459 -0.0680656 -0.514365 0.824845 0.412058 -0.102357 0.566573 -0.697597 1.17047 0.357967 -0.528046 -0.768327 -0.175402 0.78949 0.924986 0.0498834 -1.444 0.121222 -0.399832 0.241842 -0.303929 0.305552 +support -0.0143444 0.242939 -0.0100151 0.091841 -0.318315 0.0483825 0.714358 0.537966 0.4131 0.262892 -0.32187 -0.669444 -0.478146 -0.101999 -0.392616 -0.35866 0.43144 1.13581 -0.286723 -0.0274181 0.712612 0.840294 0.436118 -0.0899671 0.203822 -0.207406 0.694786 -0.500537 -0.931402 0.312981 0.511008 -0.313826 0.6883 0.367595 0.262494 0.0862241 0.320397 0.520062 -0.394199 0.18141 0.183191 0.307543 0.242894 -0.130512 -0.474452 0.904187 -1.30188 -0.078122 -0.733678 -0.204212 +title -0.150318 0.547912 0.769372 1.01413 -0.146747 0.0325726 0.211663 -0.0620861 0.0611826 -0.782045 0.502617 0.132132 -0.280795 -0.5592 1.08354 0.621168 -0.689032 0.296679 -0.502034 -0.908857 0.0242353 -0.036455 0.214523 -0.19634 -0.28004 0.385363 -0.0300399 -1.039 0.244436 0.276925 -0.710774 1.00185 -0.464137 -0.623554 0.476769 -1.56022 0.242502 0.346456 -0.243279 0.220955 -0.671473 0.398302 1.34394 0.0020137 0.13161 0.648307 -0.777407 -0.630409 0.192084 0.84052 +record -0.34739 0.43244 0.359032 0.469557 -0.464048 -0.373227 0.336832 -0.806706 -0.0411659 0.287085 0.215133 -0.599497 0.0547264 -0.311857 0.164924 0.741474 -1.46093 -0.474203 0.443101 -0.505259 0.0914611 0.454797 -0.157677 0.492118 -0.0570142 0.0435418 1.17166 -0.438249 0.614953 0.775412 0.23354 1.02434 0.317009 -0.0918741 1.09725 -0.709768 -0.0193073 0.0274047 0.112671 0.884677 0.337693 -0.133983 0.531819 0.409297 -0.765586 0.34216 -1.59094 -0.465354 -0.433442 0.406087 +council -0.176815 -0.860687 -0.810533 -0.398219 -0.210637 0.0528155 -0.456607 0.464434 0.5234 0.307525 -0.78709 -0.752292 -0.447707 -0.295126 -0.420952 -0.598043 -0.422499 0.834824 -0.979567 0.416156 0.235206 1.05869 0.0153192 -0.118477 -0.251646 1.62731 0.716545 -1.40762 -0.799148 -0.398977 -0.689544 0.517087 0.22155 -0.230275 0.0467623 -0.758961 0.951333 0.224212 -2.03375 0.134926 -0.551383 0.493119 0.403221 -0.079755 0.728618 -0.696782 -0.810589 0.0163131 0.297651 0.619148 +established 0.670516 0.290026 -0.506463 -0.0111858 0.203048 0.511362 0.183234 0.366459 0.520224 0.204747 0.00495195 -0.55667 -0.455748 0.175917 -0.758261 -0.124933 -0.807609 0.481365 0.372492 0.217122 -0.00242546 0.101285 -0.517646 0.772369 -0.514111 0.449279 0.719294 -1.02207 0.356931 -0.573103 0.509524 0.136243 -0.0525188 0.0971556 -0.0534445 0.179896 0.422323 -0.461322 -0.859229 -0.00257137 0.268422 0.906159 0.769645 -0.607491 0.439391 0.791843 -0.489777 -0.0354501 0.391367 0.692904 +art 0.0442056 0.478941 0.616111 0.633224 -0.212345 -0.542107 0.100435 -0.359958 0.635748 -0.188148 -0.531506 -0.714773 -0.84918 0.501678 -0.683729 -0.141247 -0.72203 0.664824 0.57256 -0.00580661 0.450675 0.293444 1.36789 -0.0423109 -1.14126 1.076 1.17789 -0.65166 -0.192544 0.15999 0.549849 1.19159 -0.677329 0.928709 1.30925 -0.413152 0.101071 -0.240178 -0.161295 0.222392 -0.453087 -0.258888 1.41681 0.249268 0.233506 -0.328537 0.582237 -0.00675521 1.34809 -0.578016 +six 0.151387 0.119056 -0.206645 0.258879 -1.15178 -0.316196 0.0918697 -0.36678 -0.635959 0.075347 -0.123163 -0.30534 
-0.631889 -0.442271 -0.0207991 0.153088 -0.919731 0.307687 0.00888021 0.323834 0.0577865 -0.640508 0.432136 0.357468 -0.0127714 -0.749593 0.505492 -0.391742 0.497566 -0.545208 -0.18461 0.0812741 0.801765 -0.205378 -0.00484566 -0.416074 0.509624 0.376823 -0.36168 -0.527073 -0.48743 0.368483 0.140873 0.475323 -0.617563 -0.674447 -1.30902 -0.297409 0.0892122 0.212502 +take 0.859895 0.376069 0.349113 0.251614 -0.24459 -0.497798 -0.0558407 -0.756993 0.0498548 -0.293783 0.250308 -0.661815 -0.176909 0.742594 0.0653836 0.0390156 -0.0770787 0.98042 -0.118218 -0.559181 0.682931 1.00918 0.841942 -0.27195 0.334751 -0.0414057 0.531136 0.0785886 -0.401324 -0.666151 -0.171718 -0.462457 0.338392 -0.276458 0.11809 -0.788402 0.0387628 -0.175192 0.0192343 -0.650325 0.119381 0.244084 -0.092165 -0.31787 0.0567728 -0.163697 -1.27619 -0.386309 -0.578681 0.337529 +popular -0.876332 0.551298 -0.219471 0.830191 -0.55789 -0.467752 0.451086 -0.529889 0.278222 -0.605824 0.340273 -0.229885 -0.895055 0.450595 -1.29259 -0.286767 -0.735538 -0.0573781 0.527547 -0.348357 0.0914505 0.212912 -0.145709 -0.0212163 -0.689849 0.0425683 0.207612 -1.03197 0.362517 -0.631287 -0.269934 0.0328454 -0.116771 0.370567 -0.0919276 -0.130179 -0.315991 1.00563 0.155223 -0.38992 -0.166716 0.414254 -0.11229 -0.16401 0.268895 1.22671 0.203725 0.0971724 -0.121373 -0.178506 +class -0.203095 0.473835 0.435913 0.219858 -1.19411 0.602393 0.276309 0.817356 0.20884 0.212374 -0.0824799 0.551512 0.15096 -0.488085 0.243835 0.393549 -0.437064 1.18943 0.342706 -0.500148 0.707788 0.438307 0.0315709 -0.179555 0.200262 -0.845026 0.845268 -0.0408061 0.224358 -1.21453 -0.0960589 1.48051 0.626382 0.453405 -0.753555 0.0359096 0.119905 0.563949 0.897258 -0.463765 -0.284489 1.09179 0.925494 -0.340461 0.116053 -0.450233 -0.0787225 0.0404943 0.130165 0.473414 +program 0.244182 0.743272 0.0115429 0.998788 0.150283 0.71966 -0.900202 -0.386249 0.928349 0.213512 -0.678879 -0.648955 -0.770153 -0.151756 -0.943059 0.714373 0.272336 -0.11645 -0.162511 -0.306199 0.310492 0.713934 -0.352878 0.0812216 -0.179941 -0.201916 0.628416 -0.773084 -0.876974 -0.540175 0.692807 1.44192 0.37932 0.570974 -0.286966 -0.0657292 0.0429735 0.121217 0.0789184 -0.742483 -0.652541 0.481358 0.52279 0.500144 -0.361326 0.0338942 -1.4945 0.375136 -0.777747 -0.433553 +created 0.366657 0.681209 -0.666425 0.0371478 0.153186 0.0460375 -0.0509455 0.264489 0.285618 -0.208425 0.25226 -0.0935193 -1.37825 -0.161633 -0.585229 0.258936 -0.62351 0.449662 -0.0426608 -0.0720572 0.747684 -0.246774 -0.28822 0.0253956 -0.557862 0.396693 0.0643608 -1.07208 0.159675 -0.368366 0.00805261 -0.266764 -0.265151 -0.43455 0.402589 0.450777 0.245764 -0.491886 -0.525721 0.0291229 -0.0754662 0.590586 0.822319 -0.401629 0.78761 0.51973 -0.457541 0.081488 0.439394 -0.159543 +business 0.215203 -0.00323231 0.272018 0.257772 -0.396696 0.165092 -0.608521 -0.137823 -0.236028 0.233813 0.0188193 -0.588048 0.400241 0.695254 -0.127309 -0.518733 0.388903 0.787464 0.452085 0.0884293 1.02452 0.79424 -0.765372 0.491056 -0.567838 0.645675 0.574669 -1.25561 -0.241021 -0.795748 0.515721 1.17546 -0.0991389 0.886724 1.18261 0.314609 -0.768593 0.971805 -0.0334503 -0.663966 -0.267194 0.610309 0.142905 -0.008567 -0.252575 0.234005 -0.726802 -0.655758 0.573809 0.654399 +produced 0.0212747 1.39517 -0.371872 0.25535 -0.074086 -0.466711 -0.0639946 -0.289517 0.327174 -0.267313 0.381183 0.266918 -1.37651 0.45056 -0.292805 0.148114 -1.2344 -0.367372 0.115796 0.480896 0.375958 -0.221288 -0.685653 -0.353157 -0.808499 -0.270858 0.731188 -0.439174 0.393381 
0.123313 -0.290128 -0.125441 0.305438 -0.0718559 0.323956 0.907337 -0.170478 -0.470801 0.2709 0.0017271 0.00202241 1.58667 0.807626 0.468738 -0.328289 0.730086 -0.526735 0.140684 0.553953 -0.225405 +started 1.06803 0.667755 -0.0947668 0.122805 0.0302178 -0.131143 -0.39398 -0.0522289 -0.0876399 0.051504 0.855184 0.258038 -0.24156 0.886314 -0.360181 -0.13807 -1.01657 0.181623 0.464988 -0.290186 0.375273 0.541026 0.276172 0.839171 -0.386747 0.0144122 0.887772 -0.469407 0.0153801 -0.519774 1.03056 0.42986 0.446135 0.100884 0.725305 -0.0318435 0.109523 -0.131067 0.100233 -0.198821 0.591904 0.560021 0.477677 0.717485 0.305363 0.631675 -0.572349 0.129219 -0.238972 0.428799 +written 0.392841 0.593234 -0.808004 0.0346608 -0.796559 -0.260707 0.258933 -0.893401 -0.198236 -0.855354 -0.110852 0.294982 -0.733498 -0.407403 -0.36004 1.03693 -0.675776 -0.0264554 -0.338445 0.824138 0.571891 -0.503214 0.307385 -0.356031 -1.67692 0.624823 0.62427 -0.108621 0.377674 -0.0669453 -0.410145 0.254819 -0.239388 -0.181518 0.538136 0.237587 -0.0913337 -0.385129 0.450316 -0.360965 -0.867558 1.15336 1.29679 0.0930131 -0.0267933 1.48096 -0.443661 0.139496 -0.573429 0.179303 +we 0.710978 0.106024 0.152589 -1.02173 -0.790319 -1.00316 -0.524512 -1.68775 0.208935 -1.28714 -1.02478 -0.966057 0.333 -0.0919779 -0.404144 1.19246 -0.327031 0.381486 -0.251003 0.166059 0.496925 1.75054 0.390861 0.246562 -0.248657 -0.177938 0.426814 -0.111868 -0.402341 0.163475 0.0361716 -0.127221 0.478674 -0.642529 0.215732 -0.253757 -1.3331 0.415863 -0.0476637 0.11402 -0.617506 0.578831 1.12402 -0.194731 -0.237661 0.987714 -1.15635 1.01813 0.371789 0.570424 +force 0.571211 -0.0928145 -0.789564 0.363081 0.24222 -0.39676 0.819085 1.02989 0.935413 -0.226337 0.755812 -0.391589 -0.181237 0.195586 -0.37497 0.565676 0.626438 0.873967 0.15602 0.241423 0.313603 1.40932 0.182121 -0.627623 1.0014 -0.119842 0.30423 -0.125126 -0.63381 0.182714 0.233367 0.317528 0.583994 -0.507386 -0.404593 -0.274773 -0.106263 0.731958 -0.250618 0.00745316 -0.468289 0.850922 0.609209 -0.305185 -0.122916 -0.286691 -1.52431 0.531363 0.636835 0.562668 +role -1.04936 0.464368 -0.162513 0.229014 -0.92366 0.719671 0.0190674 0.230086 0.300231 -0.567078 0.75047 0.727841 -0.670732 0.594451 -0.752051 0.260596 1.43765 0.85091 0.135999 -0.682879 0.581045 0.586822 0.377243 -0.272026 0.366498 0.46688 0.269694 -0.232857 -0.691948 -0.0702478 -0.306383 0.719424 0.364163 -0.0968553 -0.658291 -0.809025 -0.354033 -0.163241 -0.49526 -0.236507 0.0689891 0.622695 1.25393 0.733261 -0.382044 0.94704 -0.729634 -0.102682 0.500892 0.529866 +research -0.314587 0.160925 0.128658 -0.0177873 0.3981 0.677731 -1.01699 -0.0518146 0.485821 0.567323 0.0146411 -1.26395 0.270215 0.300079 -1.68683 -0.620391 0.243752 -0.331721 0.278965 0.48712 0.867467 0.234368 0.271597 -0.177084 -0.215485 0.119326 1.52815 0.0362026 -0.8961 -0.256313 0.49053 0.778838 -0.507662 0.671348 -0.0299217 0.868973 0.156964 -0.183596 -0.68118 -0.214645 -0.244089 0.60563 1.91543 0.0406159 -0.358615 -0.315652 -1.61366 0.37698 0.0497558 -0.316783 +field 0.381354 -0.114566 0.457056 -0.142298 0.313724 0.351043 -0.20638 -0.359887 0.881895 0.236016 0.583266 -0.760958 -0.000606937 -0.18352 -1.1157 0.500972 -0.58977 0.592326 0.775322 0.101684 -0.0471785 0.535742 0.17522 -0.405954 0.75938 0.216187 0.553998 0.000404529 0.425579 -0.117238 0.437109 0.409669 0.352912 0.40047 -0.327299 -0.115092 -0.591257 0.61328 0.295331 0.550225 0.150815 -0.157787 1.41462 0.564999 -0.00141034 -0.774875 -1.1201 0.465 -0.11092 0.345776 +young -0.262569 -0.0407389 -0.329495 
1.52985 -0.856594 -0.264181 0.376129 -0.302467 0.187943 -0.51787 -0.145253 -0.168957 -0.111852 -0.0552749 0.262758 -0.390427 -0.679839 -0.153324 0.10484 0.0490684 0.422605 -0.0320168 0.484503 0.071552 0.181279 0.0486396 0.87258 0.041368 -0.58435 -0.946544 -0.488234 0.0406404 0.696068 0.925375 -0.0579542 -0.460006 -0.225283 -0.216389 -0.431501 -0.482857 0.26081 -0.283273 0.620426 0.23917 0.541503 0.511755 -0.516096 0.493584 0.161742 0.0529407 +version -0.62021 1.60991 0.45623 1.2483 -0.220481 0.440093 0.399317 -0.055141 -0.0550771 -0.909009 0.0526977 0.432676 -1.26751 -0.313446 0.316612 0.814919 -0.356433 0.0294153 -0.489471 0.201096 0.563708 0.656726 0.465317 -0.379676 -0.498287 0.113644 0.604391 -0.757242 0.0538419 1.31643 -0.311899 0.229846 -0.193497 -0.955849 -0.0261336 -0.170356 0.135681 0.551723 0.934865 -0.426143 -1.25763 0.858491 0.972959 -0.19003 -0.855231 0.43809 -0.37394 -0.124483 -0.360569 -0.138162 +without 0.0473376 0.261082 0.627523 0.0244841 -0.309016 -0.763561 0.381429 0.461565 0.331036 0.330923 -0.281503 -0.480697 -0.191049 0.0514862 -0.142143 0.337973 -0.230851 0.0327698 -0.126781 -0.473374 0.271687 0.523489 0.689098 0.316277 -0.0546218 -0.0835883 0.00388042 -0.136443 -0.25089 -1.05492 -0.484931 -0.136777 0.184093 -0.600415 0.579203 0.0214142 -0.227098 0.010489 0.349266 0.0458927 -0.490482 -0.0519226 -0.0167484 -0.113703 -0.508395 0.32351 -1.00262 0.480963 -0.194219 0.23571 +women -0.899444 -0.519667 0.796935 0.776869 -0.682603 -0.268642 0.0943747 -0.804573 0.61876 -0.37698 -1.07396 -0.145353 0.427144 -1.00453 0.89826 -0.673392 -0.613866 0.3777 0.361662 -0.566179 0.783299 0.0412739 0.936099 -0.607029 -0.217113 -0.115795 1.41219 0.21976 -0.31412 -1.79297 0.608174 0.751838 0.376146 0.368317 -1.22661 -0.979081 0.772701 0.146954 -1.0444 -0.42929 0.230182 1.64997 0.0581027 0.542368 0.0244149 0.712612 -0.928565 0.406826 1.50635 -0.0306862 +head -0.406394 -0.483495 0.000730728 0.71578 0.26453 -0.0491681 -0.382225 -0.118628 0.465133 -0.00274701 1.06722 0.436265 0.210023 -0.457385 5.11245e-05 0.321247 -0.632961 0.0798379 0.731115 0.501786 0.243985 0.342998 0.863453 0.0186732 1.29687 0.315346 0.127906 -0.968026 -0.154042 0.0548445 -0.327061 -0.00969908 0.0761683 0.507169 0.422984 -0.14967 -0.166771 -0.350196 -0.970352 -0.30038 -1.36329 0.174184 0.344817 0.593509 0.288691 -0.268601 -1.08178 -0.332372 -0.159883 0.456761 +production -0.274973 1.90412 -0.343718 0.475909 -0.0569883 0.052523 -0.616163 -0.450605 0.460992 0.40109 -0.0327206 0.299047 -1.14758 1.49866 0.36046 0.232942 0.106752 0.415805 0.208511 0.243944 0.339809 0.768109 -0.358064 0.0973121 -0.0531732 0.130284 0.753866 -0.576128 -0.317507 0.541839 -0.817458 0.397642 -0.00698503 0.256456 0.0210754 0.599144 -0.494022 0.337725 0.264965 -0.600379 0.17824 1.73749 1.0337 0.947327 -0.951984 0.167722 -0.657714 -0.112181 0.806015 -0.50033 +living -0.193605 0.0772098 -0.694683 0.614103 -1.01415 -0.329829 -0.316911 0.11323 -0.0659205 -0.760339 0.361963 -1.04574 -0.180975 -0.405177 0.271608 -0.323018 -0.717489 0.356389 0.48986 0.682467 0.840109 0.451026 0.228198 0.671183 -0.636567 0.888292 1.28964 0.102156 0.0290747 -0.643734 -0.162301 0.283976 0.692147 0.0660256 -0.1143 0.207456 0.275222 -0.616449 0.0122999 -0.66278 0.327595 -0.192319 0.262438 -0.648635 -0.203084 0.0706451 -0.219125 0.905558 0.897252 -0.799843 +together 0.401002 0.149896 0.100556 0.257341 -0.973081 -0.469704 0.14481 0.0536582 -0.286673 -0.0925949 -0.107487 -0.446731 -0.217109 -0.116816 0.38221 -0.120896 -1.00576 0.288289 -0.342794 0.154155 0.488864 0.594862 0.076437 
0.524865 -0.676976 0.395309 0.0190877 0.524784 -0.413909 -0.0263051 0.350675 -0.593491 0.969349 0.318286 0.111174 0.0918833 -0.180406 0.13076 -0.195739 -0.735981 -0.31708 0.283722 0.662299 0.0547469 -0.189359 -0.163868 -0.528388 0.556495 0.561332 -0.250659 +million -0.315963 0.359216 -0.24565 0.573187 -0.0749004 -0.359595 -0.865887 -0.926042 -0.340614 -0.173164 -1.4799 -0.584595 -0.0857595 0.461755 0.11158 -0.555054 -0.988057 0.168193 0.0251745 0.651333 0.293087 0.590239 -0.511916 0.63332 0.117424 -0.772228 1.76192 -1.19565 0.790163 0.283169 -0.546166 0.323462 0.574857 -0.581118 0.631099 0.712218 0.467011 0.0830857 -1.0636 -0.670726 0.228075 -0.222388 0.162533 0.350516 -1.32683 0.288631 -2.28052 -0.741151 0.511319 -0.487867 +union -0.258942 -0.78774 0.038633 0.0934766 -0.684193 0.592996 0.474909 0.117986 1.0371 0.945896 -0.01888 0.0103316 0.568666 -0.0633498 0.0334602 0.646902 -1.03077 0.922649 -0.0269184 -0.189212 -1.12985 0.521482 -0.0331433 -0.0226369 -0.296357 1.04095 1.25138 -0.630133 -0.665826 -0.534545 0.455021 -0.161414 0.728783 -0.0463326 -0.174609 0.289657 0.555022 1.49011 -0.602199 -0.260118 -0.456634 0.830202 0.384832 -0.316586 0.321036 -0.244599 -0.673559 0.470176 0.352602 0.81172 +live -0.0736485 1.07596 -0.115966 0.325346 -0.755977 -0.665698 8.44158e-05 -0.654419 -0.0173017 -0.776935 0.496755 -0.847952 -1.0985 -0.062702 -0.372904 -0.22779 -0.594563 -0.543313 -0.104802 0.327036 0.405787 0.933644 0.423062 0.0539129 -0.429393 0.581442 0.542257 -0.306049 -0.715471 0.0935172 0.233966 0.0406769 0.566842 -0.261241 -0.42733 -0.884577 0.220494 -0.516447 0.0965671 -0.516607 0.491406 -0.448222 -0.636512 -0.0439766 -0.199896 0.598722 -0.771627 -0.149031 -0.0612553 -0.608574 +founded 1.22103 0.102814 -0.104771 0.566836 0.0301239 0.901136 0.0309042 0.151956 0.0594123 0.129163 0.26461 -0.340743 -0.719805 -0.090514 -0.574636 -0.804398 -1.43882 0.153795 0.601125 0.477993 0.385399 0.153398 -0.992896 0.374778 -0.67836 0.648442 1.3928 -1.72966 0.212237 0.0351643 1.11081 0.11495 -0.177169 0.161732 -0.0526322 0.283091 0.211225 -0.369462 -1.00142 -0.177666 0.462223 1.03286 1.22119 -0.252202 1.20489 1.15989 -0.177626 0.458253 0.513418 0.693267 +short -0.237715 0.989902 -0.511788 0.297368 -0.547836 -0.656513 -0.0567709 -0.37039 0.28117 0.498694 0.273444 -0.110709 -0.0566453 0.185868 -0.295111 0.881456 -0.405249 -0.533976 0.173117 -0.0431459 0.392919 -0.840122 0.982554 0.487014 -0.377774 0.120602 0.00728218 -0.171389 0.319503 -0.479043 -0.516533 0.287464 0.526295 0.0670287 0.159956 0.184534 -0.038566 0.898133 -0.206744 -1.0494 -0.422849 0.00912336 0.502982 0.17032 -0.352133 0.100952 -0.541799 -0.0271171 0.104966 0.262419 +per 0.869027 0.394974 -0.185283 0.476124 -0.713015 -0.574883 -0.339323 -0.139663 0.674249 0.712664 -0.532007 -0.691903 -0.120989 -0.637008 -0.315792 -0.498199 -0.890891 -0.0905378 -0.283229 0.212428 0.0352099 0.477964 -0.244929 -0.250486 -0.073638 -0.473878 1.53267 -0.715874 1.75395 -0.5602 -1.44108 0.0158487 0.900547 -0.475305 0.213221 0.416931 0.211951 0.69914 0.224752 -1.05392 0.0637457 0.174649 0.044799 0.21672 -0.81337 -0.377609 -0.859362 -0.587091 -0.356763 -0.26433 +award -0.468496 0.630408 -0.686756 1.05918 -0.904874 -0.533597 -0.674328 -0.539377 1.79226 -0.194912 -0.548262 -0.362511 -0.876244 -0.119428 0.505243 0.0120306 -0.235198 0.58463 -1.11482 -0.834435 0.270404 -0.0551463 -0.306198 -1.39628 -1.05704 -0.594939 1.80689 -0.937249 0.292395 0.0732242 -0.261291 1.33963 0.489747 0.90477 0.650255 -0.748573 -0.862026 -0.0696153 -0.649561 0.456917 0.0282681 -0.644529 2.15055 
0.917937 0.148271 0.353812 -1.18753 -0.24161 0.0400855 1.04763 +works -0.440887 0.274449 0.310741 -0.545309 -1.05437 -0.287756 -0.24703 0.414145 -0.355489 0.182012 -0.0733359 -0.205182 -0.716521 0.467399 -0.307776 0.154817 -0.138222 0.418679 0.989205 0.230705 0.417684 -0.185087 0.484372 -0.0669593 -1.51322 0.548136 1.18245 0.0140659 -0.677427 0.239663 -0.17246 0.450204 -0.104957 0.271387 1.02937 0.00808442 0.158506 0.00450883 0.0431099 0.110374 -0.588327 0.649561 1.30625 0.378396 0.14872 0.114999 0.255701 -0.313771 -0.00185261 -0.121713 +france -1.15448 0.0993029 0.159567 -0.0659034 0.581006 0.307439 0.573135 0.151737 0.222503 0.606384 1.44246 -1.35461 0.0259236 0.905899 0.662092 -0.546519 0.0732431 0.296208 -0.73963 1.03837 -0.782984 -0.353944 1.10032 0.462236 -0.862093 -0.373852 1.83375 -1.04574 -0.104014 -0.035197 0.622964 -0.929916 -0.184341 -1.13051 0.074013 -1.96614 -0.256362 1.02924 0.480533 -0.515015 -1.12736 1.3262 0.501277 -0.192544 0.561192 0.406193 -1.26955 0.323219 1.50861 -1.04381 +once -0.0556259 -0.0291859 0.298271 0.158916 0.0442576 -0.403429 -0.0827274 -0.0743452 -0.355459 -0.000284354 0.051531 -0.141358 -0.601972 0.357807 -0.23571 0.397655 -0.373106 -0.228977 -0.118114 -0.131823 0.335314 0.869296 0.397477 0.689859 -0.228575 -0.158211 -0.129028 -0.119544 0.0695665 -0.940717 -0.365843 -9.75007e-05 0.0647256 -0.463787 -0.197744 -0.303984 -0.233284 0.481075 -0.406339 -0.23208 -0.298935 -0.224968 0.287886 -0.0179025 -0.271868 0.0622708 -0.570396 0.146791 0.144763 0.695379 +run 0.128923 0.642947 -0.110358 0.162603 -0.0683672 -0.418448 -0.738699 -0.678209 -0.20483 0.264566 0.0425902 0.0966369 -0.525001 -0.275608 -0.107873 0.327758 -0.523582 0.319965 0.136587 -0.103833 0.816295 0.471538 0.0497742 0.078824 -0.379225 -0.0281378 0.426901 -0.472139 0.103754 -0.683017 0.12081 0.251149 0.873845 -0.0233815 0.187709 0.00521553 0.0627707 1.10575 0.182951 -0.420413 0.492847 0.052443 0.0286479 -0.0490928 0.726058 -0.0378748 -1.04321 -0.631524 -0.834796 0.410724 +northern -0.733299 -0.877776 -0.613392 -0.483805 0.438452 -0.0491687 0.291911 0.530102 1.25968 0.306331 0.684717 -0.0892093 -0.0750581 0.832704 -0.316319 -0.270398 -1.61213 0.496019 -0.00891771 0.521817 0.300518 -0.300046 0.218081 0.752821 -0.395554 0.660557 0.143047 -0.566853 -0.25928 0.574737 -0.179111 0.0728596 -0.071233 -0.181654 -1.07351 -0.663239 0.184872 0.779308 -0.792092 -0.827711 -0.57602 0.357497 -0.209094 -0.185364 0.102558 0.544405 -0.894228 -0.689838 0.167036 0.59482 +level -0.260184 -0.00737179 -0.228507 -0.185554 0.165712 0.272984 -0.545705 0.177874 0.276719 0.830916 0.273695 -0.708519 0.645779 -0.637743 -0.978545 -0.0923177 -0.473389 0.906985 0.426486 -0.379735 0.607584 1.16973 0.534614 -0.289461 0.113108 -0.249459 -0.190028 -0.531071 0.526463 -0.117889 -0.611783 0.9332 0.500301 -0.0531757 -0.27122 -0.649231 0.246654 -0.290058 0.395757 0.00146458 -0.594375 0.025518 0.318598 0.155824 -0.328143 -0.231772 -0.991683 -0.509539 0.12867 -0.0189652 +california 0.525972 -1.47985 0.438205 -0.502129 0.0467808 -0.679769 0.775709 -1.60098 1.54599 0.556909 0.471707 -0.480189 -2.1286 0.972589 0.534867 0.163137 -0.0422428 -0.783339 0.582544 -0.217241 1.03089 0.26695 -0.930259 0.972949 0.478304 -0.000505195 0.662371 -0.380451 -0.096727 0.211985 0.297978 2.87835 -0.177137 1.27621 -0.751269 0.649562 -0.088742 -0.0271817 0.763113 -0.710959 -0.421566 -0.898462 0.99277 -0.974265 0.288511 0.421953 -2.37428 -0.86423 0.997382 -0.857957 +department 0.196928 -0.871926 -0.772841 0.0586178 0.852451 1.37571 -1.15478 -0.0479629 1.21117 0.296053 
0.274949 -0.402975 -0.594877 -0.208127 -0.654939 -0.0715415 0.202106 1.04835 0.535171 1.08831 -0.446124 -0.116882 0.40703 0.288946 -0.442603 0.772628 1.0679 -1.19287 -1.29603 -0.0699637 -0.509168 0.821092 -0.871974 0.0713739 0.210272 -0.035921 -0.574107 0.256254 0.0445827 -0.676119 -1.19859 0.943205 0.440825 0.480033 -0.200786 -0.567404 -1.55347 -0.227146 0.889669 0.331738 +office 0.275176 -0.337887 -0.642061 -0.460015 0.108607 0.305142 -0.944097 0.157651 -0.154045 -0.528121 0.0206768 -0.0846297 -0.859313 0.20389 -0.376094 -0.199025 0.950452 1.11589 -0.466203 0.53018 -0.0849433 0.950975 0.469348 0.244886 -0.186181 1.00226 0.645203 -1.68837 -0.230219 -0.519681 -0.611259 1.03305 -0.237892 0.668259 0.946802 0.264926 -0.00875717 0.775284 -0.572244 -0.526323 -1.15662 -0.0790887 -0.15033 -0.170087 -0.673539 -0.190103 -1.00479 -0.449369 0.364688 0.800848 +battle 0.183168 -0.511949 -0.757605 0.229254 0.480452 -0.333681 0.908002 1.10095 0.308635 -1.27479 0.588254 -0.0544197 -0.922469 -0.079553 0.386565 0.142402 0.0458508 0.603939 0.433976 0.011265 0.732516 0.891227 0.449056 -0.919991 0.420867 -0.131302 0.142459 -0.0779419 0.188717 -0.12706 0.28001 -0.20048 0.162695 -0.809567 -0.477131 -1.65486 0.597669 1.97726 -0.0127129 -0.612611 0.646247 -0.00961514 2.14895 0.244407 -0.600406 0.372599 -1.08829 -0.034294 0.280864 0.564169 +present 0.33034 0.0135036 -0.0390949 -0.112814 -0.100488 0.0932472 -0.228806 0.329928 0.204042 -0.402661 -0.122051 -0.794408 -0.624106 -0.168578 -0.383043 0.127738 -0.831266 0.167774 -0.356837 0.102477 0.0415459 0.3286 0.211667 0.453102 0.0298429 0.742974 0.553781 -0.0861776 0.464179 -0.573295 -0.000883971 -0.325845 0.268754 -0.268893 -0.487477 -0.197142 0.106003 -0.316055 -0.13514 -0.498371 0.24586 0.335448 0.464935 -0.920024 -0.476702 0.161589 0.00622936 -0.107924 0.106418 -0.0444317 +total 0.223176 0.0806392 -0.570364 0.0735678 -0.732947 -0.271141 -0.391653 -0.170551 0.0714081 0.322983 -0.249646 -0.772629 0.0256454 -0.823064 -0.0579498 -0.233349 -1.10165 0.942307 0.139921 0.260591 -0.198077 -0.00339847 -0.0899398 0.0403847 0.240011 -0.533314 0.938754 -0.986583 1.12111 0.267533 -1.08325 0.309512 0.762423 -0.849519 -0.422305 0.315823 0.0993235 0.506349 0.129555 -0.453182 0.437284 -0.00591093 0.545861 0.262011 -1.03763 -0.300607 -1.43187 -0.398317 0.326645 -0.484564 +list -0.691807 0.218659 -0.284618 -0.70706 -1.79886 0.366548 0.181254 -0.32377 0.0370101 -0.858195 0.0855021 -0.27009 -0.39006 -0.962643 -0.24811 -0.230629 -0.680231 1.04402 -0.0135599 -0.232762 0.00183892 -0.687571 -0.430322 -0.201462 0.267227 0.134821 0.948514 -0.886612 0.253193 0.430406 -0.449596 0.720515 -0.205441 -0.637352 -0.779798 -0.136796 0.166809 0.301019 0.367215 0.0685452 -1.27332 -0.787149 0.653695 -0.00724863 -0.382628 0.508204 -1.18792 -0.870715 -0.685296 -0.0432082 +full 0.276703 0.936972 -0.168359 0.276695 -0.632271 -0.406275 0.211851 -0.186267 0.184391 0.616478 0.0660441 -0.726235 -0.324039 -0.284184 -0.0910628 0.53039 -0.283269 0.0748087 -0.190642 -0.222245 0.408708 0.00355412 0.904956 0.444177 0.297103 -0.264199 0.0750924 -1.05443 0.0822795 -0.229642 -0.118581 0.503385 0.267946 0.0926404 0.522902 -0.164172 -0.0834142 0.0260627 -0.236495 -0.269518 -0.496289 -0.0786146 0.105242 0.0139347 -0.617102 -0.0845037 -0.636856 0.0609199 -0.0624295 0.178847 +right -0.317444 -0.137523 0.941465 -0.281036 -0.25864 -0.324674 -0.127503 -0.0528993 0.45509 0.0604326 0.257364 0.410403 0.431716 -0.78114 0.332755 0.485313 -0.374298 1.07015 0.493635 -0.0381457 0.475193 0.80924 1.28901 -0.152939 0.297402 0.667473 
-0.265391 -0.561113 0.270281 0.073733 0.153345 0.332839 0.475014 -0.315372 0.45813 -0.142405 -0.410748 0.228938 -0.0120215 0.203067 -0.474839 -0.197074 -0.0418853 -0.634423 0.66609 0.728455 -1.32384 0.326012 -0.248789 0.464573 +election -1.18867 -1.25386 -0.820387 -0.897236 0.158855 -0.14795 -1.41236 0.0630121 0.44485 -0.656544 0.586432 0.0367264 -1.53434 -0.580975 -0.574208 -0.87634 0.194358 1.19853 -2.70448 -0.432502 0.542594 1.64074 0.0937107 -0.323927 -0.420221 1.35018 0.745663 -1.01511 0.256691 -1.04938 -1.04145 0.98217 1.03527 0.0687486 1.15096 -0.745914 1.04133 2.29815 -1.39399 0.0377807 -0.062719 0.514658 0.605184 -0.605886 0.524304 0.410304 -1.68571 0.334491 -1.05942 0.0190131 +site -0.0536643 0.0727909 0.130793 -0.198023 1.04825 0.61064 -0.254424 0.198421 -0.0865886 -0.257603 -1.52379 -1.08973 -0.993457 0.420468 -0.949805 -0.58468 -0.927465 0.283379 0.384963 0.779494 0.775393 1.29909 0.230303 0.088667 0.238236 1.52118 0.941692 -0.305038 0.525699 0.232521 0.638467 0.452039 -0.910831 -0.0701807 -0.976578 0.12018 0.458149 0.368757 -0.0627569 -0.276256 -0.0111305 -0.6141 0.817902 0.330509 -0.838507 0.163023 -0.67958 -0.297107 -0.253642 0.683359 +married -0.0689002 -0.706681 -0.345083 0.965791 -1.36575 -0.0954339 -0.778015 1.2666 -0.227224 -1.15537 1.2428 -0.417328 -0.520468 -0.464833 1.22797 -0.283017 -0.21399 -0.00519791 -0.845005 0.558876 1.02751 0.492074 0.199903 0.50915 -1.24818 0.368516 2.03303 0.59318 0.65515 -0.78725 -0.063216 0.224453 0.567458 1.15365 0.364761 -0.0604312 0.368946 -0.732558 -0.502838 -1.2048 0.52368 0.438507 0.546357 -0.589858 0.996063 1.23293 -0.533677 -0.0434453 0.84006 0.329421 +common -0.729985 0.512713 0.210074 -0.343543 -0.426471 -0.15084 0.803902 -0.129798 0.771082 0.181479 0.433989 -0.457541 -0.165051 -0.28581 -1.01393 -0.161633 -0.295004 0.178431 0.454124 -0.0203171 0.932093 0.0672957 -0.112283 0.313848 0.0183615 0.517999 0.150419 -0.417615 0.659066 -1.12567 -0.364424 -0.357082 0.114275 0.221592 -0.431496 0.108429 0.213202 0.156802 0.402054 -0.481588 -0.822363 0.483272 0.189992 -1.01835 -0.278187 0.569718 0.107692 0.0357329 -0.222035 -0.0579524 +control -0.0521711 0.279575 -0.210161 0.204439 0.343623 -0.346308 0.220092 0.96826 0.567477 0.175994 0.410094 -0.329045 -0.183724 -0.178908 -0.462595 0.166321 0.398696 0.877942 0.601846 0.37686 0.676423 1.01969 0.122502 0.308378 0.76727 0.147218 -0.235229 -0.87301 -0.738201 0.141198 -0.113466 -0.236541 0.1719 -0.722441 0.635291 0.211401 0.266874 0.253711 -0.00215813 -0.490444 -0.264913 0.554005 0.572077 -0.219539 -0.27685 -0.457691 -1.47776 0.0317446 -0.140586 -0.486193 +considered -0.583548 0.124632 -0.0494568 0.366663 -0.0859124 -0.484586 0.327526 -0.0709222 0.435191 -0.140859 0.0273272 -0.184876 -0.476759 -0.138814 -1.25524 0.606268 -0.451824 0.544853 0.41026 -0.204781 0.129872 0.134724 -0.462439 0.233659 -0.309397 -0.145335 0.762236 -0.0708878 0.776296 -0.862294 -0.759178 -0.151059 -0.0707016 -0.179991 -0.249827 -0.0718335 0.0159969 -0.494783 -0.117668 0.36646 -0.485893 0.263122 0.503127 -0.850848 0.369215 1.49491 -0.177482 0.0591131 0.00326464 0.448329 +services -0.00570939 0.854634 -0.222962 -1.19096 -0.48326 -0.0168001 -0.306642 1.41239 0.613022 0.256311 -0.793199 -0.809229 0.0547714 -0.201435 -0.364514 -0.829355 0.397545 0.785496 0.497696 0.577039 0.69916 0.677416 -0.432551 0.371469 -0.237376 0.390806 1.40537 -0.991308 -1.19219 -0.558549 0.444571 0.608984 0.755098 0.66861 0.364906 0.083378 -0.0475064 0.63298 -0.0375972 -0.639045 -0.86795 0.399192 -0.248436 0.136903 -0.0782894 0.327146 -0.828481 
-1.14641 -0.421512 0.762455 +important -0.662549 0.0267819 -0.155467 0.155668 -0.560536 -0.0277056 0.172311 0.080768 0.336329 0.518205 -0.157794 -0.755798 -0.468078 0.76498 -1.50544 -0.234461 0.123088 0.730659 0.976823 -0.105222 -0.131658 -0.0413049 -0.231928 -0.0319103 -0.344972 0.379421 0.607122 -0.0608942 0.555898 -0.902161 -0.63702 -0.331748 -0.06112 -0.0984183 -0.358327 -0.498931 -0.438735 0.26091 -0.446382 0.186926 -0.0510926 0.126961 0.993899 -0.429283 -0.168091 1.15512 -0.0460776 -0.0990669 -0.376292 0.21216 +average -0.331778 0.292495 -0.474269 0.916676 -0.10342 0.445969 -1.01369 0.343342 1.07479 0.174191 0.394938 -0.696921 0.40779 -1.03993 -0.341338 -0.322175 -1.21228 0.706488 0.470955 0.138232 0.299645 0.763676 0.089362 -0.64934 0.105454 -0.357808 1.95684 -0.463211 1.93822 -0.833894 -1.76625 0.999496 1.77076 0.0980164 0.150335 0.246425 -0.236927 0.645483 0.815887 -0.535505 0.990448 -0.559335 0.259031 -0.368119 -1.19352 -0.0315851 -0.9053 -0.419402 -0.0666494 -0.256517 +language 0.427606 0.850354 -0.520686 0.60141 0.0137036 1.42918 0.826065 0.0788638 1.16895 -0.919342 0.0927354 -0.168791 0.698073 -0.287125 -0.823997 0.270686 -0.671787 0.403066 -0.386117 0.31721 0.136207 1.02915 0.961653 0.149281 -1.81463 1.10263 0.762165 -0.039287 -0.471152 -0.364749 -0.530621 1.34755 -0.442288 0.0265652 -0.144905 -0.171788 -0.809943 0.0744116 -0.109037 -1.1252 -0.918424 1.19361 0.819244 -0.331295 -0.33752 0.950354 0.0242349 0.537951 -0.927806 -1.1783 +re 0.44945 0.546933 -0.0992291 -0.278423 -0.494159 0.253695 -0.620984 -0.278414 -0.0383348 0.171917 -0.154796 -0.00565405 -1.17284 0.196253 0.0257776 0.485871 -0.247447 -0.138152 -0.926156 -0.184396 0.444397 0.857107 0.783834 0.545273 -0.324079 -0.000114322 0.0966218 -0.291384 -0.550075 0.164332 -0.388315 -0.266418 -0.320696 -0.65559 0.39151 -0.727188 -0.288201 0.83748 -0.193099 0.486143 -0.451068 0.605526 -0.231704 0.100668 0.189527 0.312575 -0.973363 0.453676 0.454638 0.258589 +term -0.623584 -0.0316263 -0.0756826 0.286122 -0.611006 0.473821 -0.06619 -0.538178 0.94755 0.256656 0.325213 0.118121 -0.10172 0.195277 -1.15188 0.0682779 0.578666 -0.260112 -0.189858 -0.840386 0.816569 0.664392 -0.237425 0.594711 0.489622 1.12211 0.412613 -0.290692 0.652692 -0.747596 -0.908762 0.531415 0.0472502 0.0608301 0.481596 -0.549715 0.628673 0.729363 -0.264128 -0.471033 -1.30858 0.720361 0.269905 -1.07567 -0.384668 -0.246164 -0.0711273 0.513775 -0.711275 0.304056 +story -0.321968 -0.011187 -0.184334 1.0306 -0.144702 -0.11446 -0.717819 -0.524179 -0.756994 -1.5638 -1.2006 0.380538 -0.781891 0.37158 0.3309 1.05071 -0.43392 0.32363 0.620514 0.308598 0.983083 -0.30047 1.13213 -1.02018 -0.781567 0.981547 0.555283 -0.303264 -0.0966855 -0.637588 -0.241395 0.979082 -0.518333 -0.363774 0.183348 -0.240541 0.0317006 0.91606 0.053643 -0.856577 -0.587588 0.133586 1.17588 0.113943 -0.67468 0.433635 -0.418832 -0.180181 0.0447332 0.166604 +example -0.652105 0.929333 -0.018494 -0.223432 -0.803181 -0.118389 0.575358 -0.555528 0.638365 -0.103499 -0.190026 -0.617969 -0.249842 -0.126747 -1.00342 0.325797 0.526549 0.923034 0.351693 -0.295003 0.447771 0.514425 0.0638115 -0.0438383 0.318695 0.617096 0.904055 -0.172706 0.814011 -0.403931 -0.600774 -0.170716 0.191376 -0.221621 -0.0128681 -0.0355334 -0.0244869 0.134606 0.48618 -0.436996 -0.865935 0.207085 0.929641 -0.52797 -0.415172 0.364333 0.173357 -0.00973246 -0.438432 -0.166082 +modern 0.0848633 0.246317 0.336364 0.0687165 -0.458887 -0.139359 0.38069 0.092769 0.599073 -0.102861 -0.0864124 -0.0908595 0.0559727 0.895697 -1.31731 0.29711 
-0.663053 0.163729 0.871583 -0.0261757 0.501878 -0.00659625 0.507436 0.420415 -0.43185 0.0251775 0.462096 -0.804579 0.398474 -0.103178 0.00804395 -0.112169 -0.794917 0.00306774 0.0121576 -0.498923 0.248478 0.317792 0.0140862 -0.329648 -0.372816 0.547437 0.67362 -0.441062 -0.245843 0.0440343 0.191737 0.161768 0.0957245 -0.395062 +current 0.112019 -0.0262533 0.0623646 0.183807 -0.158637 0.394321 -0.600383 0.307337 0.26792 0.431041 -0.120631 -0.296498 -0.0793903 0.0559805 -1.05596 0.147933 -0.812171 -0.0179349 -0.25627 -0.547425 0.497257 0.266971 -0.341926 -0.0418595 0.804259 0.683706 0.15351 -1.28756 0.0804692 -0.116031 -0.189211 0.380504 0.104827 -0.054912 0.107111 -0.0269917 0.0411613 -0.177884 -0.0583945 -0.0485656 -0.847061 0.00683978 0.201399 -0.130497 -0.113661 0.0068856 -0.411207 -0.468812 -0.35576 0.0363806 +association -0.367521 -0.130786 0.666474 0.805798 -0.534498 0.807491 0.48044 -0.286357 1.37453 0.581337 -0.170888 -0.632831 0.127197 -0.779324 -0.370734 -0.571495 -1.00387 0.202594 0.397072 -0.680939 -0.214724 -0.165733 -0.609443 -0.00580769 0.0613954 1.02472 1.37085 -0.730637 -0.355787 -0.0485248 0.542153 0.712106 0.180912 0.552416 -0.048975 -0.180832 -0.118548 0.268151 -0.855622 0.152454 -0.208645 0.623146 0.510518 0.188284 0.422065 -0.0143612 -0.790758 0.0901143 0.314313 1.11041 +returned 0.839511 0.00657211 -0.0450364 0.0866022 -0.0371253 0.0616309 0.230668 0.0719896 -0.0910989 -0.201377 1.72424 0.242663 -1.12303 1.18619 0.192762 0.177201 -0.36909 0.696556 -0.236157 0.128416 0.50994 0.415988 0.75904 0.558482 -0.0689839 -0.437283 1.55853 0.152705 -0.184703 -0.539547 0.533592 0.0565411 0.502043 -0.120522 0.772969 -0.670408 0.746122 -0.449081 -0.734408 -0.565326 0.526638 0.0973763 0.0424898 0.0801311 0.618832 0.499995 -1.39277 -0.365562 -0.442831 0.821373 +rock 0.398457 0.0328175 -0.745372 0.568411 -0.704152 -0.782054 1.04203 -0.790122 0.461514 -0.0285783 0.499917 -0.710635 -1.2389 0.776882 -0.537313 0.544417 -2.09694 -0.918075 0.646689 -0.0196678 0.929667 1.62013 0.196741 -0.623925 0.288358 1.4443 0.0489315 -0.156306 0.397294 1.00695 0.0977966 0.796438 0.0989933 0.591376 -0.028586 -0.562523 -0.159069 0.454492 0.0225863 0.742617 0.0951943 -0.201455 -0.596033 -0.0102468 -0.403839 0.5956 -0.364246 0.162344 0.685553 -0.222667 +southern -0.240943 -0.838527 -0.0517579 -0.0478815 0.378379 0.0863848 0.201485 0.122061 1.44019 0.149234 0.65889 0.059386 -0.172921 0.825264 -0.36694 -0.145253 -1.51697 0.226281 0.53541 0.902115 0.27511 -0.272023 -0.0583711 0.682906 -0.429682 0.216109 -0.0403501 -0.533291 -0.283328 0.638436 -0.205114 0.540025 0.192674 -0.0433713 -1.07348 -0.497218 0.421727 0.682149 -0.650556 -0.488892 -0.488909 0.229661 -0.121021 -0.464216 0.29815 0.446283 -0.946854 -0.594081 0.188348 0.481422 +days 0.0699765 0.101743 -0.599379 -0.319248 -0.320622 -0.359543 -0.160287 -0.416182 -0.1063 -0.85201 0.281487 -0.657822 -0.799622 0.117104 0.128629 -0.204135 -0.420946 -0.411356 0.0617032 0.172043 -0.448412 0.625677 0.594958 0.510726 0.265819 -0.249251 0.883961 0.316781 0.223087 -0.488622 -0.138549 0.399532 1.11466 -0.410441 0.438355 -0.622299 0.802295 0.54659 -0.219606 -1.09426 -0.209091 0.372133 0.346022 0.505164 -1.26281 -0.0506126 -0.756455 0.155826 -0.320134 0.780176 +himself -0.017362 -0.201635 0.234933 0.639371 -0.652105 -1.13216 0.446334 -0.0855506 -0.337405 -0.814229 0.923295 -0.183314 -0.0893835 0.17004 -0.0425691 0.618219 0.545259 0.115971 0.224113 0.356666 0.26385 0.75494 0.936766 0.54098 0.0864426 0.367641 0.467108 -0.00365083 -0.47529 -0.343405 -0.481873 -0.0280208 
-0.18564 0.0377893 1.27113 -0.547053 0.0698092 0.492536 -0.449584 0.0466258 -0.5327 -0.280656 1.25063 0.467813 0.591738 0.313722 -0.601198 0.319948 0.00652324 0.334639 +canada -2.13427 0.0574663 -0.73323 -0.698725 -0.917755 0.884348 -0.401799 -0.366113 1.35772 -0.356295 0.915099 -1.4979 -0.132747 0.538685 0.198227 0.537232 -1.24696 0.421028 -1.2281 -0.0123697 0.509058 -0.320746 -0.570135 0.298332 0.40643 0.240642 1.35459 -0.743261 -0.0648584 -0.586752 1.91594 0.16192 -0.603669 -0.466912 0.502428 -1.0723 -1.04189 0.323019 -0.0620758 0.056846 -0.575322 0.582117 -1.13956 -0.463759 0.0839747 0.0633817 -1.8433 0.023349 1.72543 -0.958456 +never -0.359217 0.207045 0.696427 0.27357 -0.252764 -0.584755 0.0157454 0.0347132 -0.525893 -0.656737 -0.0971634 -0.198609 -0.286303 0.176634 -0.123601 1.21727 -0.249504 -0.287748 -0.262591 -0.376544 0.215901 0.858344 0.748071 1.20616 -0.285746 -0.39995 0.467214 0.0681785 -0.148136 -0.414838 -0.628901 0.236578 -0.238302 -0.491124 0.192542 -0.215515 -0.215406 0.654727 -0.202068 0.0957504 -0.412485 0.000761114 0.231348 -0.0834677 -0.200996 0.7087 -0.961733 0.612743 0.51491 0.646397 +director 0.130819 0.286946 -1.03132 1.31135 -0.611076 0.485144 -0.904538 -0.186523 0.530807 0.0580128 0.392543 0.135202 -0.645854 0.526714 -0.739733 0.351188 0.680834 0.306405 -0.204112 0.430008 -0.460623 0.386633 -0.0068965 -0.415903 0.0111068 0.88652 1.2422 -0.904373 -0.927724 0.688075 -0.60291 1.10387 -0.240346 0.974527 0.689214 -0.066767 -0.498769 -0.396464 -1.0314 -0.410277 -0.632546 0.411126 1.28955 1.18508 0.624799 0.17569 -1.13843 -0.435766 0.674648 0.601829 +san 1.26536 -2.17661 0.576866 -0.353269 -0.0385725 -0.693059 1.32898 -0.884683 1.27523 0.83315 -0.693918 0.176334 -2.86442 1.19164 0.13939 -1.08879 -0.472915 -0.20021 1.08981 0.171969 -0.163934 0.733153 -0.49162 0.944768 0.330524 -0.00347814 0.900848 0.41844 -0.180649 0.00645262 0.038442 2.46368 0.137357 0.164587 -0.375978 0.168636 -0.695748 0.275159 0.457758 -0.796148 -0.875729 -1.14684 1.95847 0.589692 1.00137 1.2704 -1.48705 -1.8525 -0.0250948 -0.785855 +open -0.265279 0.430349 0.971877 0.778728 0.337312 -0.157198 0.0236307 -0.625683 -0.249157 1.11182 -0.123644 -1.36066 -0.618549 -0.0386306 -0.390538 -0.21752 -0.69296 0.622791 -0.153692 0.376789 0.173787 0.568241 0.547537 -0.640186 -0.436952 0.279428 0.0672315 -0.143194 0.211542 -1.08787 0.73039 -0.133879 0.0578596 -0.0740465 -0.400191 0.0431549 0.461932 0.589512 0.0924675 -0.0350151 -0.360023 -0.0928792 -0.307472 -0.296187 0.183047 0.39619 -0.658988 -0.295861 -0.288606 0.555056 +others -0.0997506 -0.195715 0.363584 -0.220646 -0.572721 -0.545174 0.474905 -0.155357 -0.209781 -0.810866 -0.0891247 -0.404966 -0.0765293 -0.249939 -0.501704 -0.278377 -0.416118 0.0425215 0.462891 0.134691 0.365722 0.500813 -0.318905 0.0834712 -0.51966 0.0792956 0.382741 0.515589 -0.629889 -0.0770089 0.0110894 -0.477442 0.64229 0.281756 0.411983 -0.0595422 0.0934492 0.127882 -0.0225239 -0.159966 -0.582639 0.49071 0.653756 0.145587 0.126982 0.433247 -0.634059 0.0471398 0.257697 -0.27843 +society -0.458977 0.00442826 0.185869 -0.0145071 -0.832971 0.610898 0.419968 0.0349235 0.961125 -0.588067 -0.556826 -0.914687 -0.0549111 0.110094 -0.345526 -0.481664 -0.151842 0.0763039 0.568052 -0.23564 0.361628 0.318396 0.254245 -0.537283 -0.285654 0.895978 1.15042 -0.628516 -0.881282 -0.430079 0.290479 0.688802 -0.314143 0.335619 0.227176 -0.0689579 -0.0842852 0.150789 -1.39441 0.0291134 0.039934 0.624053 1.413 -0.773987 0.342205 -0.223098 -0.289493 1.1002 0.521469 0.573003 +free 0.393506 0.601081 0.0759394 
0.251769 -0.399985 -0.403206 0.669239 -0.45971 0.741926 1.14112 -0.0869234 -0.812273 -0.0260163 -0.563272 -0.152772 -0.0055674 -0.426623 0.39102 -0.249792 0.44868 0.275727 0.560015 -0.0462602 0.138062 -0.18964 0.409744 0.592344 -0.551619 -0.465448 -0.869691 -0.0144618 0.110018 0.438089 -0.667842 0.741351 0.11947 -0.020815 0.404448 0.0968229 -0.179185 -0.317455 -0.736185 -0.0395606 0.0404651 0.21538 0.243356 -0.192412 0.401622 -0.0679582 0.127355 +route -0.0788311 -0.127883 -0.649679 -0.423249 0.95434 -0.4473 -0.732268 0.909111 1.0058 0.557089 -0.57772 -0.497649 -0.0470095 0.607776 0.201977 0.619352 -0.99409 0.536263 0.106306 0.92039 0.694637 0.708405 -0.31585 1.12728 -0.769329 1.17216 0.947537 0.780329 0.515027 -0.117105 0.3893 0.289839 0.76101 -0.219672 0.075233 -1.38268 0.630357 2.0681 0.762884 -1.19297 -1.16609 -0.204609 0.732705 -1.13459 -0.10187 -0.0210387 -0.625348 -1.47602 -0.436731 1.18084 +similar -0.037757 1.09584 0.537006 0.138034 -0.232607 -0.290036 0.669816 -0.555419 0.317289 -0.0918604 -0.0380606 -0.327719 -0.359443 0.0407509 -1.09068 0.3602 -0.369688 0.524049 0.0280634 -0.140744 0.919977 0.118442 0.527617 0.125845 0.667447 -0.0875411 0.398579 -0.0839204 0.365744 -0.389531 -0.209478 -0.54372 -0.216067 -0.13284 -0.348264 0.175945 0.30509 0.110659 0.044402 -0.47607 -0.673473 0.483492 0.0213483 -0.66888 0.0325646 0.490857 -0.434027 -0.504999 0.0718675 -0.568358 +america -0.694396 -0.203284 0.496205 0.293512 -0.518638 0.214622 0.552015 -1.46751 0.822304 -0.321642 0.601685 -0.524296 -0.276246 0.683236 0.509898 0.426549 -0.215379 0.0917038 0.46094 0.522858 0.768751 -0.660468 -0.336636 0.362155 -0.478975 -0.972099 1.18851 -1.17937 -0.583734 0.361614 0.760379 0.736674 0.218258 -0.540585 -0.610044 -0.154129 0.183761 0.753732 -0.26783 -0.340663 -0.938966 -0.242351 0.0520631 -0.802176 0.498409 0.44525 -1.41259 0.166118 0.623077 -0.44617 +usually -0.594911 0.88972 0.271509 -0.335526 -0.720691 -0.772408 0.617423 -0.492864 0.657903 0.244864 0.141945 -0.439909 -0.386911 -0.540814 -0.99099 -0.0584273 -0.22546 -0.082771 0.289182 -0.307889 0.486856 0.505703 0.307833 0.399274 0.348284 0.0527071 -0.241374 -0.0992541 0.452619 -1.0978 -0.599798 -0.144864 0.12137 0.189084 -0.538534 -0.289919 -0.289571 0.479225 0.119026 -0.744637 -1.41144 0.167471 -0.062932 -0.153044 -0.440762 -0.0154407 -0.305478 0.0967288 -0.0085637 -0.0526456 +red 0.189368 0.6194 -0.441326 -0.33416 0.60624 0.139473 0.0849445 -0.322863 1.66243 -0.505974 0.189489 0.489511 -0.427446 -0.344276 0.182877 0.0630317 -1.5417 -0.188193 0.235004 0.174539 0.429695 0.0785377 0.583896 -0.258005 0.896829 0.0462564 0.19262 -0.72406 -0.180823 -0.615782 -0.178747 -0.890873 0.944307 0.229862 0.207564 0.107878 -0.132943 0.698743 -0.344147 0.308594 -1.1418 -1.11803 0.516015 0.445215 -0.16358 0.459782 -0.429097 -0.169624 1.25871 0.217319 +royal -0.564894 0.704789 -0.181128 -0.463255 -0.336471 0.299379 1.2995 1.69948 0.859626 -0.820296 0.165924 -0.680681 -0.327427 0.96332 0.109116 -0.449119 -0.413966 -0.162671 -0.160195 0.126774 -0.627143 1.03123 0.984448 -0.476381 0.35975 0.032195 0.509955 -1.53496 0.380527 -0.647509 0.433533 -0.318843 -0.697175 0.567753 -0.104261 -0.885184 -0.262606 -0.8498 -0.611182 -0.202112 -0.151362 0.869823 1.09886 -0.099193 0.391114 -0.777664 -1.20229 -0.4121 1.06717 0.750872 +co 0.463205 0.336342 -0.0923461 0.2586 -1.30033 0.144995 -0.230662 0.180823 0.0797622 0.191756 -0.288578 0.0377428 -0.306853 0.239634 0.0580533 0.0120677 -0.346243 -0.572444 -0.382029 0.251571 0.821464 -0.0923136 -0.818252 -0.0712229 -0.2536 0.392656 
0.388179 -0.57951 -0.629559 0.172364 0.34645 0.28427 -0.0103352 0.319097 0.107085 0.503935 -0.448031 0.47493 -0.218073 -0.249866 -0.625071 1.15275 0.867045 1.05721 0.383372 0.522684 -0.537653 0.0185683 0.636257 0.914936 +continued 0.810058 0.395357 0.407308 -0.398123 0.100319 -0.0206087 0.299338 0.0811676 0.120431 -0.148325 0.640673 0.0529191 -0.564163 1.3902 -0.388588 0.242434 -0.405981 0.358056 0.309664 -0.288926 0.591208 0.308222 0.578679 0.769016 -0.0307673 -0.372573 0.574126 -0.150657 -0.350674 -0.430165 0.587014 0.152612 0.67694 0.307921 1.05864 -0.450122 0.804259 -0.11863 -0.4987 -0.198374 0.627362 0.962394 0.18049 -0.0816592 0.0101633 0.709222 -1.18959 -0.105074 -0.152809 0.45425 +design -0.754008 1.09011 0.670075 0.368533 0.167033 0.313197 -0.351615 0.461842 0.216094 0.510314 -1.01155 -0.156666 -1.02715 0.865334 -0.69083 0.845666 -0.111834 1.62992 0.0618561 -0.252432 1.22821 0.999608 0.757015 -0.432474 -0.289346 -0.39383 0.808113 -0.610959 0.397552 0.297611 0.86314 0.562642 -0.497707 1.02827 0.76303 0.5596 0.4251 0.336252 0.554512 0.320811 -1.11539 0.711746 1.57691 0.13878 -0.0739027 -0.436299 -0.355892 -0.133783 0.563345 -0.342242 +position -0.496184 -0.692909 0.533984 -0.366101 -0.278393 0.329079 0.0404314 0.176365 0.211537 0.180404 1.29419 0.0426864 0.367128 -0.10454 -0.538073 0.21174 0.207558 0.673657 0.0715964 -0.247569 0.130859 1.18587 0.661714 -0.11392 0.904329 -0.342739 -0.00483315 -0.344087 0.358417 0.0782883 0.00675923 0.819381 0.291417 0.312738 0.901933 -0.652914 -0.117891 0.311661 -0.547542 -0.00339797 -0.971464 -0.013675 0.621975 -0.0493084 -0.105567 0.444172 -1.27897 0.207408 -0.536049 0.854006 +appeared 0.276307 0.739209 0.0367216 -0.61249 -0.519005 -0.35777 -0.129482 -0.647367 -0.0806541 -1.32267 1.82825 0.800413 -1.23869 0.320395 -0.631986 0.207708 -0.210407 -0.406382 0.0947866 -0.477737 0.96596 -0.452427 0.84087 -0.627419 -0.447915 -0.0187974 1.23022 -0.32584 0.444868 -0.0729567 0.453913 0.26277 0.535133 -0.468567 0.226358 -0.405734 0.416375 -0.500833 -0.335218 -0.246647 0.10772 0.59785 0.526081 0.482968 0.201246 1.07301 -0.704812 -0.253902 0.556047 0.45331 +william -0.110168 -0.875233 0.524238 -0.031019 -0.648719 0.895432 0.97253 0.44967 0.413638 -1.17657 0.144866 -0.613956 -0.564654 0.544563 0.0249084 0.762663 -0.140608 -0.25156 -0.561164 0.986612 0.708949 -0.118445 0.145768 0.128741 0.00860821 0.541986 0.650016 -0.646318 0.310267 -0.22878 -0.74168 -0.279146 -0.0405209 0.968591 0.760044 0.219178 0.479072 0.53533 0.504624 -0.226851 0.304927 0.430518 1.43626 0.222137 1.06032 -0.34156 -1.14424 -0.397803 0.597292 0.725666 +lost -0.0646464 -0.436172 0.154592 0.0969243 0.28725 -0.441507 -0.0233649 0.0292622 -0.57299 -0.345077 0.740086 0.225108 -0.535997 0.232493 0.248939 0.31279 -0.900959 0.47059 -0.134632 -0.396499 0.518015 0.17784 0.218445 -0.0358889 -0.104465 -0.363084 0.47421 -0.248271 0.38845 -0.80298 -0.161621 -0.10057 0.506207 -0.775264 0.11566 -0.0501645 0.279714 0.123426 -0.359934 0.229857 0.644944 0.16503 0.719258 0.0703701 0.122459 0.854175 -1.10889 -0.0540839 0.0610379 0.305566 +little -0.0482479 0.0862038 -0.118719 0.182525 -0.113944 -0.165341 0.182244 -0.315726 -0.00330482 -0.297141 -0.0685523 -0.426475 -0.0652971 0.168741 -0.167458 0.67744 -0.310444 -0.536381 0.535626 0.169611 0.663777 0.489956 0.763642 0.382474 -0.156018 -0.064403 0.154254 -0.348039 0.0735861 -0.209324 -0.877882 -0.250359 0.25836 0.22307 -0.164253 -0.058567 -0.684263 0.746675 -0.0709423 -0.318666 0.245205 0.166555 0.0708828 -0.150983 -0.29371 0.911255 -0.43343 -0.100433 0.51821 
0.249308 +further 0.0790783 0.389863 0.309883 -0.468628 0.0169826 -0.296076 0.0946233 0.253725 -0.0618166 0.310776 -0.579971 -0.522138 0.230944 0.485477 -0.44926 0.075311 -0.485917 0.3162 -0.309036 0.138444 0.434324 0.261985 0.814324 0.627043 -0.0168175 -0.0883479 -0.214077 0.233096 -0.343848 -0.287781 -0.353874 0.0828207 -0.00159936 -0.283043 0.261044 -0.0330311 0.208747 0.265761 -0.0685221 -0.12731 -0.0564595 0.552902 0.574912 0.0833255 -0.436466 0.194985 -1.15982 -0.165599 0.155528 0.184689 +australia -2.57323 0.587849 -0.534965 -0.865481 -0.869553 -0.828769 0.983002 -1.33934 0.459972 -0.0378729 0.990324 -0.670515 -0.112569 1.4092 0.47794 -0.842178 -1.80035 0.642138 -1.03546 0.42991 -0.0750108 -0.0163931 0.583266 0.685605 0.585611 -0.45143 1.24228 -0.480529 -1.14788 -0.524566 0.39149 2.38478 -0.214042 -0.0428209 -1.0897 0.915906 -0.114281 -0.0862211 0.864534 0.994394 -0.593278 0.979259 0.279561 -0.870485 0.722518 -0.478372 -1.55009 -1.57502 0.71709 0.4277 +cup -2.16373 0.580476 0.976304 0.574928 0.611632 0.163509 -0.590941 -0.856539 0.559094 0.254853 1.29542 -0.315338 0.00689315 0.223522 0.723283 -0.650741 -1.44759 0.434964 -0.730941 -1.00278 0.521373 1.1304 -0.208988 0.275293 -0.417672 0.185876 1.63885 -0.569119 1.01226 -1.21792 0.733022 -0.377249 1.31562 -0.804599 -0.836782 -1.109 0.367619 1.53747 -0.834719 0.691607 -1.21029 -0.183461 1.88352 1.53681 0.542615 -0.0103972 -0.345866 -0.852262 0.117155 1.23028 +playing -0.774178 0.727684 -0.00969193 0.547729 -0.708319 0.16851 0.765965 -0.715513 -0.277546 0.193967 1.71719 0.45213 -0.683184 -0.570578 -0.908922 0.82878 -0.431753 0.313296 0.61577 -0.594611 0.184783 1.20527 0.645478 0.484657 -0.692438 0.83783 0.575609 -0.438058 -0.170179 -0.641933 0.392748 0.584094 1.09689 0.0489554 -0.0300147 -1.38433 -0.565633 -0.490329 0.544377 0.0660087 0.386908 -0.424534 0.652846 1.45958 0.0172438 0.302069 -0.340831 -0.574124 0.435981 0.285313 +act -0.638516 0.485688 -0.350412 -0.0157404 -0.7941 -0.309902 0.658596 0.0659386 1.17833 0.199071 -0.576597 -0.215115 -0.961679 -0.237423 0.40481 0.505258 0.45901 0.545475 -0.22559 -0.25747 0.221573 0.85042 0.0941901 -0.154773 0.0622278 1.50221 0.214488 -0.664831 -0.855116 -0.232786 -0.719419 0.135977 -0.14761 -0.5932 0.273121 -0.22682 0.484269 -0.295222 -0.0268843 -0.0978744 0.266099 1.23803 -0.101031 -0.62294 -0.0116829 -0.0352027 -1.03786 -0.177891 0.129444 0.460698 +originally 0.336674 0.901176 0.367166 0.592652 -0.312594 0.567072 0.159532 0.220087 -0.00324267 -0.267692 -0.642346 0.225587 -0.920913 0.1508 -0.37406 0.68998 -1.04835 -0.364612 -0.272864 0.356364 0.486269 0.4336 0.203257 0.848602 -0.0944223 0.360916 0.166599 -0.357886 0.0898874 0.0342282 -0.264623 0.130929 -0.768682 -0.160841 -0.562792 0.0134815 0.270251 0.525058 0.056773 -0.0531238 -0.566067 0.171405 0.0570102 -0.0117892 0.260772 0.0766139 -0.355431 0.0802031 0.40119 0.316804 +formed 0.746439 0.290493 -0.844234 0.000395157 -0.170392 0.46137 0.715622 0.0574061 0.0516196 0.619249 0.597211 -0.00394309 -0.922819 -0.0324638 -0.639697 -0.0112496 -1.77203 0.203556 0.247608 0.15429 0.352619 0.280529 -0.473983 0.233252 0.293301 0.785137 0.020825 -0.402387 0.15865 0.177088 0.703891 -0.195013 0.870508 -0.134114 0.0184672 0.178453 0.212195 -0.228565 -1.05328 0.331772 0.328828 1.24901 0.57044 -0.680155 0.248556 0.728446 -0.64867 0.505888 0.818817 0.701712 +worked 0.711833 0.233061 -0.896464 -0.207746 -0.753741 -0.348863 -0.398945 -0.21775 -0.228686 -0.179202 1.65625 0.263745 -0.688319 0.796725 -0.820948 0.0968086 0.104186 0.351847 0.602902 0.129592 0.835021 
0.183136 0.124157 0.315044 -0.982028 0.21473 1.63824 -0.138933 0.0319514 -0.253857 0.642537 0.408452 0.61049 1.0851 1.27143 0.188816 0.21224 -0.666115 -0.606817 -0.169585 -0.188918 0.646474 0.343872 0.498937 0.866573 0.582279 -0.79435 0.261068 0.456488 0.549071 +half -0.393417 -0.0814647 -0.101989 0.526148 -0.304278 -0.0679612 -0.0147277 -0.314694 -0.412097 -0.064769 0.0986037 0.25287 -0.0434809 0.170705 0.199956 0.167159 -1.00792 0.33957 0.164407 0.498903 0.2814 -0.0202528 0.468128 0.309023 0.429879 -0.150263 0.336007 -0.590136 0.964261 -0.167348 -0.81891 0.181999 0.899302 -0.211576 0.191205 -0.348725 0.0972931 0.378626 -0.169016 -1.1424 -0.219482 0.0644448 0.546139 0.198767 -0.772999 -0.312744 -0.681383 -0.0527485 -0.412419 0.0400765 +european -0.957942 0.773521 0.10957 0.112829 -0.33645 0.102403 0.894909 -0.357242 0.551608 1.10779 0.3731 -0.663465 0.848571 1.35891 -0.0758184 -0.719756 -0.98686 0.255591 -0.564122 -0.566355 -0.325009 -0.0568939 0.0984093 0.163464 -0.537436 0.473746 0.872395 -1.03968 0.530412 -0.346873 0.54201 -0.196612 0.0943898 -0.282592 -0.160648 -0.618176 0.805293 0.140887 -0.0284323 0.0318133 -1.08693 0.455957 0.445367 -0.0802233 0.340164 0.272048 -1.0432 0.534441 0.340615 -0.785781 +george -0.352749 -0.926178 0.22188 -0.239891 -0.827008 0.458674 0.218286 0.25923 0.83364 -1.50644 0.109229 -0.35783 -0.15503 0.622157 -0.0555357 0.905813 -0.21342 -0.201827 -0.768185 0.78339 0.0278436 0.265842 0.378808 0.307075 0.0124851 0.359085 0.5663 -1.02878 0.0592749 0.0107436 -0.255181 -0.48446 0.512149 0.94726 0.567269 0.537054 0.305373 0.357186 0.633059 -0.0835893 -0.19331 -0.0275081 1.06633 0.496662 0.921821 0.0494123 -0.902644 -0.580058 0.63979 0.663896 +records 0.282572 0.68265 0.20856 -0.35476 -0.53052 -0.352022 0.722753 -0.247645 0.0826267 -0.330235 0.0369964 -0.53579 -0.154724 -0.572707 0.327994 0.415428 -1.5064 -1.32575 0.728177 -0.0544263 0.141565 0.374641 -0.0891354 0.774168 -0.652732 0.752407 0.920259 -0.512418 0.221455 1.19792 0.452136 0.643952 -0.12408 -0.309608 0.897498 -0.640101 0.12766 -0.536423 0.2194 0.902558 0.0763821 0.0658122 0.60333 0.171786 -1.1173 0.597491 -1.68791 -0.451159 0.34285 0.320471 +making -0.134365 0.185442 0.238314 0.297956 -0.095905 -0.538851 0.0921498 -0.242405 0.202135 0.27173 0.259806 -0.192022 -0.149623 0.749545 -0.0392936 0.15798 -0.321535 0.5414 0.161454 -0.37888 0.433523 0.252773 0.413875 0.141495 -0.416253 0.0254191 0.394454 -0.618671 -0.0684327 -0.606742 -0.350837 0.15873 0.391041 -0.359852 0.425575 0.232131 -0.34658 -0.242479 0.245338 0.0266981 0.000219991 0.0292499 0.198265 0.303859 -0.767887 0.0982176 -0.594445 -0.107313 -0.0662486 0.137106 +special 0.650064 1.07097 -0.403189 -0.343681 -0.792073 -0.705811 -0.128823 -0.28416 0.763282 -0.131951 -0.0414579 -0.711351 -0.900262 -0.355167 -0.592039 0.248294 0.458884 0.261192 -0.279559 -0.318255 0.115665 -0.136751 0.384186 -0.181273 0.261346 -0.105845 0.0684244 -0.814912 -0.212333 -0.680088 -0.117329 -0.00387312 -0.0416342 0.241281 -0.358628 -0.0536262 0.147167 0.118128 -0.02911 -0.0530626 -0.87159 0.137755 0.278858 0.417771 -0.186088 -0.0175077 -0.820073 -0.298999 0.23589 0.115987 +joined 0.899494 -0.192292 -0.535034 0.11227 -0.185383 0.37519 0.518512 0.475386 -0.0988404 0.131085 1.71678 0.734147 -0.703085 0.515866 -0.293471 -0.241543 -1.07093 0.283381 -0.106579 -0.0237993 0.240849 0.164147 -0.08997 0.00839732 -0.395118 0.30234 0.663683 -0.480823 -0.322605 -0.478416 1.06124 0.00566805 1.34286 0.309423 0.689284 -0.216995 0.138667 -0.596596 -0.87569 -0.025432 0.0259542 0.517419 0.632745 
0.338711 0.584166 0.719349 -1.0924 0.328239 0.221643 1.32172 +today -0.123311 0.11833 0.42902 -0.139619 -0.159554 0.0289843 -0.243278 -0.0346835 0.105491 -0.359848 -0.366799 -0.528715 -0.159424 0.568234 -0.917656 0.159059 -0.711847 -0.250754 0.908549 0.0134356 0.164653 0.506658 -0.139185 0.778945 -0.479957 0.150581 0.527956 -1.01652 0.241885 -0.106337 -0.00877914 0.0635301 -0.449395 0.292879 -0.532015 -0.195321 -0.344585 0.535512 -0.256144 -0.107286 -0.640071 0.247476 0.0669612 -0.380911 -0.192899 0.62798 -0.117144 0.323827 0.213859 0.294514 +india -0.729041 1.23257 -0.0886151 -0.126736 -1.64337 -1.31035 -0.566911 0.105992 1.46369 -0.99605 1.48327 -0.522403 1.53072 0.961699 0.283892 -0.663049 -0.607785 0.431406 -0.445612 1.98672 -0.707097 0.328404 0.516276 0.278164 -0.562046 0.00547677 0.635493 -0.570008 -0.215532 -1.67943 1.02029 1.26096 -0.541362 -0.412856 -0.94927 0.296088 -0.242376 0.906436 -0.870856 1.29225 0.289644 1.51797 1.00919 -1.70061 -0.874803 -0.353879 -1.05883 -1.44971 0.231241 -0.154049 +square 0.646202 -0.465915 0.179063 0.210095 -0.0405158 -0.318605 -0.530061 -0.391932 0.38432 -0.173865 -0.874162 -0.361703 -0.672889 -0.172916 -0.639847 -0.124964 -1.43067 0.787005 -0.302245 1.36695 -0.14223 1.23874 0.828247 -0.830182 -0.82062 0.436447 0.89386 -1.04702 1.54563 0.18763 -0.459264 -0.466041 0.719661 -0.525179 -0.317435 0.530173 -0.242721 1.09252 0.124871 -0.682525 0.0709974 -0.304241 0.767366 -0.835436 -0.157077 -0.906318 -0.367441 -0.30502 1.15001 -0.172672 +information 0.172254 0.368316 -0.262919 -0.615654 0.107346 -0.134317 -0.270512 0.151504 0.239029 -0.232551 -0.3984 -1.26714 0.343071 -0.465402 -0.937089 -0.119853 0.138586 0.436976 0.00401244 0.335797 0.634605 0.174215 0.277734 -0.46718 -0.351446 0.602942 0.974255 -0.178434 -0.750233 -0.00568692 0.285016 0.562805 -0.771009 -0.220894 0.399709 0.431671 -0.908522 0.115931 0.12284 -0.455528 -0.885752 -0.210107 0.679769 -0.154436 -0.8 0.337942 -1.53991 -0.369144 -0.657174 -0.169094 +good -0.0443989 0.529262 -0.131386 -0.061475 -0.3717 -0.707813 0.0127569 -0.54797 0.281246 -0.188999 0.0176894 -0.956872 0.344904 0.00502391 -0.0958381 0.518322 -0.0542694 0.235734 0.427359 0.0772932 0.659262 0.743017 0.373569 0.148708 0.178951 -0.580175 0.367744 -0.480283 -0.156534 -0.8343 -0.685554 0.0578941 0.31845 0.204099 0.254714 -0.251237 -0.845157 0.255614 0.269775 0.0961291 -0.113993 0.00725838 0.457027 0.437106 0.0434675 1.32255 -0.112006 -0.0915638 0.0607283 0.603331 +areas -0.510284 -0.154888 -0.30023 -1.28266 0.411187 0.101519 -0.291776 0.129996 0.772595 -0.273798 0.175698 -1.13424 -0.0779787 0.12514 -1.33384 -0.858505 -0.574726 1.20869 0.90593 0.638339 0.786489 0.621412 0.125176 0.460511 -0.66445 0.369476 -0.0565905 -0.0232668 -0.841671 0.189766 -0.409905 0.286787 0.717562 0.0502965 -0.675488 -0.178416 0.310439 0.540471 -0.238748 -0.393865 -0.512701 0.361673 -0.106101 -0.290509 -0.744822 -0.0364443 -1.06099 -0.411645 0.797109 -0.431133 +upon 0.174559 -0.0652186 0.541318 0.093063 -0.184418 -0.189629 0.444849 0.954396 0.671278 -0.0564345 -0.402099 -0.367359 -0.26634 0.671851 0.206256 0.328716 -0.167062 -0.157811 -0.114235 -0.0584915 0.384279 0.482816 0.530206 0.105309 -0.0143153 0.166317 -0.398198 -0.305852 -0.637948 -0.893248 -0.453242 0.280045 -0.0586735 -0.294488 0.55807 -0.217878 -0.0466347 -0.269952 0.0276159 -0.0165368 -0.402721 -0.236448 0.589559 -0.0541217 -0.707781 0.0634522 -0.880068 0.540588 0.0746514 0.768595 +social -0.117277 0.112832 -0.258835 -0.0944082 -0.873512 0.309273 -0.371687 -0.185183 0.677365 -0.131006 -0.412415 
-0.84088 0.466719 -0.214302 -1.19631 -1.29403 0.35286 0.518203 0.422769 -0.821574 1.14916 0.346537 0.805569 -0.35406 -0.245621 1.04879 0.0755756 -1.06916 -0.783129 -1.25629 0.0420627 0.75691 0.57508 0.567484 0.649775 -0.047624 0.142274 0.479675 -0.374248 -0.26294 0.203429 0.654415 0.630842 -0.449741 0.444101 1.00346 -0.655778 1.14144 0.479427 -0.585439 +professional -0.194305 0.788156 0.840772 1.35028 -0.753728 0.116617 0.636996 0.106801 0.607939 1.02473 1.13758 -0.636585 0.165001 -0.897741 -0.845805 -0.125778 0.0164202 0.488538 0.361839 -1.01228 0.269088 -0.0101748 -0.202136 0.664871 -0.421732 0.63158 1.26822 -0.267311 -0.0155186 -0.687747 0.555154 1.11841 0.221892 0.839791 0.241494 -0.891241 -0.167703 0.0695004 0.162989 0.225116 -0.0685223 -0.159318 0.415996 1.21258 0.539186 0.21046 -0.782305 -0.323959 0.819005 0.438387 +james -0.10202 -1.00755 0.127158 0.278132 -0.888972 0.462321 0.909696 -0.077968 0.224812 -1.15477 0.137845 -0.753143 -0.628523 0.688521 0.0723796 0.756065 -0.249935 -0.0543941 -1.04534 0.992742 0.949887 -0.133481 -0.129306 0.0634523 0.213232 0.620228 0.557354 -0.613546 0.207217 -0.111872 -0.239176 -0.141339 0.132537 0.498228 0.618047 0.132505 -0.0555529 0.265538 0.930229 0.175415 0.0425643 0.338014 1.08649 0.680427 0.542037 -0.1754 -0.762382 -0.511604 0.622808 0.770235 +case -0.729406 -0.105698 0.838091 -0.437899 -0.0708099 -0.1213 -0.00596162 -0.0845146 0.292139 -0.390892 0.348247 0.276631 -0.47859 -0.519429 -0.361942 0.200157 0.46965 0.729607 -0.0604189 -0.0890022 0.113037 0.459354 0.0461511 -0.387854 0.491159 1.14658 0.356759 0.240223 0.632204 -0.467597 -0.330065 0.638007 -0.559506 -0.665469 0.229982 0.176448 -0.202018 -0.0778054 0.308502 -0.429747 -0.714875 0.947301 0.507825 -0.461139 -0.740178 0.392447 -1.2203 0.230362 0.0966256 0.527067 +project 0.023539 0.857843 -0.641537 1.0243 -0.0291847 0.328153 -1.15469 -0.0430552 -0.254746 0.760152 -1.75448 -0.393283 -0.900716 0.976727 -0.60842 0.527261 -0.386224 0.253813 -0.0147467 0.276094 0.643569 0.899564 0.0784873 0.309655 -0.513494 0.729241 0.547514 -0.0782823 -0.754727 0.7311 0.51626 0.687484 0.0577937 -0.139732 0.379504 0.159313 0.41576 0.0250814 -0.146123 0.484041 -0.219241 0.148116 1.17475 0.0463373 -0.221045 0.221125 -1.17821 0.0522567 -0.21365 -0.0599383 +eastern -0.310457 -0.727218 0.202092 -0.286903 0.193877 0.449946 -0.259271 0.346187 1.3718 0.227991 0.623587 0.2427 0.454711 0.67835 -0.379181 0.124708 -1.83849 0.612478 0.397146 1.12273 0.161554 -0.527711 0.118673 0.246203 -0.261236 0.385478 0.122921 -0.326023 -0.40635 -0.0325436 -0.198821 0.160718 0.19516 -0.294465 -1.22058 -0.883291 0.992463 0.817833 -0.511559 -0.767164 -0.789579 0.230376 -0.181479 -0.191688 0.184774 0.831374 -0.690649 -0.620826 0.328321 0.268663 +elected -0.132871 -1.24414 -1.05897 -0.103511 -0.884562 0.267984 -0.514786 -0.043899 0.365417 0.234054 1.05735 -0.10515 -1.60316 -1.21547 -0.780207 -0.184899 -0.43325 1.03622 -1.21836 0.265464 0.102122 1.06846 -0.422314 0.192305 -0.392318 0.537552 1.65489 -1.14372 0.680668 -1.17494 -0.643754 0.241702 0.511092 0.351359 0.736137 -0.521465 0.787476 0.321258 -2.42892 0.045075 -0.151208 0.501651 0.68113 -0.795977 1.71435 -0.010463 -1.07611 0.355966 -0.60689 0.502342 +character -0.631234 0.731217 0.169928 0.574567 -0.119841 0.156531 0.0864428 0.142968 -0.212535 -1.55352 0.351839 1.15214 -0.958719 -0.175523 -0.416631 0.659025 0.562775 0.696222 -0.0937962 -0.604957 1.64101 0.187563 0.813454 -0.521675 0.0111218 0.887676 0.253647 -0.187645 -0.416414 -0.659253 -0.729656 0.854738 0.151579 -0.0345127 -0.131853 
-0.50958 -0.568647 0.488565 0.159917 -0.597028 -0.723552 -0.026709 1.51505 -0.12796 -0.107277 0.680521 0.180339 -0.391063 0.377429 -0.0833271 +board 0.0174769 -0.147984 -0.408934 -0.102849 -0.758931 0.931657 -0.268462 -0.332281 0.674161 0.329417 -0.58269 -0.000827496 -0.448704 -0.887662 -0.223637 0.287438 -0.117678 0.672339 0.0848365 0.153732 0.373698 1.01226 0.0376973 -0.643085 0.0755148 0.275154 0.608111 -0.982508 -0.184185 -0.586059 0.227501 0.79325 -0.149784 0.0667738 0.637942 0.392524 0.0259533 0.249674 -0.866314 -0.378103 -0.621142 0.306739 0.368994 0.491263 0.496648 -1.3101 -1.60844 -0.261445 -0.125391 0.801542 +schools 0.227615 -0.95086 0.377232 -0.74202 -0.271445 0.304273 -0.513354 -0.0302047 0.797631 -0.527339 -0.530317 -0.463449 0.0567777 -0.851165 -0.883949 -0.579535 -0.934579 1.46148 0.227262 0.50678 0.569201 1.19771 -0.244728 0.673229 -1.23227 -0.362791 1.05878 -0.750947 -0.855827 -0.891139 -0.178977 1.68317 0.295877 1.55908 -0.502799 -1.29945 0.598454 0.522899 0.489248 -0.414123 -0.0596179 1.55604 0.400968 0.154337 0.219166 -0.268615 -0.33252 0.0766099 0.0811696 0.0884598 +available -0.139603 1.87165 -0.0954345 -0.117467 0.296978 -0.323167 0.173075 -0.550814 0.189076 0.318191 -0.271132 -1.21455 -1.03653 -0.751481 -0.784892 0.105853 -0.628784 0.648896 -0.123703 0.7499 0.473722 0.331411 -0.323556 0.14366 -0.687021 -0.556396 1.08585 -0.510828 0.268192 -0.533394 -0.0108653 -0.028085 -0.366103 -0.0700249 -0.135488 0.0584198 -0.0632149 0.0175673 0.514527 -0.312203 -0.687936 0.327021 -0.175955 -0.230571 -0.608719 0.687984 -0.652716 -0.619327 -0.205309 0.109196 +developed 0.0555064 0.904706 -0.185372 0.546813 0.664105 0.325076 0.020415 0.271535 0.121218 0.298467 0.362437 -0.108695 -0.425468 0.50572 -1.44665 0.189313 -0.546599 0.381774 0.735164 0.29375 1.03101 0.0527742 -0.252899 0.47163 -0.723608 -0.14584 0.488293 -0.435871 0.22395 -0.473742 0.450567 -0.117568 0.28333 -0.0616043 0.298203 0.647173 0.344841 -0.41201 0.426733 -0.0517758 -0.00645005 1.0924 1.10018 -0.397899 0.185878 0.48704 -0.352691 0.219325 0.282063 -0.213577 +forces 0.447056 -0.499968 -1.03519 0.0206439 0.226459 -0.518382 0.891419 1.55197 0.822802 -0.741615 0.683127 0.0318679 -0.0362472 0.258613 -0.387489 -0.14532 0.494744 1.21901 0.14041 0.113775 0.207533 1.51263 0.207398 -0.809685 0.159431 0.212119 0.0552234 0.384865 -0.738635 0.112588 0.495074 -0.447206 0.41281 -0.253826 -0.212008 -0.689669 0.473527 1.30643 -0.453391 -0.175027 -0.743593 0.736125 1.13672 -0.22794 -0.352095 0.0159412 -1.78449 0.595757 1.0069 0.1113 +post 0.598809 -0.241629 -0.468274 -0.477285 -0.161129 0.4847 -0.22686 0.0793646 0.478104 -0.150542 0.220493 0.0948367 -0.222162 0.887083 -0.972387 -0.016485 0.00487894 0.140432 -0.323807 -0.201696 0.120624 -0.0066327 0.674655 0.48284 0.0602246 0.529837 0.543786 -0.89195 -0.030413 -0.55865 -0.149294 0.708389 0.0256874 0.347955 0.633367 -0.270083 0.169053 0.70022 0.136681 -0.0682983 -0.518086 -0.0243715 0.231397 0.194281 -0.984325 -0.00702043 -0.517305 0.243683 0.224601 0.82302 +summer -0.797867 0.0708959 -0.373947 0.614155 -0.152935 0.325314 -0.293757 -0.719406 0.642245 0.358101 0.510744 -1.02332 -0.661133 0.736483 0.204386 -0.295144 -1.21027 -0.222365 -0.18732 -0.134052 -0.371936 0.851631 0.61837 0.239423 0.186847 0.0551638 0.985158 0.111865 0.296897 -0.299606 0.359883 0.434517 0.439767 0.413236 -0.537617 -1.27101 1.27861 0.271542 0.0884054 -1.18313 0.095959 -0.410528 0.157632 0.972515 -0.459032 0.298637 -0.321588 0.0842647 0.453833 0.356982 +la 1.39628 -0.624764 0.225846 0.666532 0.28007 0.449947 
0.0862194 -0.167313 0.455268 0.604608 -0.120319 -1.14228 -1.73731 0.336504 0.297694 -0.619827 0.110196 -1.08582 0.360381 -0.379303 -0.292056 -0.0604798 1.27784 0.110214 -0.596887 0.45659 0.453569 -0.248819 0.692638 0.140239 -0.0358454 -0.00841858 0.40415 -0.712039 0.388026 -1.57728 -1.57613 1.00903 -0.407001 -0.498577 -0.871924 1.55446 0.559909 0.0698619 0.952818 1.03112 -0.930666 -0.820747 1.21356 -0.696793 +body -0.919579 -0.274897 0.51328 0.705491 0.130347 -0.65829 -0.200513 0.12144 0.421185 0.312827 0.193388 -0.284308 -0.624241 -0.767336 0.261758 -0.0683584 -0.609516 0.188433 0.540588 0.0916268 0.690543 0.315912 0.98724 -0.165357 1.14889 0.116237 0.263947 -0.403506 -0.476142 -0.18242 -0.0545256 0.238358 -0.146888 -0.0963312 0.10107 0.160566 -0.280094 -0.616488 -0.734354 -0.0799469 -1.26345 1.12109 0.422233 -0.548746 -0.360357 -0.663346 -0.72744 0.746092 0.367278 0.108805 +working 0.174933 0.876273 -0.859545 0.239822 -0.832993 -0.288584 -0.17045 -0.394426 -0.258255 0.337503 0.541669 -0.175193 -0.210593 0.354221 -0.48988 0.036828 0.238197 0.522198 0.492691 0.0832949 0.853101 0.244135 0.163516 0.514685 -0.307049 0.501073 1.01654 -0.00696373 -0.211612 -0.436724 0.611725 0.679904 0.521769 0.736964 0.684494 0.250311 -0.15135 -0.130339 -0.108862 -0.348306 -0.118794 0.399342 0.237622 0.193118 0.318618 0.325944 -0.621924 0.654031 0.376837 0.350843 +lake -0.603976 -1.21857 -0.799609 0.383557 0.24117 0.430574 -1.3399 0.177018 1.07179 0.519865 0.125177 -1.67295 -0.701667 0.710858 0.570955 1.51037 -1.71414 -0.607119 0.785543 0.772268 0.464505 1.3483 -0.390509 -0.269522 -0.172255 1.47452 0.34137 0.675337 0.340624 -0.568132 -0.31623 0.0579525 -0.232538 0.139902 -0.801495 -0.964414 0.573617 0.356674 -0.68435 -0.760857 -0.0374249 0.268817 -0.181682 0.297862 0.223367 0.282952 -0.607103 -1.07175 1.32359 -0.118851 +championship -0.607328 0.14837 1.34807 1.41621 1.08736 -0.493735 -0.287058 -0.313436 0.164081 -0.243732 1.61932 -0.119793 -0.0270396 -0.908468 0.374063 0.356309 -1.37041 1.06042 -0.300941 -1.64099 0.645056 1.18469 -0.878296 0.262427 -0.949458 0.197367 1.09321 -1.17755 0.816888 -0.797339 0.766191 1.52299 1.09904 -0.421145 -0.419445 -1.97408 1.02241 2.06039 -0.0282325 0.374934 -0.430753 0.646375 1.77169 1.50131 0.85525 -0.217216 -1.29626 -0.609966 0.151942 1.12906 +eventually 0.0658841 0.0477027 0.509312 0.77127 -0.0966113 -0.161429 0.230223 0.325768 -0.3876 -0.0408263 0.0365604 0.190746 -0.394594 0.810443 0.171758 0.308022 -0.54003 0.0419728 -0.102457 0.16926 0.604371 0.648028 0.237463 0.958944 0.244971 -0.0689268 -0.117295 0.0923548 -0.7176 -0.605618 -0.201315 0.187856 -0.015324 -0.230151 0.641133 -0.219376 -0.0357752 0.590215 -0.383474 -0.388174 -0.0541134 -0.0783458 0.297473 0.0730092 -0.152958 0.170246 -1.06616 0.362541 0.432723 0.325889 +germany -0.569474 0.751589 0.210232 -0.916911 -0.0958845 -0.567632 0.165509 -0.553157 -0.628367 1.59406 1.23032 -0.13866 0.865371 1.13529 0.678246 -0.301116 -0.470846 0.408869 -0.0302927 0.527442 -0.720907 0.396355 0.12315 0.0841648 -0.335937 -0.229117 1.92688 -1.39631 -0.649033 0.76045 -0.142569 0.0121284 -0.518872 0.252303 -0.878485 -1.24923 1.07067 0.453944 0.903028 -0.687092 -1.16127 0.0847861 1.01081 -0.76289 0.347847 1.15692 -1.95134 1.2036 1.76181 -0.546526 +throughout -0.641059 -0.0921372 0.301419 0.235901 -0.109388 -0.223549 0.342117 0.362641 0.444519 -0.248591 -0.13885 -0.34521 -0.727674 1.12997 -0.649596 -0.29864 -0.776379 -0.0239441 0.509975 -0.221834 0.119017 -0.343051 -0.180064 0.323798 -0.537033 -0.182334 -0.168952 -0.73754 -0.817188 
-0.789641 -0.0494759 0.341409 0.709266 0.00909229 -0.156752 -0.609084 -0.00896708 0.169504 -0.079511 -0.250748 -0.639764 -0.0294752 0.108145 0.235548 -1.0187 0.304542 -0.651154 0.456701 0.0301754 0.487359 +video -0.535163 1.71838 -0.174444 1.36263 0.169795 -1.34007 -0.146201 -0.543553 0.413165 -0.772945 -0.0212688 0.420563 -1.41415 -0.66892 -1.12484 0.188567 -0.534706 0.153382 -0.549808 0.17526 0.74534 1.31806 0.472559 -0.801831 -0.940428 0.772057 0.478078 -1.29285 -0.333984 0.191837 1.09228 1.38911 -0.736172 -0.485763 0.334942 -0.670217 -0.459675 -0.391031 1.33447 -0.36626 -0.134811 -0.779266 0.408744 1.14677 -1.08941 0.475022 -1.61699 -0.180178 0.441178 -1.27207 +hall -0.205523 -0.704184 0.859359 0.456106 -0.984402 0.0210441 -0.540297 -0.128458 -0.248869 -0.581892 -0.404063 -0.462262 -1.19364 -0.335401 -0.753952 0.432442 -0.986719 -0.161065 0.684021 0.316245 -0.189777 1.12828 0.907151 -0.446298 0.321313 0.738222 0.751643 -1.89943 0.436396 -0.0220042 0.204261 0.178764 0.00181241 1.03042 0.191444 -0.714496 0.159011 0.0397021 -0.393981 0.214744 0.804479 -0.257179 0.633795 0.491943 0.700929 -0.90758 -0.549182 -0.351336 0.677277 0.752108 +result -0.723383 0.276993 0.00690039 -0.141778 -0.214838 -0.0277111 0.0569925 -0.0844744 0.220673 -0.0869308 0.249865 -0.18675 -0.289751 -0.0187447 -0.794707 0.0330978 0.0703374 0.822932 0.175102 -0.273713 0.339294 0.445852 -0.198782 0.279058 1.276 0.19229 0.377012 0.00020613 0.414582 -0.0899001 -0.632674 0.227647 0.171664 -0.588903 0.159093 -0.262201 0.203208 0.425919 0.0247219 -0.171678 -0.0171543 0.41906 0.629287 -0.335348 -0.268027 0.811959 -0.659451 0.0393105 -0.0424775 0.284356 +addition 0.335662 0.495817 0.353505 -0.256787 -0.622306 0.36394 0.0466013 -0.587227 0.327575 0.126041 0.451706 -0.396459 -0.812879 0.0926903 -0.712073 -0.140884 -0.0474283 0.936806 0.33593 0.0449224 0.644926 -0.0150165 0.154836 -0.174242 0.264075 -0.133601 0.932406 -0.340372 0.158697 0.225739 0.214703 0.0678766 0.0631194 0.169625 -0.221475 -0.413314 0.421039 -0.237947 -0.0795135 -0.489274 0.0298067 0.433846 0.0103454 -0.144162 -0.394875 0.228107 -0.541969 -0.721487 -0.0319919 -0.202919 +style -0.51824 0.468865 0.964279 0.983375 -0.299045 -0.0242979 0.875697 -0.32531 0.248067 -0.554526 0.11651 0.116647 -0.719814 0.497128 -0.815698 0.301816 -0.642425 1.01872 0.778654 -0.172785 0.401503 0.499528 1.43586 0.256888 -0.395073 0.337367 -0.0533546 -1.00442 0.144011 0.375188 0.0430928 0.496468 -0.0574857 0.871372 0.72049 -0.492974 0.12389 0.480521 0.649645 0.0224122 -1.03354 0.439037 0.487918 0.243216 -0.321536 0.8041 0.636662 -0.293357 0.495795 -0.160861 +recorded -0.271293 0.632847 -0.706165 -0.222938 -0.470326 -0.371689 0.547307 -0.226589 0.206271 -0.449021 0.638639 -0.28209 -1.31239 -0.31704 -0.349845 0.642368 -1.65546 -1.18239 0.183079 0.59514 -0.215275 0.302091 0.0564521 0.500417 -0.815787 0.469699 0.948181 -0.190948 0.981975 0.592148 -0.0604465 -0.0477926 0.692095 -0.221308 0.263909 -0.699053 0.103895 -1.06402 0.609233 0.582694 0.450538 0.269893 0.392742 0.0520912 -0.653558 1.20599 -1.00873 -0.244673 -0.534023 0.0541559 +human -0.397375 0.390597 -0.367855 0.41183 -0.33854 -0.574044 -0.477138 -0.206533 0.798745 0.0274452 -0.352006 -0.73762 0.128784 -0.484443 -0.554965 -0.101544 0.298444 -0.362127 0.632515 -0.163782 1.53097 0.0532096 0.885353 -0.449373 0.176609 0.623729 0.699493 -0.388738 -0.737604 -0.836484 -0.0662164 0.248207 -0.048733 -1.30875 -0.382158 0.481106 0.324847 -0.857058 -0.69112 0.322001 -0.284923 0.439367 1.11878 -0.244267 -0.00181796 0.0362005 -1.10569 1.04908 
0.506415 -1.14079 +does 0.127598 0.0375644 0.791248 -0.615396 0.297371 -0.882584 -0.694522 -0.0889213 0.346062 -0.277887 0.0545098 -0.442899 -0.00259511 0.168424 -0.560207 0.873328 -0.00108042 -0.190648 -0.242654 -1.20778 1.20263 1.03741 0.25908 -0.177589 -1.02841 0.348251 0.494231 -0.221142 -0.632183 -0.939652 -0.876129 -0.282769 0.470822 -0.805594 -0.330212 0.0168595 -0.457045 -0.922943 0.359775 -0.144305 -1.40962 0.312672 0.133876 -0.891564 -0.568816 0.741331 -0.618501 -0.365658 0.138729 0.519597 +should -0.262394 -0.0702106 0.562794 -1.00639 0.312057 -0.959322 -0.212324 -0.00379494 0.846536 -0.35194 -0.808739 -0.534631 0.0819792 -0.037337 -0.592203 0.736872 -0.348353 -0.103083 -0.433568 -0.562752 0.453064 1.55585 0.158712 0.440499 -0.801028 -0.328838 -0.0152799 -0.136946 -0.355361 -1.3345 -0.519559 -0.634757 0.329417 -0.481248 0.290403 0.35517 -0.29389 -0.609838 0.208641 0.166843 -1.26399 0.602471 0.853169 -0.314139 -0.252123 0.937711 -1.20489 0.988941 0.111369 1.18163 +wife -0.953933 -0.881909 0.098918 1.1055 -1.09533 -0.0783605 -0.713045 1.19814 -0.294764 -1.4558 0.546983 -0.0379829 -0.505155 -0.0436312 1.51891 -0.25267 0.170416 -0.194502 0.176909 0.527883 0.70764 0.190671 0.585216 0.450033 -0.397631 0.777496 1.13293 0.0556646 -0.207695 -0.351203 -0.22457 0.231286 0.396944 1.07897 1.18617 -0.197423 0.272208 -0.314611 -0.311913 -0.833129 -0.485001 0.0418119 1.17419 0.135116 0.126039 1.09429 -0.816249 -0.204191 0.0243516 0.755615 +release -0.22726 1.54804 -0.460663 0.937657 -0.0372489 -0.121467 0.397514 -0.673088 -0.000145436 0.204991 0.317335 0.187763 -1.58697 -0.246646 0.309712 -0.0857402 -0.307704 -0.00774863 -0.701967 0.500984 0.276664 0.746328 0.384854 -0.0513054 -0.0310728 0.636855 0.800933 -0.0458388 -0.949726 0.846249 -0.342149 0.612928 -0.0495692 -1.18305 0.985147 -0.371753 0.326635 -0.149336 0.28615 0.0412167 0.0354352 0.067787 0.468809 -0.0107497 -1.87303 1.04869 -1.51924 -0.295088 -0.0487846 0.115146 +seen -0.71857 0.491566 -0.365478 -0.516139 0.388454 -1.00757 -0.242899 -0.402609 0.034346 -1.01287 0.310414 -0.128819 -0.698396 0.0536124 -1.37792 0.535317 -0.301905 0.259182 0.376771 -0.0851261 0.824739 0.418587 0.449715 0.275706 0.0484187 0.0280655 0.205416 -0.417212 0.744874 -0.674378 0.109957 -0.181063 0.432291 -0.28916 -0.360516 0.131561 -0.0886217 -0.438105 -0.0322467 -0.376898 -0.55245 0.163206 0.568589 -0.495484 0.372311 1.30351 -0.16984 0.188656 0.570349 0.591616 +railway -0.430743 0.905941 -0.622825 -1.2703 -0.00685976 -0.418989 -0.980981 2.0686 0.300418 0.371512 -0.377908 0.433318 -0.0770555 0.988855 -0.309904 0.777847 -2.10251 0.50649 1.59671 0.572197 -0.51283 1.18255 0.26943 0.19007 -0.872869 1.86895 1.6807 -0.2896 -0.169663 -0.268336 -0.166898 0.771873 -0.029385 0.615933 0.490025 -0.25185 -0.168267 2.6783 -0.0659001 -0.0148777 -0.111302 0.93442 0.226568 -0.753005 1.37715 -0.38667 -0.85122 -1.42807 -0.208632 1.00927 +opened 0.747965 0.429743 -0.12929 -0.171475 0.602347 0.307795 -0.976647 -0.218172 0.0188585 0.283522 -0.385666 0.0159465 -1.27533 0.509163 -0.481511 0.105385 -1.22483 0.702386 0.446091 0.807681 -0.257794 0.676176 0.584792 0.444764 -0.359055 0.508828 0.943551 -1.30078 0.405166 -0.605828 0.980198 0.18548 0.260352 0.218028 0.175275 0.0346037 0.486685 -0.189505 0.354741 -0.786856 0.472254 0.632321 0.212089 -0.0134537 0.476938 0.342847 -0.52383 -0.660781 0.549419 1.16043 +how -0.243309 0.180763 0.399299 -0.0523453 -0.533442 -0.701187 -0.320987 -1.05509 0.216315 -0.787633 -0.361391 -0.734285 0.227907 0.328327 -0.370847 0.544748 0.101176 0.329833 0.168386 
-0.0328976 1.03008 0.88644 0.461728 -0.0923324 -0.14843 -0.0791058 0.340873 0.334188 -0.649402 -0.373891 -0.133747 0.0983689 -0.57085 -0.16731 0.247448 -0.0663162 -0.894356 0.0693459 0.28387 -0.546372 -0.519785 0.0738213 0.858837 -0.151751 -0.177521 0.683362 -1.13953 0.584497 -0.404098 0.0286054 +songs -0.898995 0.87752 -0.353292 0.137287 -1.78738 -0.874365 0.943847 -0.870763 0.138906 -1.46893 -0.204181 0.249155 -1.55574 -0.633453 0.360072 0.723862 -2.00536 -0.56168 0.419792 0.13992 -0.13303 1.72891 0.71216 -0.17208 -1.75881 0.825779 0.817386 0.083175 -1.02334 0.955521 0.0278337 0.897693 0.892152 0.350151 0.753303 -1.35105 -0.0871441 0.0202617 0.737127 0.156984 -0.230983 0.963865 0.350363 0.326351 -1.27438 1.66176 -0.472912 -0.329335 -0.609726 -0.821854 +less -0.665735 0.180023 0.139403 0.472021 -0.0889496 -0.744968 0.537342 -0.293299 -0.146147 0.495068 -0.339964 -0.438762 0.0656721 0.0681927 -1.11427 0.334049 -0.231464 0.21808 0.0222591 0.112905 0.769482 0.546561 0.142939 0.484853 -0.101116 -0.880445 0.634432 0.0427747 0.528412 -0.441222 -1.62807 -0.0441359 0.814121 -0.0379144 0.400506 -0.235482 -0.0853247 0.834466 -0.153848 -0.558916 -0.578212 0.476135 -0.105015 -0.568945 -1.02965 0.467052 -0.637434 0.107296 0.217042 -0.0993407 +go 1.04368 0.371785 0.195444 0.405715 -0.363767 -0.766235 -0.451076 -1.27209 -0.15136 -0.33252 0.226351 -0.172871 0.0909573 0.114133 0.371654 0.403079 -0.804778 0.696536 -0.0334784 -0.141312 0.796411 1.19757 0.601888 0.191877 -0.0631509 -0.0915499 0.700079 0.345227 -0.688224 -0.253884 -0.0983955 -0.352589 0.128815 -0.220508 0.125233 -0.814794 -0.102057 0.170571 0.262699 -0.830532 -0.16459 0.047768 -0.148913 -0.285861 0.603785 0.282919 -0.981094 -0.684171 -0.260814 0.406887 +census -0.336126 -0.970443 -1.27114 -0.579042 0.639274 1.28387 -1.4576 0.730455 1.16163 -2.01398 -0.556476 -0.812787 -0.527099 -0.428329 -0.51762 -0.793915 -1.59538 0.961592 -0.674313 1.2918 0.0746798 1.76989 -0.915049 0.683826 -1.0808 2.22383 2.54945 -0.578539 0.952373 -0.582319 -2.11815 1.47656 -0.418021 -0.713966 -1.91001 -0.699761 -0.0502829 1.17653 0.0321542 -0.810297 0.60095 0.611657 0.337166 -1.40897 -0.716501 0.146441 -1.64593 -0.0583592 -0.0600848 -0.64666 +largest -0.805262 -0.45756 -0.402159 0.554478 -0.09567 0.450541 -0.404318 0.0850735 0.309757 0.267898 -0.508075 -0.39763 -0.6631 0.561725 -0.550426 -0.844222 -1.58969 0.66407 0.939478 0.419897 -0.0478623 0.235754 -1.34414 -0.524838 -0.121046 -0.00928398 0.76924 -1.89794 0.848524 -0.307384 0.0851751 0.219665 0.289906 -0.549535 -0.958779 0.406719 -0.250603 0.224846 -0.758312 -0.179698 -0.306901 0.0402611 0.333733 -0.691732 -0.55253 0.288747 -0.930715 0.127092 0.19876 0.4713 +science 0.521057 0.099013 -0.393183 0.27467 -0.242844 0.586737 -1.30292 -0.597864 0.457066 -0.592998 0.497992 -1.38468 -0.0981909 0.0474054 -1.66617 0.138141 0.0378106 0.290259 -0.0347713 0.31116 1.58569 0.141911 0.392463 -1.24593 -0.388528 -0.242519 1.3331 -0.606423 -0.733673 -0.984586 -0.168409 1.76013 -1.08281 0.167754 0.233515 0.10368 0.196494 0.188059 -0.493318 -0.204479 -0.416818 0.618977 1.86843 0.162879 0.274662 -1.2412 -0.54955 0.892772 0.540331 -0.829893 +love -0.22527 0.281563 -0.0427492 1.09322 -1.29093 -0.817766 0.00995718 -0.643947 -0.32439 -1.0248 0.0892264 -0.720094 -0.10537 -0.216423 1.40232 0.726444 -0.407491 -0.31129 0.136583 0.263306 0.772924 0.764611 0.824216 -0.563466 -0.332536 0.366658 0.610393 -0.406464 -0.80289 -0.207188 0.179191 0.668683 0.756467 -0.323814 0.52074 -0.746755 -0.636494 -0.000715647 0.446159 -0.713497 0.163273 0.732395 
0.940609 -0.317623 -0.437066 1.95039 0.179032 -0.381887 0.87277 0.291055 +thus -0.127048 0.189046 0.67688 0.0540789 -0.311781 -0.360587 0.292567 0.524074 0.348891 0.320875 -0.190383 -0.256395 0.148199 0.0580616 -0.175063 0.187918 -0.409218 0.404786 -0.0300594 -0.121048 0.345922 0.824295 0.0450184 0.259369 0.258758 -0.023781 -0.327642 0.0436394 0.0338356 -0.732579 -0.642038 -0.0891962 0.0696129 -0.514902 0.138283 -0.103859 -0.197119 0.253738 -0.145788 -0.204739 -0.483732 0.267172 0.791218 -0.579582 -0.389237 0.374117 -0.476925 0.360869 -0.118313 -0.00388496 +income -0.395617 0.23925 -0.00705623 0.0900733 -0.573392 0.196656 -0.795479 0.992838 1.57947 -0.307498 -0.767318 -0.888785 0.190811 -0.471579 0.530756 -1.00515 0.318592 0.87968 -0.0987604 0.240239 1.17513 1.95057 -0.601905 0.698178 -1.14153 1.26232 1.86686 -0.972138 1.23796 -1.06551 -1.86591 0.577409 1.27859 0.623003 0.954147 0.523879 -0.0679263 0.808361 -0.0541037 -1.1868 0.982143 0.165164 0.295239 -0.500047 -0.917797 0.25487 -1.06523 -0.400965 0.560635 -0.779745 +must 0.0275672 0.15508 0.178196 -0.828624 0.122142 -1.30161 -0.149164 -0.223645 0.651762 -0.063244 -0.687759 -0.871213 -0.115765 -0.332525 -0.135652 0.883863 -0.249303 0.37801 -0.133129 -0.713746 0.703007 1.2883 -0.0874114 0.171628 -0.93245 -0.226758 -0.299101 -0.41624 -0.161914 -1.67366 -0.431543 -0.338857 0.413669 -0.703128 0.117174 0.0181266 -0.421898 -1.00197 0.386061 -0.318582 -1.50689 0.268546 1.08579 -0.294963 -0.490938 0.0149448 -1.27939 0.734301 0.0830815 0.645906 +wrote 0.73152 -0.455486 -0.0574692 -0.310174 -0.906844 -0.151505 0.0895868 -0.657059 0.0538085 -1.25276 0.643397 0.243841 -0.631004 0.986101 -0.264505 0.620815 -0.300494 -0.440385 0.168961 0.241658 0.247751 -0.22401 0.705703 -1.24423 -1.60367 0.348317 1.69472 -0.00589747 -0.351252 -0.303834 -0.475044 0.3175 0.0467823 0.0449244 1.38288 -0.240699 0.0985855 -0.22427 0.0761454 0.201267 -0.0207509 1.06528 0.848397 0.249511 0.0340756 1.20952 -0.843583 0.332812 -0.509592 0.445993 +miles 0.438228 -1.00455 -0.92274 0.529886 0.500337 -0.824577 -0.476112 0.216698 -0.151637 -0.345667 -0.308839 -1.59652 0.00768664 0.343984 0.179464 0.836144 -1.6372 -0.205033 0.37317 1.83232 0.481459 0.487999 0.321503 0.0284641 -0.994443 0.136356 1.56808 -0.148503 0.916471 1.01374 -0.53967 0.0670986 0.900098 -0.203583 -0.443501 -0.278873 -0.130971 1.28744 0.246731 -0.860146 -0.0871332 0.294454 0.285083 -0.608196 -0.150229 -0.941881 -1.50569 -0.755205 0.157377 0.512389 +light 0.184631 0.843698 0.0121161 -0.193279 -0.0464833 -0.885123 0.248542 0.35713 1.28536 0.0634537 -0.0760105 -0.344919 -0.538535 0.179087 -0.299877 0.608341 -1.0939 0.153736 0.237839 0.644724 0.519564 0.302159 0.868432 -0.844244 1.06834 -0.868117 0.625122 -0.0976311 -0.178391 0.00571192 -0.187659 0.173466 0.118326 -0.0921671 0.411641 0.0524106 -0.179696 0.827069 0.153751 -0.202037 -0.364177 0.0240819 0.561118 -0.532218 -0.180518 -0.213086 0.213542 -0.145982 1.01792 0.135463 +race -0.962976 -0.0693464 -0.126208 1.07947 0.977708 -1.14288 -0.732395 -0.00983884 0.779044 -1.31663 0.0487888 -1.02697 -0.239803 -0.574027 0.531162 -0.139597 -1.03794 0.182206 -0.583878 -1.07466 1.24926 0.401672 -0.4442 0.212241 -0.085219 0.160037 1.0221 -0.031932 0.886767 -1.1401 0.0580202 0.941783 0.828355 -0.668227 0.330663 -0.904114 0.772183 2.00515 0.924753 -0.277427 0.0593043 0.442971 1.0116 -0.950434 1.20708 -1.22135 -0.908097 0.881529 -0.312112 -0.691011 +taken 0.284384 0.530703 -0.666649 -0.344726 0.430191 -0.837095 0.199672 -0.254833 -0.167545 -0.406914 -0.035365 -0.309306 -0.399097 
-0.273103 -0.297771 0.350487 -0.724091 0.470534 0.0180011 0.546573 0.163926 0.274167 0.375487 0.484274 -0.0258932 0.133737 0.323893 -0.29062 0.702921 -0.814825 0.165159 -0.328803 0.114952 -0.367586 0.25519 0.371712 0.131328 -0.776822 0.025687 -0.605715 -0.046296 0.6225 0.84804 -0.272425 0.228351 0.702392 -1.06875 0.316213 -0.0285581 0.57686 +training 0.589549 0.813916 -0.507591 0.706647 0.085425 -0.0814069 0.362883 0.486933 0.668739 0.744052 0.151347 -1.01604 -0.00889854 -0.30168 -0.610192 0.132429 0.717115 0.68189 1.03016 -0.0559924 0.407938 0.599367 0.91711 0.85373 0.241007 -0.465056 1.18039 0.136291 -0.884011 -0.901253 1.20961 0.983639 0.509142 1.15644 -0.438247 -0.931732 0.416956 -0.0908725 0.193921 -0.195625 -0.393049 0.712278 1.03379 0.953576 -0.284075 -0.184236 -1.15494 0.00448029 0.203475 0.995713 +minister -0.532739 -0.634409 -2.09668 0.428761 -1.30665 0.332568 -1.01027 0.910402 -0.0316209 -1.1473 0.507897 -0.649941 0.297397 0.604909 -0.442088 -0.307376 0.0984765 1.1146 -1.12514 1.1227 -0.719093 1.65131 0.635615 -0.288269 0.573179 0.61926 1.56883 -1.49504 -0.906251 -0.767229 -0.846675 0.49656 -0.0492062 0.295307 1.54989 -0.00749338 1.14308 0.901384 -0.607899 0.436132 -1.83421 1.26945 0.124798 -0.0823941 0.729581 0.839542 -0.587786 -0.356903 -0.40957 0.234654 diff --git a/matchzoo/datasets/embeddings/embed_word.txt b/matchzoo/datasets/embeddings/embed_word.txt new file mode 100644 index 0000000..d8ee0b6 --- /dev/null +++ b/matchzoo/datasets/embeddings/embed_word.txt @@ -0,0 +1,7 @@ +7 5 +asia 1 2 3 4 5 +beijing 1 1 1 1 1 +hot 2 2 2 2 2 +east 3 3 3 3 3 +capital 4 4 4 4 4 +china 5 5 5 5 5 diff --git a/matchzoo/datasets/embeddings/load_fasttext_embedding.py b/matchzoo/datasets/embeddings/load_fasttext_embedding.py new file mode 100644 index 0000000..629510c --- /dev/null +++ b/matchzoo/datasets/embeddings/load_fasttext_embedding.py @@ -0,0 +1,31 @@ +"""FastText embedding data loader.""" + +from pathlib import Path + +import matchzoo as mz +from matchzoo.utils import get_file +from matchzoo import embedding + +_fasttext_embedding_url = "https://dl.fbaipublicfiles.com/fasttext/vectors" \ + "-wiki/wiki.{}.vec" + + +def load_fasttext_embedding(language: str = 'en') -> mz.embedding.Embedding: + """ + Return the pretrained fasttext embedding. + + :param language: the language of embedding. Supported language can be + referred to "https://github.com/facebookresearch/fastText/blob/master" + "/docs/pretrained-vectors.md" + :return: The :class:`mz.embedding.Embedding` object. 
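# Editor's note (illustrative, not part of the patch): the toy embed_word.txt
# above uses the word2vec-style text layout that the loaders below expect:
# a "<vocab size> <dimension>" header line, then one "<token> <v1> ... <vn>"
# row per word.  A minimal parsing sketch of that layout:
import io
import numpy as np

toy_file = io.StringIO("7 5\nasia 1 2 3 4 5\nbeijing 1 1 1 1 1\n")
output_dim = int(toy_file.readline().strip().split(' ')[-1])   # header -> dimension 5
vectors = {}
for line in toy_file:
    token, *values = line.rstrip().split(' ')
    vectors[token] = np.asarray(values, dtype=float)
# vectors['asia'] -> array([1., 2., 3., 4., 5.])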
+ """ + file_name = _fasttext_embedding_url.split('/')[-1].format(language) + file_path = (Path(mz.USER_DATA_DIR) / 'fasttext').joinpath(file_name) + if not file_path.exists(): + mz.utils.get_file(file_name, + _fasttext_embedding_url.format(language), + extract=False, + cache_dir=mz.USER_DATA_DIR, + cache_subdir='fasttext') + return mz.embedding.load_from_file(file_path=str(file_path), + mode='fasttext') diff --git a/matchzoo/datasets/embeddings/load_glove_embedding.py b/matchzoo/datasets/embeddings/load_glove_embedding.py new file mode 100644 index 0000000..6357ba8 --- /dev/null +++ b/matchzoo/datasets/embeddings/load_glove_embedding.py @@ -0,0 +1,67 @@ +"""GloVe Embedding data loader.""" + +from pathlib import Path + +import matchzoo as mz +from matchzoo import embedding +from matchzoo.utils import get_file + +_glove_embedding_url = "http://nlp.stanford.edu/data/glove.6B.zip" + + +def load_glove_embedding(term_index: mz.preprocessors.units.Vocabulary.TermIndex = None, dimension: int = 50) -> mz.embedding.Embedding: + """ + Return the pretrained glove embedding. + + :param dimension: the size of embedding dimension, the value can only be + 50, 100, or 300. + :return: The :class:`mz.embedding.Embedding` object. + """ + file_name = 'glove.6B.' + str(dimension) + 'd.txt' + file_path = (Path(mz.USER_DATA_DIR) / 'glove').joinpath(file_name) + if not file_path.exists(): + mz.utils.get_file('glove_embedding', + _glove_embedding_url, + extract=True, + cache_dir=mz.USER_DATA_DIR, + cache_subdir='glove') + return mz.embedding.load_from_file(file_path=str(file_path), mode='glove', term_index=term_index) + + +def load_glove_embedding_matching(term_index: mz.preprocessors.units.Vocabulary.TermIndex = None, dimension: int = 50, + **kargs) -> mz.embedding.Embedding: + """ + Return the pretrained glove embedding. + + :param dimension: the size of embedding dimension, the value can only be + 50, 100, or 300. + :return: The :class:`mz.embedding.Embedding` object. + """ + file_name = 'glove.6B.' + str(dimension) + 'd.txt' + file_path = (Path(mz.USER_DATA_DIR) / 'glove').joinpath(file_name) + if not file_path.exists(): + mz.utils.get_file('glove_embedding', + _glove_embedding_url, + extract=True, + cache_dir=mz.USER_DATA_DIR, + cache_subdir='glove') + return mz.embedding.load_from_file_matching(file_path=str(file_path), mode='glove', term_index=term_index, **kargs) + + +def load_glove_embedding_FC(term_index: mz.preprocessors.units.Vocabulary.TermIndex = None, dimension: int = 50, **kargs) -> mz.embedding.Embedding: + """ + Return the pretrained glove embedding. + + :param dimension: the size of embedding dimension, the value can only be + 50, 100, or 300. + :return: The :class:`mz.embedding.Embedding` object. + """ + file_name = 'glove.6B.' 
+ str(dimension) + 'd.txt' + file_path = (Path(mz.USER_DATA_DIR) / 'glove').joinpath(file_name) + if not file_path.exists(): + mz.utils.get_file('glove_embedding', + _glove_embedding_url, + extract=True, + cache_dir=mz.USER_DATA_DIR, + cache_subdir='glove') + return mz.embedding.load_from_file_FC(file_path=str(file_path), mode='glove', term_index=term_index, **kargs) \ No newline at end of file diff --git a/matchzoo/embedding/__init__.py b/matchzoo/embedding/__init__.py new file mode 100644 index 0000000..0ceee99 --- /dev/null +++ b/matchzoo/embedding/__init__.py @@ -0,0 +1,4 @@ +from .embedding import Embedding +from .embedding import load_from_file +from .embedding import load_from_file_FC +from .embedding import load_from_file_matching diff --git a/matchzoo/embedding/embedding.py b/matchzoo/embedding/embedding.py new file mode 100644 index 0000000..ebb252f --- /dev/null +++ b/matchzoo/embedding/embedding.py @@ -0,0 +1,183 @@ +"""Matchzoo toolkit for token embedding.""" + +import csv +import typing + +import numpy as np +import pandas as pd + +import matchzoo as mz +from handlers.output_handler import FileHandler + +class Embedding(object): + """ + Embedding class. + + Examples:: + >>> import matchzoo as mz + >>> train_raw = mz.datasets.toy.load_data() + >>> pp = mz.preprocessors.NaivePreprocessor() + >>> train = pp.fit_transform(train_raw, verbose=0) + >>> vocab_unit = mz.build_vocab_unit(train, verbose=0) + >>> term_index = vocab_unit.state['term_index'] + >>> embed_path = mz.datasets.embeddings.EMBED_RANK + + To load from a file: + >>> embedding = mz.embedding.load_from_file(embed_path) + >>> matrix = embedding.build_matrix(term_index) + >>> matrix.shape[0] == len(term_index) + True + + To build your own: + >>> data = {'A':[0, 1], 'B':[2, 3]} + >>> embedding = mz.Embedding(data, 2) + >>> matrix = embedding.build_matrix({'A': 2, 'B': 1, '_PAD': 0}) + >>> matrix.shape == (3, 2) + True + + """ + + def __init__(self, data: dict, output_dim: int): + """ + Embedding. + + :param data: Dictionary to use as term to vector mapping. + :param output_dim: The dimension of embedding. + """ + self._data = data + self._output_dim = output_dim + + def build_matrix( + self, + term_index: typing.Union[ + dict, mz.preprocessors.units.Vocabulary.TermIndex], + initializer=lambda: np.random.uniform(-0.2, 0.2) + ) -> np.ndarray: + """ + Build a matrix using `term_index`. + + :param term_index: A `dict` or `TermIndex` to build with. + :param initializer: A callable that returns a default value for missing + terms in data. (default: a random uniform distribution in range) + `(-0.2, 0.2)`). + :return: A matrix. + """ + input_dim = len(term_index) + matrix = np.empty((input_dim, self._output_dim)) + valid_keys = self._data.keys() + for term, index in sorted(term_index.items(), key = lambda x: x[1]): # Starting the smallest index to the largest + if term in valid_keys: + matrix[index] = self._data[term] + else: + matrix[index] = initializer() + return matrix + + +def load_from_file(file_path: str, mode: str = 'word2vec', term_index: mz.preprocessors.units.Vocabulary.TermIndex = None) -> Embedding: + """ + Load embedding from `file_path`. + + :param file_path: Path to file. + :param mode: Embedding file format mode, one of 'word2vec', 'fasttext' + or 'glove'.(default: 'word2vec') + :return: An :class:`matchzoo.embedding.Embedding` instance. 
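# Editor's note (illustrative, not part of the patch): Embedding.build_matrix
# above accepts a custom initializer, so out-of-vocabulary rows can be zeroed
# instead of drawn from the default uniform(-0.2, 0.2).  The import path
# mirrors the file added here; note the module also imports FileHandler from
# an external `handlers` package, which must be importable.
import numpy as np
from matchzoo.embedding.embedding import Embedding

embedding = Embedding({'north': [0.1, 0.2], 'south': [0.3, 0.4]}, output_dim=2)
term_index = {'_PAD': 0, 'north': 1, 'west': 2}   # 'west' has no pretrained vector
matrix = embedding.build_matrix(term_index, initializer=lambda: np.zeros(2))
# matrix[1] == [0.1, 0.2]; rows 0 and 2 fall back to the zero initializer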
+ """ + embedding_data = {} + output_dim = 0 + count_word_hit = 0 + if mode == 'word2vec' or mode == 'fasttext': + with open(file_path, 'r') as f: + output_dim = int(f.readline().strip().split(' ')[-1]) + for line in f: + current_line = line.rstrip().split(' ') + if current_line[0] not in term_index: continue + embedding_data[current_line[0]] = current_line[1:] + count_word_hit += 1 + elif mode == 'glove': + with open(file_path, 'r', encoding = "utf-8") as f: + output_dim = len(f.readline().rstrip().split(' ')) - 1 + f.seek(0) + for line in f: + current_line = line.rstrip().split(' ') + if current_line[0] not in term_index: continue + embedding_data[current_line[0]] = current_line[1:] + count_word_hit += 1 + else: raise TypeError("%s is not a supported embedding type. `word2vec`, `fasttext` or `glove` expected." % mode) + + FileHandler.myprint("Word hit: " + str((count_word_hit, len(term_index))) + " " + str(count_word_hit / len(term_index) * 100)) + + return Embedding(embedding_data, output_dim) + + +def load_from_file_matching(file_path: str, mode: str = 'word2vec', + term_index: mz.preprocessors.units.Vocabulary.TermIndex = None, **kargs) -> Embedding: + """ + Load embedding from `file_path`. + + :param file_path: Path to file. + :param mode: Embedding file format mode, one of 'word2vec', 'fasttext' + or 'glove'.(default: 'word2vec') + :return: An :class:`matchzoo.embedding.Embedding` instance. + """ + embedding_data = {} + output_dim = 0 + count_word_hit = 0 + if mode == 'word2vec' or mode == 'fasttext': + with open(file_path, 'r') as f: + output_dim = int(f.readline().strip().split(' ')[-1]) + for line in f: + current_line = line.rstrip().split(' ') + if current_line[0] not in term_index: continue + embedding_data[current_line[0]] = current_line[1:] + count_word_hit += 1 + elif mode == 'glove': + with open(file_path, 'r', encoding = "utf-8") as f: + output_dim = len(f.readline().rstrip().split(' ')) - 1 + f.seek(0) + for line in f: + current_line = line.rstrip().split(' ') + if current_line[0] not in term_index: continue + embedding_data[current_line[0]] = current_line[1:] + count_word_hit += 1 + else: raise TypeError("%s is not a supported embedding type. `word2vec`, `fasttext` or `glove` expected." % mode) + output_handler = kargs["output_handler_multiprocessing"] + output_handler.myprint("Word hit: " + str((count_word_hit, len(term_index))) + " " + str(count_word_hit / len(term_index) * 100)) + + return Embedding(embedding_data, output_dim) + + +def load_from_file_FC(file_path: str, mode: str = 'word2vec', term_index: mz.preprocessors.units.Vocabulary.TermIndex = None, **kargs) -> Embedding: + """ + Load embedding from `file_path`. + + :param file_path: Path to file. + :param mode: Embedding file format mode, one of 'word2vec', 'fasttext' + or 'glove'.(default: 'word2vec') + :return: An :class:`matchzoo.embedding.Embedding` instance. 
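# Editor's note (illustrative refactor sketch, not part of the patch):
# load_from_file, load_from_file_matching and load_from_file_FC share the same
# parsing logic and differ only in which handler reports the word-hit ratio.
# A single loader with a pluggable `report` callable could replace all three;
# the name below is hypothetical and assumes the module-level Embedding class.
def _load_filtered_embedding(file_path, mode='word2vec', term_index=None, report=print):
    embedding_data, output_dim, hits = {}, 0, 0
    with open(file_path, 'r', encoding='utf-8') as f:
        if mode in ('word2vec', 'fasttext'):
            output_dim = int(f.readline().strip().split(' ')[-1])  # consume the header
        elif mode == 'glove':
            output_dim = len(f.readline().rstrip().split(' ')) - 1
            f.seek(0)                                              # glove files have no header
        else:
            raise TypeError('%s is not a supported embedding type.' % mode)
        for line in f:
            token, *values = line.rstrip().split(' ')
            if token in term_index:                                # keep only indexed terms
                embedding_data[token] = values
                hits += 1
    report('Word hit: %d / %d (%.2f%%)'
           % (hits, len(term_index), 100.0 * hits / len(term_index)))
    return Embedding(embedding_data, output_dim)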
+ """ + embedding_data = {} + output_dim = 0 + count_word_hit = 0 + if mode == 'word2vec' or mode == 'fasttext': + with open(file_path, 'r') as f: + output_dim = int(f.readline().strip().split(' ')[-1]) + for line in f: + current_line = line.rstrip().split(' ') + if current_line[0] not in term_index: continue + embedding_data[current_line[0]] = current_line[1:] + count_word_hit += 1 + elif mode == 'glove': + with open(file_path, 'r', encoding = "utf-8") as f: + output_dim = len(f.readline().rstrip().split(' ')) - 1 + f.seek(0) + for line in f: + current_line = line.rstrip().split(' ') + if current_line[0] not in term_index: continue + embedding_data[current_line[0]] = current_line[1:] + count_word_hit += 1 + else: raise TypeError("%s is not a supported embedding type. `word2vec`, `fasttext` or `glove` expected." % mode) + + output_handler = kargs["output_handler_fact_checking"] + output_handler.myprint("Word hit: " + str((count_word_hit, len(term_index))) + " " + str(count_word_hit / len(term_index) * 100)) + + return Embedding(embedding_data, output_dim) \ No newline at end of file diff --git a/matchzoo/embedding/entity_embedding.py b/matchzoo/embedding/entity_embedding.py new file mode 100644 index 0000000..85c6d47 --- /dev/null +++ b/matchzoo/embedding/entity_embedding.py @@ -0,0 +1,50 @@ +"""Matchzoo toolkit for token embedding.""" + +import csv +import typing + +import numpy as np +import pandas as pd + +import matchzoo as mz +from handlers.output_handler import FileHandler + + +class EntityEmbedding(object): + """ + Embedding class for entities + + Examples:: + + """ + + def __init__(self, output_dim: int): + """ + Embedding. + + :param data: Dictionary to use as term to vector mapping. + :param output_dim: The dimension of embedding. + """ + self._output_dim = output_dim + + def build_matrix( + self, + term_index: typing.Union[ + dict, mz.preprocessors.units.Vocabulary.TermIndex], + initializer=lambda: np.random.uniform(-0.2, 0.2) + ) -> np.ndarray: + """ + Build a matrix using `term_index`. + + :param term_index: A `dict` or `TermIndex` to build with. + :param initializer: A callable that returns a default value for missing + terms in data. (default: a random uniform distribution in range) + `(-0.2, 0.2)`). + :return: A matrix. + """ + input_dim = len(term_index) + matrix = np.empty((input_dim, self._output_dim)) + # Starting the smallest index to the largest to ensure reproducibility + for term, index in sorted(term_index.items(), key = lambda x: x[1]): + matrix[index] = initializer() + return matrix diff --git a/matchzoo/engine/__init__.py b/matchzoo/engine/__init__.py new file mode 100644 index 0000000..c58af07 --- /dev/null +++ b/matchzoo/engine/__init__.py @@ -0,0 +1,4 @@ +# `engine` dependencies span across the entire project, so it's better to +# leave this __init__.py empty, and use `from matchzoo.engine.package import +# x` or `from matchzoo.engine import package` instead of `from matchzoo +# import engine`. diff --git a/matchzoo/engine/base_metric.py b/matchzoo/engine/base_metric.py new file mode 100644 index 0000000..2a87f2a --- /dev/null +++ b/matchzoo/engine/base_metric.py @@ -0,0 +1,39 @@ +"""Metric base class and some related utilities.""" + +import abc + +import numpy as np + + +class BaseMetric(abc.ABC): + """Metric base class.""" + + ALIAS = 'base_metric' + + @abc.abstractmethod + def __call__(self, y_true: np.array, y_pred: np.array) -> float: + """ + Call to compute the metric. + + :param y_true: An array of groud truth labels. 
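# Editor's note (illustrative, not part of the patch): EntityEmbedding.build_matrix
# above draws every row from the initializer, so reproducibility also needs a
# seeded NumPy RNG on top of the sorted iteration order.  The entity ids below
# are hypothetical.
import numpy as np
from matchzoo.embedding.entity_embedding import EntityEmbedding

np.random.seed(42)                                   # fix the seed before building
entity_matrix = EntityEmbedding(output_dim=8).build_matrix(
    {'_PAD': 0, 'Q42': 1, 'Q64': 2})
# entity_matrix.shape == (3, 8)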
+ :param y_pred: An array of predicted values. + :return: Evaluation of the metric. + """ + + @abc.abstractmethod + def __repr__(self): + """:return: Formatted string representation of the metric.""" + + def __eq__(self, other): + """:return: `True` if two metrics are equal, `False` otherwise.""" + return (type(self) is type(other)) and (vars(self) == vars(other)) + + def __hash__(self): + """:return: Hashing value using the metric as `str`.""" + return str(self).__hash__() + + +def sort_and_couple(labels: np.array, scores: np.array) -> np.array: + """Zip the `labels` with `scores` into a single list.""" + couple = list(zip(labels, scores)) + return np.array(sorted(couple, key=lambda x: x[1], reverse=True)) diff --git a/matchzoo/engine/base_preprocessor.py b/matchzoo/engine/base_preprocessor.py new file mode 100644 index 0000000..7fc57d1 --- /dev/null +++ b/matchzoo/engine/base_preprocessor.py @@ -0,0 +1,141 @@ +""":class:`BasePreprocessor` defines input and output for preprocessors.""" + +import abc +import functools +import typing +from pathlib import Path + +import dill + +import matchzoo as mz + + +def validate_context(func): + """Validate context in the preprocessor.""" + + @functools.wraps(func) + def transform_wrapper(self, *args, **kwargs): + if not self.context: + raise ValueError('Please call `fit` before calling `transform`.') + return func(self, *args, **kwargs) + + return transform_wrapper + + +class BasePreprocessor(metaclass=abc.ABCMeta): + """ + :class:`BasePreprocessor` to handle input data. + + A preprocessor should be used in two steps. First, `fit`, then, + `transform`. `fit` collects information into `context`, which includes + everything the preprocessor needs to `transform` together with other + useful information for later use. `fit` will only change the + preprocessor's inner state but not the input data. In contrast, + `transform` returns a modified copy of the input data without changing + the preprocessor's inner state. + + """ + + DATA_FILENAME = 'preprocessor.dill' + + def __init__(self): + """Initialization.""" + self._context = {} + + @property + def context(self): + """Return context.""" + return self._context + + @abc.abstractmethod + def fit( + self, + data_pack: 'mz.DataPack', + verbose: int = 1 + ) -> 'BasePreprocessor': + """ + Fit parameters on input data. + + This is an abstract base method and needs to be + implemented in the child class. + + This method is expected to return itself as a callable + object. + + :param data_pack: :class:`DataPack` object to be fitted. + :param verbose: Verbosity. + """ + + @abc.abstractmethod + def transform( + self, + data_pack: 'mz.DataPack', + verbose: int = 1 + ) -> 'mz.DataPack': + """ + Transform input data into the expected format. + + This is an abstract base method and needs to be + implemented in the child class. + + :param data_pack: :class:`DataPack` object to be transformed, + or list of text-left, text-right tuples. + :param verbose: Verbosity. + """ + + def fit_transform( + self, + data_pack: 'mz.DataPack', + verbose: int = 1 + ) -> 'mz.DataPack': + """ + Call fit-transform. + + :param data_pack: :class:`DataPack` object to be processed. + :param verbose: Verbosity. + """ + return self.fit(data_pack, verbose=verbose) \ + .transform(data_pack, verbose=verbose) + + def save(self, dirpath: typing.Union[str, Path]): + """ + Save the :class:`DSSMPreprocessor` object.
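# Editor's note (illustrative, not part of the patch): how a concrete metric
# fills in the BaseMetric interface above using sort_and_couple.  MatchZoo
# ships its own metric classes; this one exists only as an example.
import numpy as np
from matchzoo.engine.base_metric import BaseMetric, sort_and_couple

class ReciprocalRank(BaseMetric):
    ALIAS = 'reciprocal_rank'

    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        # Rank items by descending score and return 1 / rank of the first
        # relevant (label > 0) item, or 0.0 if nothing is relevant.
        for rank, (label, _) in enumerate(sort_and_couple(y_true, y_pred), 1):
            if label > 0:
                return 1.0 / rank
        return 0.0

    def __repr__(self):
        return self.ALIAS

# ReciprocalRank()(np.array([0, 1, 0]), np.array([0.9, 0.5, 0.1])) -> 0.5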
+ + A saved :class:`DSSMPreprocessor` is represented as a directory with + the `context` object (fitted parameters on training data), it will + be saved by `pickle`. + + :param dirpath: directory path of the saved :class:`DSSMPreprocessor`. + """ + dirpath = Path(dirpath) + data_file_path = dirpath.joinpath(self.DATA_FILENAME) + + if data_file_path.exists(): + raise FileExistsError( + '{data_file_path} instance exist, fail to save.'.format(data_file_path = data_file_path)) + elif not dirpath.exists(): + dirpath.mkdir() + + dill.dump(self, open(data_file_path, mode='wb')) + + @classmethod + def _default_units(cls) -> list: + """Prepare needed process units.""" + return [ + mz.preprocessors.units.tokenize.Tokenize(), + mz.preprocessors.units.lowercase.Lowercase(), + mz.preprocessors.units.punc_removal.PuncRemoval(), + ] + + +def load_preprocessor(dirpath: typing.Union[str, Path]) -> 'mz.DataPack': + """ + Load the fitted `context`. The reverse function of :meth:`save`. + + :param dirpath: directory path of the saved model. + :return: a :class:`DSSMPreprocessor` instance. + """ + dirpath = Path(dirpath) + + data_file_path = dirpath.joinpath(BasePreprocessor.DATA_FILENAME) + return dill.load(open(data_file_path, 'rb')) diff --git a/matchzoo/engine/base_task.py b/matchzoo/engine/base_task.py new file mode 100644 index 0000000..9b18b66 --- /dev/null +++ b/matchzoo/engine/base_task.py @@ -0,0 +1,83 @@ +"""Base task.""" + +import typing +import abc + +from matchzoo.engine import base_metric +from matchzoo.engine import parse_metric + + +class BaseTask(abc.ABC): + """Base Task, shouldn't be used directly.""" + + def __init__(self, loss=None, metrics=None): + """ + Base task constructor. + + :param loss: By default the first loss in available losses. + :param metrics: + """ + self._loss = loss + self._metrics = self._convert_metrics(metrics) + self._assure_loss() + self._assure_metrics() + + def _convert_metrics(self, metrics): + if not metrics: + metrics = [] + elif not isinstance(metrics, list): + metrics = [metrics] + return [ + parse_metric.parse_metric(metric, self) for metric in metrics + ] + + def _assure_loss(self): + if not self._loss: + self._loss = self.list_available_losses()[0] + + def _assure_metrics(self): + if not self._metrics: + first_available = self.list_available_metrics()[0] + self._metrics = self._convert_metrics(first_available) + + @property + def loss(self): + """:return: Loss used in the task.""" + return self._loss + + @property + def metrics(self): + """:return: Metrics used in the task.""" + return self._metrics + + @metrics.setter + def metrics( + self, + new_metrics: typing.Union[ + typing.List[str], + typing.List[base_metric.BaseMetric], + str, + base_metric.BaseMetric + ] + ): + self._metrics = self._convert_metrics(new_metrics) + + @classmethod + @abc.abstractmethod + def list_available_losses(cls) -> list: + """:return: a list of available losses.""" + + @classmethod + @abc.abstractmethod + def list_available_metrics(cls) -> list: + """:return: a list of available metrics.""" + + @property + @abc.abstractmethod + def output_shape(self) -> tuple: + """:return: output shape of a single sample of the task.""" + + @property + @abc.abstractmethod + def output_dtype(self): + """:return: output data type for specific task.""" diff --git a/matchzoo/engine/callbacks.py b/matchzoo/engine/callbacks.py new file mode 100644 index 0000000..fe1c01c --- /dev/null +++ b/matchzoo/engine/callbacks.py @@ -0,0 +1,73 @@ +"""Callbacks.""" +import typing +from pathlib import Path + 
+import numpy as np +import keras + +import matchzoo +from matchzoo.engine.base_model import BaseModel + + +class EvaluateAllMetrics(keras.callbacks.Callback): + """ + Callback to evaluate all metrics. + + MatchZoo metrics can not be evaluated batch-wise since they require + dataset-level information. As a result, MatchZoo metrics are not + evaluated automatically when a Model `fit`. When this callback is used, + all metrics, including MatchZoo metrics and Keras metrics, are evluated + once every `once_every` epochs. + + :param model: Model to evaluate. + :param x: X. + :param y: y. + :param once_every: Evaluation only triggers when `epoch % once_every == 0`. + (default: 1, i.e. evaluate on every epoch's end) + :param batch_size: Number of samples per evaluation. This only affects the + evaluation of Keras metrics, since MatchZoo metrics are always + evaluated using the full data. + :param model_save_path: Directory path to save the model after each + evaluate callback, (default: None, i.e., no saving.) + :param verbose: Verbosity. + """ + + def __init__( + self, + model: 'BaseModel', + x: typing.Union[np.ndarray, typing.List[np.ndarray]], + y: np.ndarray, + once_every: int = 1, + batch_size: int = 128, + model_save_path: str = None, + verbose=1 + ): + """Initializer.""" + super().__init__() + self._model = model + self._dev_x = x + self._dev_y = y + self._valid_steps = once_every + self._batch_size = batch_size + self._model_save_path = model_save_path + self._verbose = verbose + + def on_epoch_end(self, epoch: int, logs: dict = None): + """ + Called at the end of en epoch. + + :param epoch: integer, index of epoch. + :param logs: dictionary of logs. + :return: dictionary of logs. + """ + if (epoch + 1) % self._valid_steps == 0: + val_logs = self._model.evaluate(self._dev_x, self._dev_y, + self._batch_size) + if self._verbose: + print('Validation: ' + ' - '.join( + f'{k}: {v}' for k, v in val_logs.items())) + for k, v in val_logs.items(): + logs[k] = v + if self._model_save_path: + curr_path = self._model_save_path + str('%d/' % (epoch + 1)) + self._model.save(curr_path) diff --git a/matchzoo/engine/hyper_spaces.py b/matchzoo/engine/hyper_spaces.py new file mode 100644 index 0000000..1193347 --- /dev/null +++ b/matchzoo/engine/hyper_spaces.py @@ -0,0 +1,216 @@ +"""Hyper parameter search spaces wrapping `hyperopt`.""" +import typing +import numbers + +import hyperopt +import hyperopt.pyll.base + + +class HyperoptProxy(object): + """ + Hyperopt proxy class. + + See `hyperopt`'s documentation for more details: + https://github.com/hyperopt/hyperopt/wiki/FMin + + Reason of these wrappers: + + A hyper space in `hyperopt` requires a `label` to instantiate. This + `label` is used later as a reference to original hyper space that is + sampled. In `matchzoo`, hyper spaces are used in + :class:`matchzoo.engine.Param`. Only if a hyper space's label + matches its parent :class:`matchzoo.engine.Param`'s name, `matchzoo` + can correctly back-refrenced the parameter got sampled. This can be + done by asking the user always use the same name for a parameter and + its hyper space, but typos can occur. As a result, these wrappers + are created to hide hyper spaces' `label`, and always correctly + bind them with its parameter's name. 
+ + Examples:: + >>> import matchzoo as mz + >>> from hyperopt.pyll.stochastic import sample + + Basic Usage: + >>> model = mz.models.DenseBaseline() + >>> sample(model.params.hyper_space) # doctest: +SKIP + {'mlp_num_layers': 1.0, 'mlp_num_units': 274.0} + + Arithmetic Operations: + >>> new_space = 2 ** mz.hyper_spaces.quniform(2, 6) + >>> model.params.get('mlp_num_layers').hyper_space = new_space + >>> sample(model.params.hyper_space) # doctest: +SKIP + {'mlp_num_layers': 8.0, 'mlp_num_units': 292.0} + + """ + + def __init__( + self, + hyperopt_func: typing.Callable[..., hyperopt.pyll.Apply], + **kwargs + ): + """ + :class:`HyperoptProxy` constructor. + + :param hyperopt_func: Target `hyperopt.hp` function to proxy. + :param kwargs: Keyword arguments of the proxy function, must pass all + parameters in `hyperopt_func`. + """ + self._func = hyperopt_func + self._kwargs = kwargs + + def convert(self, name: str) -> hyperopt.pyll.Apply: + """ + Attach `name` as `hyperopt.hp`'s `label`. + + :param name: + :return: a `hyperopt` ready search space + """ + return self._func(name, **self._kwargs) + + def __add__(self, other): + """__add__.""" + return _wrap_as_composite_func(self, other, lambda x, y: x + y) + + def __radd__(self, other): + """__radd__.""" + return _wrap_as_composite_func(self, other, lambda x, y: x + y) + + def __sub__(self, other): + """__sub__.""" + return _wrap_as_composite_func(self, other, lambda x, y: x - y) + + def __rsub__(self, other): + """__rsub__.""" + return _wrap_as_composite_func(self, other, lambda x, y: y - x) + + def __mul__(self, other): + """__mul__.""" + return _wrap_as_composite_func(self, other, lambda x, y: x * y) + + def __rmul__(self, other): + """__rmul__.""" + return _wrap_as_composite_func(self, other, lambda x, y: x * y) + + def __truediv__(self, other): + """__truediv__.""" + return _wrap_as_composite_func(self, other, lambda x, y: x / y) + + def __rtruediv__(self, other): + """__rtruediv__.""" + return _wrap_as_composite_func(self, other, lambda x, y: y / x) + + def __floordiv__(self, other): + """__floordiv__.""" + return _wrap_as_composite_func(self, other, lambda x, y: x // y) + + def __rfloordiv__(self, other): + """__rfloordiv__.""" + return _wrap_as_composite_func(self, other, lambda x, y: y // x) + + def __pow__(self, other): + """__pow__.""" + return _wrap_as_composite_func(self, other, lambda x, y: x ** y) + + def __rpow__(self, other): + """__rpow__.""" + return _wrap_as_composite_func(self, other, lambda x, y: y ** x) + + def __neg__(self): + """__neg__.""" + return _wrap_as_composite_func(self, None, lambda x, _: -x) + + +def _wrap_as_composite_func(self, other, func): + def _wrapper(name, **kwargs): + return func(self._func(name, **kwargs), other) + + return HyperoptProxy(_wrapper, **self._kwargs) + + +class choice(HyperoptProxy): + """:func:`hyperopt.hp.choice` proxy.""" + + def __init__(self, options: list): + """ + :func:`hyperopt.hp.choice` proxy. + + :param options: options to search from + """ + super().__init__(hyperopt_func=hyperopt.hp.choice, options=options) + self._options = options + + def __str__(self): + """:return: `str` representation of the hyper space.""" + return 'choice in %s'.format(self._options) + + +class quniform(HyperoptProxy): + """:func:`hyperopt.hp.quniform` proxy.""" + + def __init__( + self, + low: numbers.Number, + high: numbers.Number, + q: numbers.Number = 1 + ): + """ + :func:`hyperopt.hp.quniform` proxy. + + If using with integer values, then `high` is exclusive. 
+
+        :param low: lower bound of the space
+        :param high: upper bound of the space
+        :param q: similar to the `step` in the Python built-in `range`
+        """
+        super().__init__(hyperopt_func=hyperopt.hp.quniform,
+                         low=low,
+                         high=high, q=q)
+        self._low = low
+        self._high = high
+        self._q = q
+
+    def __str__(self):
+        """:return: `str` representation of the hyper space."""
+        return 'quantitative uniform distribution in ' \
+               f'[{self._low}, {self._high}], with a step size of {self._q}'
+
+
+class uniform(HyperoptProxy):
+    """:func:`hyperopt.hp.uniform` proxy."""
+
+    def __init__(
+        self,
+        low: numbers.Number,
+        high: numbers.Number
+    ):
+        """
+        :func:`hyperopt.hp.uniform` proxy.
+
+        :param low: lower bound of the space
+        :param high: upper bound of the space
+        """
+        super().__init__(hyperopt_func=hyperopt.hp.uniform, low=low, high=high)
+        self._low = low
+        self._high = high
+
+    def __str__(self):
+        """:return: `str` representation of the hyper space."""
+        return f'uniform distribution in [{self._low}, {self._high})'
+
+
+def sample(space):
+    """
+    Take a sample in the hyper space.
+
+    This method is stateless, so the distribution of the samples is different
+    from that of a `tune` call. This function just gives a general idea of what
+    a sample from the `space` looks like.
+
+    Example:
+        >>> import matchzoo as mz
+        >>> space = mz.models.Naive.get_default_params().hyper_space
+        >>> mz.hyper_spaces.sample(space)  # doctest: +ELLIPSIS
+        {'optimizer': ...}
+
+    """
+    return hyperopt.pyll.stochastic.sample(space)
diff --git a/matchzoo/engine/param.py b/matchzoo/engine/param.py
new file mode 100644
index 0000000..dac178b
--- /dev/null
+++ b/matchzoo/engine/param.py
@@ -0,0 +1,243 @@
+"""Parameter class."""
+
+import inspect
+import numbers
+import typing
+
+import hyperopt.pyll
+
+from matchzoo.engine import hyper_spaces
+
+# Both hyperopt native spaces and matchzoo proxies are valid spaces.
+SpaceType = typing.Union[hyperopt.pyll.Apply, hyper_spaces.HyperoptProxy]
+
+
+class Param(object):
+    """
+    Parameter class.
+
+    Basic usages with a name and value:
+
+    >>> param = Param('my_param', 10)
+    >>> param.name
+    'my_param'
+    >>> param.value
+    10
+
+    Use with a validator to make sure the parameter always keeps a valid
+    value.
+
+    >>> param = Param(
+    ...     name='my_param',
+    ...     value=5,
+    ...     validator=lambda x: 0 < x < 20
+    ... )
+    >>> param.validator  # doctest: +ELLIPSIS
+    <function <lambda> at 0x...>
+    >>> param.value
+    5
+    >>> param.value = 10
+    >>> param.value
+    10
+    >>> param.value = -1
+    Traceback (most recent call last):
+        ...
+    ValueError: Validator not satifised.
+    The validator's definition is as follows:
+    validator=lambda x: 0 < x < 20
+
+    Use with a hyper space. Setting up a hyper space for a parameter makes the
+    parameter tunable in a :class:`matchzoo.engine.Tuner`.
+
+    >>> from matchzoo.engine.hyper_spaces import quniform
+    >>> param = Param(
+    ...     name='positive_num',
+    ...     value=1,
+    ...     hyper_space=quniform(low=1, high=5)
+    ... )
+    >>> param.hyper_space  # doctest: +ELLIPSIS
+    <matchzoo.engine.hyper_spaces.quniform object at ...>
+    >>> from hyperopt.pyll.stochastic import sample
+    >>> hyperopt_space = param.hyper_space.convert(param.name)
+    >>> samples = [sample(hyperopt_space) for _ in range(64)]
+    >>> set(samples) == {1, 2, 3, 4, 5}
+    True
+
+    The boolean value of a :class:`Param` instance is only `True`
+    when the value is not `None`. This is because some default falsy values
+    like zero or an empty list are valid parameter values. In other words,
+    the boolean value indicates whether the parameter value has been filled.
+ + >>> param = Param('dropout') + >>> if param: + ... print('OK') + >>> param = Param('dropout', 0) + >>> if param: + ... print('OK') + OK + + A `_pre_assignment_hook` is initialized as a data type convertor if the + value is set as a number to keep data type consistency of the parameter. + This conversion supports python built-in numbers, `numpy` numbers, and + any number that inherits :class:`numbers.Number`. + + >>> param = Param('float_param', 0.5) + >>> param.value = 10 + >>> param.value + 10.0 + >>> type(param.value) + + + """ + + def __init__( + self, + name: str, + value: typing.Any = None, + hyper_space: typing.Optional[SpaceType] = None, + validator: typing.Optional[ + typing.Callable[[typing.Any], bool]] = None, + desc: typing.Optional[str] = None, + ): + """ + Parameter constructor. + + :param name: Name of the parameter. + :param value: Value of the parameter, `None` by default, which means + "this parameter is not filled yet." + :param hyper_space: Hyper space of the parameter, `None` by default. + If set, then a :class:`matchzoo.engine.ParamTable` that has this + parameter will include this `hyper_space` as a part of the + parameter table's search space. + :param validator: Validator of the parameter, `None` by default. If + validation is needed, pass a callable that, given a value, returns + a `bool`. The definition of the validator is retrieved when the + validation fails, so either use a function or a `lambda` that + occupies its own line for better readability. + """ + self._name = name + self._desc = desc + + self._value = None + self._hyper_space = None + self._validator = None + self._pre_assignment_hook = None + + self.validator = validator + self.hyper_space = hyper_space + + if value is not None: # bypass checking if no default + self.value = value + + @property + def name(self) -> str: + """:return: Name of the parameter.""" + return self._name + + @property + def value(self) -> typing.Any: + """:return: Value of the parameter.""" + return self._value + + @value.setter + def value(self, new_value: typing.Any): + """ + Set the value of parameter to `new_value`. + + Notice that this setter validates `new_value` before assignment. As + a result, if the validaiton fails, the value of the parameter is not + changed. + + :param new_value: New value of the parameter to set. 
+ """ + if self._pre_assignment_hook: + new_value = self._pre_assignment_hook(new_value) + self._validate(new_value) + self._value = new_value + if not self._pre_assignment_hook: + self._infer_pre_assignment_hook() + + @property + def hyper_space(self) -> SpaceType: + """:return: Hyper space of the parameter.""" + return self._hyper_space + + @hyper_space.setter + def hyper_space(self, new_space: SpaceType): + """:param new_space: New space of the parameter to set.""" + self._hyper_space = new_space + + @property + def validator(self) -> typing.Callable[[typing.Any], bool]: + """:return: Validator of the parameter.""" + return self._validator + + @validator.setter + def validator(self, new_validator: typing.Callable[[typing.Any], bool]): + """:param new_validator: New space of the parameter to set.""" + if new_validator and not callable(new_validator): + raise TypeError("Validator must be a callable or None.") + self._validator = new_validator + + @property + def desc(self) -> str: + """:return: Parameter description.""" + return self._desc + + @desc.setter + def desc(self, value: str): + """:param value: New description of the parameter.""" + self._desc = value + + def _infer_pre_assignment_hook(self): + if isinstance(self._value, numbers.Number): + self._pre_assignment_hook = lambda x: type(self._value)(x) + + def _validate(self, value): + if self._validator: + valid = self._validator(value) + if not valid: + error_msg = "Validator not satifised.\n" + error_msg += "The validator's definition is as follows:\n" + error_msg += inspect.getsource(self._validator).strip() + raise ValueError(error_msg) + + def __bool__(self): + """:return: `False` when the value is `None`, `True` otherwise.""" + return self._value is not None + + def set_default(self, val, verbose=1): + """ + Set default value, has no effect if already has a value. + + :param val: Default value to set. + :param verbose: Verbosity. + """ + if self._value is None: + self.value = val + if verbose: + print("Parameter \"%s\" set to %s.".format(self._name, val)) + + def reset(self): + """ + Set the parameter's value to `None`, which means "not set". + + This method bypasses validator. + + Example: + >>> import matchzoo as mz + >>> param = mz.Param( + ... name='str', validator=lambda x: isinstance(x, str)) + >>> param.value = 'hello' + >>> param.value = None + Traceback (most recent call last): + ... + ValueError: Validator not satifised. + The validator's definition is as follows: + name='str', validator=lambda x: isinstance(x, str)) + >>> param.reset() + >>> param.value is None + True + + """ + self._value = None diff --git a/matchzoo/engine/param_table.py b/matchzoo/engine/param_table.py new file mode 100644 index 0000000..186c4eb --- /dev/null +++ b/matchzoo/engine/param_table.py @@ -0,0 +1,169 @@ +"""Parameters table class.""" + +import typing +import pandas as pd +import collections.abc + +from matchzoo.engine.param import Param +from matchzoo.engine import hyper_spaces + + +class ParamTable(object): + """ + Parameter table class. + + Example: + + >>> params = ParamTable() + >>> params.add(Param('ham', 'Parma Ham')) + >>> params.add(Param('egg', 'Over Easy')) + >>> params['ham'] + 'Parma Ham' + >>> params['egg'] + 'Over Easy' + >>> print(params) + ham Parma Ham + egg Over Easy + >>> params.add(Param('egg', 'Sunny side Up')) + Traceback (most recent call last): + ... + ValueError: Parameter named egg already exists. + To re-assign parameter egg value, use `params["egg"] = value` instead. 
+ """ + + def __init__(self): + """Parameter table constrctor.""" + self._params = {} + + def add(self, param: Param): + """:param param: parameter to add.""" + if not isinstance(param, Param): + raise TypeError("Only accepts a Param instance.") + if param.name in self._params: + msg = "Parameter named %s already exists.\n" \ + "To re-assign parameter %s value, " \ + "use `params[\"%s\"] = value` instead.".format(param.name, param.name, param.name) + raise ValueError(msg) + self._params[param.name] = param + + def get(self, key) -> Param: + """:return: The parameter in the table named `key`.""" + return self._params[key] + + def set(self, key, param: Param): + """Set `key` to parameter `param`.""" + if not isinstance(param, Param): + raise ValueError("Only accepts a Param instance.") + self._params[key] = param + + @property + def hyper_space(self) -> dict: + """:return: Hyper space of the table, a valid `hyperopt` graph.""" + full_space = {} + for param in self: + if param.hyper_space is not None: + param_space = param.hyper_space + if isinstance(param_space, hyper_spaces.HyperoptProxy): + param_space = param_space.convert(param.name) + full_space[param.name] = param_space + return full_space + + def to_frame(self) -> pd.DataFrame: + """ + Convert the parameter table into a pandas data frame. + + :return: A `pandas.DataFrame`. + + Example: + >>> import matchzoo as mz + >>> table = mz.ParamTable() + >>> table.add(mz.Param(name='x', value=10, desc='my x')) + >>> table.add(mz.Param(name='y', value=20, desc='my y')) + >>> table.to_frame() + Name Description Value Hyper-Space + 0 x my x 10 None + 1 y my y 20 None + + """ + df = pd.DataFrame(data={ + 'Name': [p.name for p in self], + 'Description': [p.desc for p in self], + 'Value': [p.value for p in self], + 'Hyper-Space': [p.hyper_space for p in self] + }, columns=['Name', 'Description', 'Value', 'Hyper-Space']) + return df + + def __getitem__(self, key: str) -> typing.Any: + """:return: The value of the parameter in the table named `key`.""" + return self._params[key].value + + def __setitem__(self, key: str, value: typing.Any): + """ + Set the value of the parameter named `key`. + + :param key: Name of the parameter. + :param value: New value of the parameter to set. + """ + self._params[key].value = value + + def __str__(self): + """:return: Pretty formatted parameter table.""" + return '\n'.join(param.name.ljust(30) + str(param.value) + for param in self._params.values()) + + def __iter__(self) -> typing.Iterator: + """:return: A iterator that iterates over all parameter instances.""" + yield from self._params.values() + + def completed(self) -> bool: + """ + :return: `True` if all params are filled, `False` otherwise. + + Example: + + >>> import matchzoo + >>> model = matchzoo.models.Naive() + >>> model.params.completed() + False + >>> model.guess_and_fill_missing_params(verbose=0) + >>> model.params.completed() + True + + """ + return all(param for param in self) + + def keys(self) -> collections.abc.KeysView: + """:return: Parameter table keys.""" + return self._params.keys() + + def __contains__(self, item): + """:return: `True` if parameter in parameters.""" + return item in self._params + + def update(self, other: dict): + """ + Update `self`. + + Update `self` with the key/value pairs from other, overwriting + existing keys. Notice that this does not add new keys to `self`. + + This method is usually used by models to obtain useful information + from a preprocessor's context. + + :param other: The dictionary used update. 
+ + Example: + >>> import matchzoo as mz + >>> model = mz.models.DenseBaseline() + >>> model.params['input_shapes'] is None + True + >>> prpr = model.get_default_preprocessor() + >>> _ = prpr.fit(mz.datasets.toy.load_data(), verbose=0) + >>> model.params.update(prpr.context) + >>> model.params['input_shapes'] + [(30,), (30,)] + + """ + for key in other: + if key in self: + self[key] = other[key] diff --git a/matchzoo/engine/parse_metric.py b/matchzoo/engine/parse_metric.py new file mode 100644 index 0000000..6d921cc --- /dev/null +++ b/matchzoo/engine/parse_metric.py @@ -0,0 +1,78 @@ +import typing + +import matchzoo +from matchzoo.engine.base_metric import BaseMetric +from matchzoo.engine import base_task + + +def parse_metric( + metric: typing.Union[str, typing.Type[BaseMetric], BaseMetric], + task: 'base_task.BaseTask' = None +) -> typing.Union['BaseMetric', str]: + """ + Parse input metric in any form into a :class:`BaseMetric` instance. + + :param metric: Input metric in any form. + :param task: Task type for determining specific metric. + :return: A :class:`BaseMetric` instance + + Examples:: + >>> from matchzoo import metrics + >>> from matchzoo.engine.parse_metric import parse_metric + + Use `str` as keras native metrics: + >>> parse_metric('mse') + 'mse' + + Use `str` as MatchZoo metrics: + >>> mz_metric = parse_metric('map') + >>> type(mz_metric) + + + Use :class:`matchzoo.engine.BaseMetric` subclasses as MatchZoo metrics: + >>> type(parse_metric(metrics.AveragePrecision)) + + + Use :class:`matchzoo.engine.BaseMetric` instances as MatchZoo metrics: + >>> type(parse_metric(metrics.AveragePrecision())) + + + """ + if task is None: + task = matchzoo.tasks.Ranking() + + if isinstance(metric, str): + metric = metric.lower() # ignore case + + # matchzoo metrics in str form + for subclass in BaseMetric.__subclasses__(): + if metric == subclass.ALIAS or metric in subclass.ALIAS: + return subclass() + + # keras native metrics + return _remap_keras_metric(metric, task) + elif isinstance(metric, BaseMetric): + return metric + elif issubclass(metric, BaseMetric): + return metric() + else: + raise ValueError(metric) + + +def _remap_keras_metric(metric: str, task) -> str: + # we do not support sparse label in classification. + lookup = { + matchzoo.tasks.Ranking: { + 'acc': 'binary_accuracy', + 'accuracy': 'binary_accuracy', + 'crossentropy': 'binary_crossentropy', + 'ce': 'binary_crossentropy', + }, + matchzoo.tasks.Classification: { + 'acc': 'categorical_accuracy', + 'accuracy': 'categorical_accuracy', + 'crossentropy': 'categorical_crossentropy', + 'ce': 'categorical_crossentropy', + } + } + return lookup[type(task)].get(metric, metric) diff --git a/matchzoo/losses/rank_cross_entropy_loss.py b/matchzoo/losses/rank_cross_entropy_loss.py new file mode 100644 index 0000000..2bc6cbb --- /dev/null +++ b/matchzoo/losses/rank_cross_entropy_loss.py @@ -0,0 +1,48 @@ +"""The rank cross entropy loss.""" +import torch +from torch import nn +import torch.nn.functional as F + + +class RankCrossEntropyLoss(nn.Module): + """Creates a criterion that measures rank cross entropy loss.""" + + __constants__ = ['num_neg'] + + def __init__(self, num_neg: int = 1): + """ + :class:`RankCrossEntropyLoss` constructor. + + :param num_neg: Number of negative instances in hinge loss. + """ + super().__init__() + self.num_neg = num_neg + + def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor): + """ + Calculate rank cross entropy loss. + + :param y_pred: Predicted result. + :param y_true: Label. 
+ :return: Rank cross loss. + """ + logits = y_pred[::(self.num_neg + 1), :] + labels = y_true[::(self.num_neg + 1), :] + for neg_idx in range(self.num_neg): + neg_logits = y_pred[(neg_idx + 1)::(self.num_neg + 1), :] + neg_labels = y_true[(neg_idx + 1)::(self.num_neg + 1), :] + logits = torch.cat((logits, neg_logits), dim=-1) + labels = torch.cat((labels, neg_labels), dim=-1) + return -torch.mean( + torch.sum(labels * torch.log(F.softmax(logits, dim=-1)), dim=-1) + ) + + @property + def num_neg(self): + """`num_neg` getter.""" + return self._num_neg + + @num_neg.setter + def num_neg(self, value): + """`num_neg` setter.""" + self._num_neg = value diff --git a/matchzoo/losses/rank_hinge_loss.py b/matchzoo/losses/rank_hinge_loss.py new file mode 100644 index 0000000..d0aaf12 --- /dev/null +++ b/matchzoo/losses/rank_hinge_loss.py @@ -0,0 +1,86 @@ +"""The rank hinge loss.""" +import torch +from torch import nn +import torch.nn.functional as F + + +class RankHingeLoss(nn.Module): + """ + Creates a criterion that measures rank hinge loss. + + Given inputs :math:`x1`, :math:`x2`, two 1D mini-batch `Tensors`, + and a label 1D mini-batch tensor :math:`y` (containing 1 or -1). + + If :math:`y = 1` then it assumed the first input should be ranked + higher (have a larger value) than the second input, and vice-versa + for :math:`y = -1`. + + The loss function for each sample in the mini-batch is: + + .. math:: + loss_{x, y} = max(0, -y * (x1 - x2) + margin) + """ + + __constants__ = ['num_neg', 'margin', 'reduction'] + + def __init__(self, num_neg: int = 1, margin: float = 1., + reduction: str = 'mean'): + """ + :class:`RankHingeLoss` constructor. + + :param num_neg: Number of negative instances in hinge loss. + :param margin: Margin between positive and negative scores. + Float. Has a default value of :math:`0`. + :param reduction: String. Specifies the reduction to apply to + the output: ``'none'`` | ``'mean'`` | ``'sum'``. + ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the + number of elements in the output, + ``'sum'``: the output will be summed. + """ + super().__init__() + self.num_neg = num_neg + self.margin = margin + self.reduction = reduction + + def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor): + """ + Calculate rank hinge loss. + + :param y_pred: Predicted result. + :param y_true: Label. + :return: Hinge loss computed by user-defined margin. 
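+
+        Example (a minimal illustrative sketch; with ``num_neg=1`` the
+        rows of ``y_pred`` alternate positive, negative, and the passed
+        ``y_true`` is not used beyond the interface):
+            >>> import torch
+            >>> loss = RankHingeLoss(num_neg=1, margin=1.)
+            >>> y_pred = torch.tensor([[1.0], [0.5], [0.8], [0.2]])
+            >>> y_true = torch.tensor([[1.0], [0.0], [1.0], [0.0]])
+            >>> round(loss(y_pred, y_true).item(), 2)
+            0.45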
+ """ + y_pos = y_pred[::(self.num_neg + 1), :] + y_neg = [] + for neg_idx in range(self.num_neg): + neg = y_pred[(neg_idx + 1)::(self.num_neg + 1), :] + y_neg.append(neg) + y_neg = torch.cat(y_neg, dim=-1) + y_neg = torch.mean(y_neg, dim=-1, keepdim=True) + y_true = torch.ones_like(y_pos) + return F.margin_ranking_loss( + y_pos, y_neg, y_true, + margin=self.margin, + reduction=self.reduction + ) + + @property + def num_neg(self): + """`num_neg` getter.""" + return self._num_neg + + @num_neg.setter + def num_neg(self, value): + """`num_neg` setter.""" + self._num_neg = value + + @property + def margin(self): + """`margin` getter.""" + return self._margin + + @margin.setter + def margin(self, value): + """`margin` setter.""" + self._margin = value diff --git a/matchzoo/metrics/__init__.py b/matchzoo/metrics/__init__.py new file mode 100644 index 0000000..e98062b --- /dev/null +++ b/matchzoo/metrics/__init__.py @@ -0,0 +1,13 @@ +from .precision import Precision +from .average_precision import AveragePrecision +from .discounted_cumulative_gain import DiscountedCumulativeGain +from .mean_reciprocal_rank import MeanReciprocalRank +from .mean_average_precision import MeanAveragePrecision +from .normalized_discounted_cumulative_gain import \ + NormalizedDiscountedCumulativeGain + + +def list_available() -> list: + from matchzoo.engine.base_metric import BaseMetric + from matchzoo.utils import list_recursive_concrete_subclasses + return list_recursive_concrete_subclasses(BaseMetric) diff --git a/matchzoo/metrics/average_precision.py b/matchzoo/metrics/average_precision.py new file mode 100644 index 0000000..60dbee9 --- /dev/null +++ b/matchzoo/metrics/average_precision.py @@ -0,0 +1,45 @@ +"""Average precision metric for ranking.""" +import numpy as np + +from matchzoo.engine import base_metric +from . import Precision + + +class AveragePrecision(base_metric.BaseMetric): + """Average precision metric.""" + + ALIAS = ['average_precision', 'ap'] + + def __init__(self, threshold: float = 0.): + """ + :class:`AveragePrecision` constructor. + + :param threshold: The label threshold of relevance degree. + """ + self._threshold = threshold + + def __repr__(self) -> str: + """:return: Formated string representation of the metric.""" + return "%s(%s)".format(self.ALIAS[0], self._threshold) + + def __call__(self, y_true: np.array, y_pred: np.array) -> float: + """ + Calculate average precision (area under PR curve). + + Example: + >>> y_true = [0, 1] + >>> y_pred = [0.1, 0.6] + >>> round(AveragePrecision()(y_true, y_pred), 2) + 0.75 + >>> round(AveragePrecision()([], []), 2) + 0.0 + + :param y_true: The ground true label of each document. + :param y_pred: The predicted scores of each document. + :return: Average precision. + """ + precision_metrics = [Precision(k + 1) for k in range(len(y_pred))] + out = [metric(y_true, y_pred) for metric in precision_metrics] + if not out: + return 0. 
+ return np.asscalar(np.mean(out)) diff --git a/matchzoo/metrics/discounted_cumulative_gain.py b/matchzoo/metrics/discounted_cumulative_gain.py new file mode 100644 index 0000000..b6c711f --- /dev/null +++ b/matchzoo/metrics/discounted_cumulative_gain.py @@ -0,0 +1,62 @@ +"""Discounted cumulative gain metric for ranking.""" +import math + +import numpy as np + +from matchzoo.engine.base_metric import BaseMetric, sort_and_couple + + +class DiscountedCumulativeGain(BaseMetric): + """Disconunted cumulative gain metric.""" + + ALIAS = ['discounted_cumulative_gain', 'dcg'] + + def __init__(self, k: int = 1, threshold: float = 0.): + """ + :class:`DiscountedCumulativeGain` constructor. + + :param k: Number of results to consider. + :param threshold: the label threshold of relevance degree. + """ + self._k = k + self._threshold = threshold + + def __repr__(self) -> str: + """:return: Formated string representation of the metric.""" + return "%s@%s(%s)".format(self.ALIAS[0], self._k, self._threshold) + + def __call__(self, y_true: np.array, y_pred: np.array) -> float: + """ + Calculate discounted cumulative gain (dcg). + + Relevance is positive real values or binary values. + + Example: + >>> y_true = [0, 1, 2, 0] + >>> y_pred = [0.4, 0.2, 0.5, 0.7] + >>> DiscountedCumulativeGain(1)(y_true, y_pred) + 0.0 + >>> round(DiscountedCumulativeGain(k=-1)(y_true, y_pred), 2) + 0.0 + >>> round(DiscountedCumulativeGain(k=2)(y_true, y_pred), 2) + 2.73 + >>> round(DiscountedCumulativeGain(k=3)(y_true, y_pred), 2) + 2.73 + >>> type(DiscountedCumulativeGain(k=1)(y_true, y_pred)) + + + :param y_true: The ground true label of each document. + :param y_pred: The predicted scores of each document. + + :return: Discounted cumulative gain. + """ + if self._k <= 0: + return 0. + coupled_pair = sort_and_couple(y_true, y_pred) + result = 0. + for i, (label, score) in enumerate(coupled_pair): + if i >= self._k: + break + if label > self._threshold: + result += (math.pow(2., label) - 1.) / math.log(2. + i) + return result diff --git a/matchzoo/metrics/mean_average_precision.py b/matchzoo/metrics/mean_average_precision.py new file mode 100644 index 0000000..4794bf9 --- /dev/null +++ b/matchzoo/metrics/mean_average_precision.py @@ -0,0 +1,48 @@ +"""Mean average precision metric for ranking.""" +import numpy as np + +from matchzoo.engine.base_metric import BaseMetric, sort_and_couple + + +class MeanAveragePrecision(BaseMetric): + """Mean average precision metric.""" + + ALIAS = ['mean_average_precision', 'map'] + + def __init__(self, threshold: float = 0.): + """ + :class:`MeanAveragePrecision` constructor. + + :param threshold: The threshold of relevance degree. + """ + self._threshold = threshold + + def __repr__(self): + """:return: Formated string representation of the metric.""" + return "%s(%s)".format(self.ALIAS[0], self._threshold) + + def __call__(self, y_true: np.array, y_pred: np.array) -> float: + """ + Calculate mean average precision. + + Example: + >>> y_true = [0, 1, 0, 0] + >>> y_pred = [0.1, 0.6, 0.2, 0.3] + >>> MeanAveragePrecision()(y_true, y_pred) + 1.0 + + :param y_true: The ground true label of each document. + :param y_pred: The predicted scores of each document. + :return: Mean average precision. + """ + result = 0. + pos = 0 + coupled_pair = sort_and_couple(y_true, y_pred) + for idx, (label, score) in enumerate(coupled_pair): + if label > self._threshold: + pos += 1. + result += pos / (idx + 1.) + if pos == 0: + return 0. 
+        else:
+            return result / pos
diff --git a/matchzoo/metrics/mean_reciprocal_rank.py b/matchzoo/metrics/mean_reciprocal_rank.py
new file mode 100644
index 0000000..c04056d
--- /dev/null
+++ b/matchzoo/metrics/mean_reciprocal_rank.py
@@ -0,0 +1,43 @@
+"""Mean reciprocal ranking metric."""
+import numpy as np
+
+from matchzoo.engine.base_metric import BaseMetric, sort_and_couple
+
+
+class MeanReciprocalRank(BaseMetric):
+    """Mean reciprocal rank metric."""
+
+    ALIAS = ['mean_reciprocal_rank', 'mrr']
+
+    def __init__(self, threshold: float = 0.):
+        """
+        :class:`MeanReciprocalRank` constructor.
+
+        :param threshold: The label threshold of relevance degree.
+        """
+        self._threshold = threshold
+
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f'{self.ALIAS[0]}({self._threshold})'
+
+    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
+        """
+        Calculate reciprocal of the rank of the first relevant item.
+
+        Example:
+            >>> import numpy as np
+            >>> y_pred = np.asarray([0.2, 0.3, 0.7, 1.0])
+            >>> y_true = np.asarray([1, 0, 0, 0])
+            >>> MeanReciprocalRank()(y_true, y_pred)
+            0.25
+
+        :param y_true: The ground true label of each document.
+        :param y_pred: The predicted scores of each document.
+        :return: Mean reciprocal rank.
+        """
+        coupled_pair = sort_and_couple(y_true, y_pred)
+        for idx, (label, pred) in enumerate(coupled_pair):
+            if label > self._threshold:
+                return 1. / (idx + 1)
+        return 0.
diff --git a/matchzoo/metrics/normalized_discounted_cumulative_gain.py b/matchzoo/metrics/normalized_discounted_cumulative_gain.py
new file mode 100644
index 0000000..bbde0de
--- /dev/null
+++ b/matchzoo/metrics/normalized_discounted_cumulative_gain.py
@@ -0,0 +1,55 @@
+"""Normalized discounted cumulative gain metric for ranking."""
+import numpy as np
+
+from matchzoo.engine.base_metric import BaseMetric, sort_and_couple
+from .discounted_cumulative_gain import DiscountedCumulativeGain
+
+
+class NormalizedDiscountedCumulativeGain(BaseMetric):
+    """Normalized discounted cumulative gain metric."""
+
+    ALIAS = ['normalized_discounted_cumulative_gain', 'ndcg']
+
+    def __init__(self, k: int = 1, threshold: float = 0.):
+        """
+        :class:`NormalizedDiscountedCumulativeGain` constructor.
+
+        :param k: Number of results to consider.
+        :param threshold: the label threshold of relevance degree.
+        """
+        self._k = k
+        self._threshold = threshold
+
+    def __repr__(self) -> str:
+        """:return: Formatted string representation of the metric."""
+        return f"{self.ALIAS[0]}@{self._k}({self._threshold})"
+
+    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
+        """
+        Calculate normalized discounted cumulative gain (ndcg).
+
+        Relevance is positive real values or binary values.
+
+        Example:
+            >>> y_true = [0, 1, 2, 0]
+            >>> y_pred = [0.4, 0.2, 0.5, 0.7]
+            >>> ndcg = NormalizedDiscountedCumulativeGain
+            >>> ndcg(k=1)(y_true, y_pred)
+            0.0
+            >>> round(ndcg(k=2)(y_true, y_pred), 2)
+            0.52
+            >>> round(ndcg(k=3)(y_true, y_pred), 2)
+            0.52
+            >>> type(ndcg()(y_true, y_pred))
+            <class 'float'>
+
+        :param y_true: The ground true label of each document.
+        :param y_pred: The predicted scores of each document.
+
+        :return: Normalized discounted cumulative gain.
+ """ + dcg_metric = DiscountedCumulativeGain(k=self._k, + threshold=self._threshold) + idcg_val = dcg_metric(y_true, y_true) + dcg_val = dcg_metric(y_true, y_pred) + return dcg_val / idcg_val if idcg_val != 0 else 0 diff --git a/matchzoo/metrics/precision.py b/matchzoo/metrics/precision.py new file mode 100644 index 0000000..00537de --- /dev/null +++ b/matchzoo/metrics/precision.py @@ -0,0 +1,57 @@ +"""Precision for ranking.""" +import numpy as np + +from matchzoo.engine.base_metric import BaseMetric, sort_and_couple + + +class Precision(BaseMetric): + """Precision metric.""" + + ALIAS = 'precision' + + def __init__(self, k: int = 1, threshold: float = 0.): + """ + :class:`PrecisionMetric` constructor. + + :param k: Number of results to consider. + :param threshold: the label threshold of relevance degree. + """ + self._k = k + self._threshold = threshold + + def __repr__(self) -> str: + """:return: Formated string representation of the metric.""" + return "%s@%s(%s)".format(self.ALIAS, self._k, self._threshold) + + def __call__(self, y_true: np.array, y_pred: np.array) -> float: + """ + Calculate precision@k. + + Example: + >>> y_true = [0, 0, 0, 1] + >>> y_pred = [0.2, 0.4, 0.3, 0.1] + >>> Precision(k=1)(y_true, y_pred) + 0.0 + >>> Precision(k=2)(y_true, y_pred) + 0.0 + >>> Precision(k=4)(y_true, y_pred) + 0.25 + >>> Precision(k=5)(y_true, y_pred) + 0.2 + + :param y_true: The ground true label of each document. + :param y_pred: The predicted scores of each document. + :return: Precision @ k + :raises: ValueError: len(r) must be >= k. + """ + if self._k <= 0: + raise ValueError("k must be greater than 0." + "%s received.".format(self._k)) + coupled_pair = sort_and_couple(y_true, y_pred) + precision = 0.0 + for idx, (label, score) in enumerate(coupled_pair): + if idx >= self._k: + break + if label > self._threshold: + precision += 1. + return precision / self._k diff --git a/matchzoo/modules/__init__.py b/matchzoo/modules/__init__.py new file mode 100644 index 0000000..eee030c --- /dev/null +++ b/matchzoo/modules/__init__.py @@ -0,0 +1,8 @@ +from .attention import Attention +from .attention import BidirectionalAttention +from .attention import MatchModule +from .dropout import RNNDropout +from .stacked_brnn import StackedBRNN +from .gaussian_kernel import GaussianKernel +from .matching import Matching +from .bert_module import BertModule diff --git a/matchzoo/modules/attention.py b/matchzoo/modules/attention.py new file mode 100644 index 0000000..d6e5dd5 --- /dev/null +++ b/matchzoo/modules/attention.py @@ -0,0 +1,107 @@ +"""Attention module.""" +import typing + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Attention(nn.Module): + """ + Attention module. + + :param input_size: Size of input. + :param mask: An integer to mask the invalid values. Defaults to 0. 
+ + Examples: + >>> import torch + >>> attention = Attention(input_size=10) + >>> x = torch.randn(4, 5, 10) + >>> x.shape + torch.Size([4, 5, 10]) + >>> attention(x).shape + torch.Size([4, 5]) + + """ + + def __init__(self, input_size: int = 100, mask: int = 0): + """Attention constructor.""" + super().__init__() + self.linear = nn.Linear(input_size, 1, bias=False) + self.mask = mask + + def forward(self, x): + """Perform attention on the input.""" + x = self.linear(x).squeeze(dim=-1) + mask = (x != self.mask) + x = x.masked_fill(mask == self.mask, -float('inf')) + return F.softmax(x, dim=-1) + + +class BidirectionalAttention(nn.Module): + """Computing the soft attention between two sequence.""" + + def __init__(self): + """Init.""" + super().__init__() + + def forward(self, v1, v1_mask, v2, v2_mask): + """Forward.""" + similarity_matrix = v1.bmm(v2.transpose(2, 1).contiguous()) + + v2_v1_attn = F.softmax( + similarity_matrix.masked_fill( + v1_mask.unsqueeze(2), -1e-7), dim=1) + v1_v2_attn = F.softmax( + similarity_matrix.masked_fill( + v2_mask.unsqueeze(1), -1e-7), dim=2) + + attended_v1 = v1_v2_attn.bmm(v2) + attended_v2 = v2_v1_attn.transpose(1, 2).bmm(v1) + + attended_v1.masked_fill_(v1_mask.unsqueeze(2), 0) + attended_v2.masked_fill_(v2_mask.unsqueeze(2), 0) + + return attended_v1, attended_v2 + + +class MatchModule(nn.Module): + """ + Computing the match representation for Match LSTM. + + :param hidden_size: Size of hidden vectors. + :param dropout_rate: Dropout rate of the projection layer. Defaults to 0. + + Examples: + >>> import torch + >>> attention = MatchModule(hidden_size=10) + >>> v1 = torch.randn(4, 5, 10) + >>> v1.shape + torch.Size([4, 5, 10]) + >>> v2 = torch.randn(4, 5, 10) + >>> v2_mask = torch.ones(4, 5).to(dtype=torch.uint8) + >>> attention(v1, v2, v2_mask).shape + torch.Size([4, 5, 20]) + + + """ + + def __init__(self, hidden_size, dropout_rate=0): + """Init.""" + super().__init__() + self.v2_proj = nn.Linear(hidden_size, hidden_size) + self.proj = nn.Linear(hidden_size * 4, hidden_size * 2) + self.dropout = nn.Dropout(p=dropout_rate) + + def forward(self, v1, v2, v2_mask): + """Computing attention vectors and projection vectors.""" + proj_v2 = self.v2_proj(v2) + similarity_matrix = v1.bmm(proj_v2.transpose(2, 1).contiguous()) + + v1_v2_attn = F.softmax( + similarity_matrix.masked_fill( + v2_mask.unsqueeze(1).bool(), -1e-7), dim=2) + v2_wsum = v1_v2_attn.bmm(v2) + fusion = torch.cat([v1, v2_wsum, v1 - v2_wsum, v1 * v2_wsum], dim=2) + match = self.dropout(F.relu(self.proj(fusion))) + return match diff --git a/matchzoo/modules/bert_module.py b/matchzoo/modules/bert_module.py new file mode 100644 index 0000000..b147991 --- /dev/null +++ b/matchzoo/modules/bert_module.py @@ -0,0 +1,30 @@ +"""Bert module.""" +import typing + +import torch +import torch.nn as nn +from pytorch_transformers import BertModel + + +class BertModule(nn.Module): + """ + Bert module. + + BERT (from Google) released with the paper BERT: Pre-training of Deep + Bidirectional Transformers for Language Understanding by Jacob Devlin, + Ming-Wei Chang, Kenton Lee and Kristina Toutanova. + + :param mode: String, supported mode can be referred + https://huggingface.co/pytorch-transformers/pretrained_models.html. 
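+
+    Example (an illustrative sketch, skipped in doctests because it
+    downloads pretrained weights; ``x`` and ``y`` are assumed to be token
+    id tensors that get concatenated along the sequence axis):
+        >>> import torch
+        >>> bert = BertModule(mode='bert-base-uncased')  # doctest: +SKIP
+        >>> x = torch.randint(0, 100, (2, 8))            # doctest: +SKIP
+        >>> y = torch.randint(0, 100, (2, 8))            # doctest: +SKIP
+        >>> sequence_output = bert(x, y)[0]              # doctest: +SKIP
+        >>> sequence_output.shape                        # doctest: +SKIP
+        torch.Size([2, 16, 768])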
+ + """ + + def __init__(self, mode: str = 'bert-base-uncased'): + """:class:`BertModule` constructor.""" + super().__init__() + self.bert = BertModel.from_pretrained(mode) + + def forward(self, x, y): + """Forward.""" + inputs = torch.cat((x, y), dim=-1) + return self.bert(inputs) diff --git a/matchzoo/modules/dropout.py b/matchzoo/modules/dropout.py new file mode 100644 index 0000000..6f8a4c6 --- /dev/null +++ b/matchzoo/modules/dropout.py @@ -0,0 +1,18 @@ +import torch.nn as nn + + +class RNNDropout(nn.Dropout): + """Dropout for RNN.""" + + def forward(self, sequences_batch): + """Masking whole hidden vector for tokens.""" + # B: batch size + # L: sequence length + # D: hidden size + + # sequence_batch: BxLxD + ones = sequences_batch.data.new_ones(sequences_batch.shape[0], + sequences_batch.shape[-1]) + dropout_mask = nn.functional.dropout(ones, self.p, self.training, + inplace=False) + return dropout_mask.unsqueeze(1) * sequences_batch diff --git a/matchzoo/modules/gaussian_kernel.py b/matchzoo/modules/gaussian_kernel.py new file mode 100644 index 0000000..09fa510 --- /dev/null +++ b/matchzoo/modules/gaussian_kernel.py @@ -0,0 +1,36 @@ +"""Gaussian kernel module.""" +import typing + +import torch +import torch.nn as nn + + +class GaussianKernel(nn.Module): + """ + Gaussian kernel module. + + :param mu: Float, mean of the kernel. + :param sigma: Float, sigma of the kernel. + + Examples: + >>> import torch + >>> kernel = GaussianKernel() + >>> x = torch.randn(4, 5, 10) + >>> x.shape + torch.Size([4, 5, 10]) + >>> kernel(x).shape + torch.Size([4, 5, 10]) + + """ + + def __init__(self, mu: float = 1., sigma: float = 1.): + """Gaussian kernel constructor.""" + super().__init__() + self.mu = mu + self.sigma = sigma + + def forward(self, x): + """Forward.""" + return torch.exp( + -0.5 * ((x - self.mu) ** 2) / (self.sigma ** 2) + ) diff --git a/matchzoo/modules/matching.py b/matchzoo/modules/matching.py new file mode 100644 index 0000000..15852d4 --- /dev/null +++ b/matchzoo/modules/matching.py @@ -0,0 +1,61 @@ +"""Matching module.""" +import typing + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Matching(nn.Module): + """ + Module that computes a matching matrix between samples in two tensors. + + :param normalize: Whether to L2-normalize samples along the + dot product axis before taking the dot product. + If set to `True`, then the output of the dot product + is the cosine proximity between the two samples. + :param matching_type: the similarity function for matching + + Examples: + >>> import torch + >>> matching = Matching(matching_type='dot', normalize=True) + >>> x = torch.randn(2, 3, 2) + >>> y = torch.randn(2, 4, 2) + >>> matching(x, y).shape + torch.Size([2, 3, 4]) + + """ + + def __init__(self, normalize: bool = False, matching_type: str = 'dot'): + """:class:`Matching` constructor.""" + super().__init__() + self._normalize = normalize + self._validate_matching_type(matching_type) + self._matching_type = matching_type + + @classmethod + def _validate_matching_type(cls, matching_type: str = 'dot'): + valid_matching_type = ['dot', 'mul', 'plus', 'minus', 'concat'] + if matching_type not in valid_matching_type: + raise ValueError("%s is not a valid matching type, %s expected." 
% (matching_type, valid_matching_type)) + + def forward(self, x, y): + """Perform attention on the input.""" + length_left = x.shape[1] + length_right = y.shape[1] + if self._matching_type == 'dot': + if self._normalize: + x = F.normalize(x, p=2, dim=-1) + y = F.normalize(y, p=2, dim=-1) + return torch.einsum('bld,brd->blr', x, y) + else: + x = x.unsqueeze(dim=2).repeat(1, 1, length_right, 1) + y = y.unsqueeze(dim=1).repeat(1, length_left, 1, 1) + if self._matching_type == 'mul': + return x * y + elif self._matching_type == 'plus': + return x + y + elif self._matching_type == 'minus': + return x - y + elif self._matching_type == 'concat': + return torch.cat((x, y), dim=3) diff --git a/matchzoo/modules/stacked_brnn.py b/matchzoo/modules/stacked_brnn.py new file mode 100644 index 0000000..ecf7303 --- /dev/null +++ b/matchzoo/modules/stacked_brnn.py @@ -0,0 +1,91 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + + +class StackedBRNN(nn.Module): + """ + Stacked Bi-directional RNNs. + + Differs from standard PyTorch library in that it has the option to save + and concat the hidden states between layers. (i.e. the output hidden size + for each sequence input is num_layers * hidden_size). + + Examples: + >>> import torch + >>> rnn = StackedBRNN( + ... input_size=10, + ... hidden_size=10, + ... num_layers=2, + ... dropout_rate=0.2, + ... dropout_output=True, + ... concat_layers=False + ... ) + >>> x = torch.randn(2, 5, 10) + >>> x.size() + torch.Size([2, 5, 10]) + >>> x_mask = (torch.ones(2, 5) == 1) + >>> rnn(x, x_mask).shape + torch.Size([2, 5, 20]) + + """ + + def __init__(self, input_size, hidden_size, num_layers, + dropout_rate=0, dropout_output=False, rnn_type=nn.LSTM, + concat_layers=False): + """Stacked Bidirectional LSTM.""" + super().__init__() + self.dropout_output = dropout_output + self.dropout_rate = dropout_rate + self.num_layers = num_layers + self.concat_layers = concat_layers + self.rnns = nn.ModuleList() + for i in range(num_layers): + input_size = input_size if i == 0 else 2 * hidden_size + self.rnns.append(rnn_type(input_size, hidden_size, + num_layers=1, + bidirectional=True)) + + def forward(self, x, x_mask): + """Encode either padded or non-padded sequences.""" + if x_mask.data.sum() == 0: + # No padding necessary. + output = self._forward_unpadded(x, x_mask) + output = self._forward_unpadded(x, x_mask) + + return output.contiguous() + + def _forward_unpadded(self, x, x_mask): + """Faster encoding that ignores any padding.""" + # Transpose batch and sequence dims + x = x.transpose(0, 1) + + # Encode all layers + outputs = [x] + for i in range(self.num_layers): + rnn_input = outputs[-1] + + # Apply dropout to hidden input + if self.dropout_rate > 0: + rnn_input = F.dropout(rnn_input, + p=self.dropout_rate, + training=self.training) + # Forward + rnn_output = self.rnns[i](rnn_input)[0] + outputs.append(rnn_output) + + # Concat hidden layers + if self.concat_layers: + output = torch.cat(outputs[1:], 2) + else: + output = outputs[-1] + + # Transpose back + output = output.transpose(0, 1) + + # Dropout on output layer + if self.dropout_output and self.dropout_rate > 0: + output = F.dropout(output, + p=self.dropout_rate, + training=self.training) + return output diff --git a/matchzoo/preprocessors/__init__.py b/matchzoo/preprocessors/__init__.py new file mode 100644 index 0000000..9ca462e --- /dev/null +++ b/matchzoo/preprocessors/__init__.py @@ -0,0 +1,19 @@ +from . 
import units +from .dssm_preprocessor import DSSMPreprocessor +from .naive_preprocessor import NaivePreprocessor +from .basic_preprocessor import BasicPreprocessor +from .cdssm_preprocessor import CDSSMPreprocessor +from .mz_pretrained_preprocessor import PreTrainedModelsProcessor +from .char_ngram_preprocessor import CharNGramPreprocessor +from .elmo_basic_preprocessor import ElmoPreprocessor +from .bow_preprocessor import BoWPreprocessor +from .declare_preprocessor import DeClarePreprocessor +from .fact_checking_elmo_preprocessor import FactCheckingElmoPreprocessor +from .char_man_preprocessor import CharManPreprocessor +from .char_man_elmo_preprocessor import CharManElmoPreprocessor + + +def list_available() -> list: + from matchzoo.engine.base_preprocessor import BasePreprocessor + from matchzoo.utils import list_recursive_concrete_subclasses + return list_recursive_concrete_subclasses(BasePreprocessor) diff --git a/matchzoo/preprocessors/basic_preprocessor.py b/matchzoo/preprocessors/basic_preprocessor.py new file mode 100644 index 0000000..d72df17 --- /dev/null +++ b/matchzoo/preprocessors/basic_preprocessor.py @@ -0,0 +1,150 @@ +"""Basic Preprocessor.""" + +from tqdm import tqdm + +from . import units +from matchzoo import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .build_vocab_unit import build_vocab_unit +from .build_unit_from_data_pack import build_unit_from_data_pack +from .chain_transform import chain_transform +from handlers.output_handler import FileHandler + +tqdm.pandas() + + +class BasicPreprocessor(BasePreprocessor): + """ + Baisc preprocessor helper. + + :param fixed_length_left: Integer, maximize length of :attr:`left` in the + data_pack. + :param fixed_length_right: Integer, maximize length of :attr:`right` in the + data_pack. + :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can + be 'df', 'cf', and 'idf'. + :param filter_low_freq: Float, lower bound value used by + :class:`FrequenceFilterUnit`. + :param filter_high_freq: Float, upper bound value used by + :class:`FrequenceFilterUnit`. + :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data('train') + >>> test_data = mz.datasets.toy.load_data('test') + >>> preprocessor = mz.preprocessors.BasicPreprocessor( + ... fixed_length_left=10, + ... fixed_length_right=20, + ... filter_mode='df', + ... filter_low_freq=2, + ... filter_high_freq=1000, + ... remove_stop_words=True + ... ) + >>> preprocessor = preprocessor.fit(train_data, verbose=0) + >>> preprocessor.context['input_shapes'] + [(10,), (20,)] + >>> preprocessor.context['vocab_size'] + 225 + >>> processed_train_data = preprocessor.transform(train_data, + ... verbose=0) + >>> type(processed_train_data) + + >>> test_data_transformed = preprocessor.transform(test_data, + ... 
verbose=0) + >>> type(test_data_transformed) + + + """ + + def __init__(self, fixed_length_left: int = 30, + fixed_length_right: int = 30, + filter_mode: str = 'df', + filter_low_freq: float = 2, + filter_high_freq: float = float('inf'), + remove_stop_words: bool = False): + """Initialization.""" + super().__init__() + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._left_fixedlength_unit = units.FixedLength( + self._fixed_length_left, + pad_mode='post' + ) + self._right_fixedlength_unit = units.FixedLength( + self._fixed_length_right, + pad_mode='post' + ) + self._filter_unit = units.FrequencyFilter( + low=filter_low_freq, + high=filter_high_freq, + mode=filter_mode + ) + self._units = self._default_units() + if remove_stop_words: + self._units.append(units.stop_removal.StopRemoval()) + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param data_pack: data_pack to be preprocessed. + :param verbose: Verbosity. + :return: class:`BasicPreprocessor` instance. + """ + data_pack = data_pack.apply_on_text(chain_transform(self._units), + verbose=verbose) + fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, + data_pack, + flatten=False, + mode='right', + verbose=verbose) + data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, + mode='right', verbose=verbose) + self._context['filter_unit'] = fitted_filter_unit + + vocab_unit = build_vocab_unit(data_pack, verbose=verbose) + self._context['vocab_unit'] = vocab_unit + + vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding + self._context['vocab_size'] = vocab_size + self._context['embedding_input_dim'] = vocab_size + self._context['input_shapes'] = [(self._fixed_length_left,), + (self._fixed_length_right,)] + + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create fixed length representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. + """ + data_pack = data_pack.copy() + data_pack.apply_on_text(chain_transform(self._units), inplace=True, + verbose=verbose) + + data_pack.apply_on_text(self._context['filter_unit'].transform, + mode='right', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._context['vocab_unit'].transform, + mode='both', inplace=True, verbose=verbose) + data_pack.append_text_length(inplace=True, verbose=verbose) + data_pack.apply_on_text(self._left_fixedlength_unit.transform, + mode='left', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._right_fixedlength_unit.transform, + mode='right', inplace=True, verbose=verbose) + + max_len_left = self._fixed_length_left + max_len_right = self._fixed_length_right + + data_pack.left['length_left'] = \ + data_pack.left['length_left'].apply( + lambda val: min(val, max_len_left)) + + data_pack.right['length_right'] = \ + data_pack.right['length_right'].apply( + lambda val: min(val, max_len_right)) + return data_pack diff --git a/matchzoo/preprocessors/bow_preprocessor.py b/matchzoo/preprocessors/bow_preprocessor.py new file mode 100644 index 0000000..eb33559 --- /dev/null +++ b/matchzoo/preprocessors/bow_preprocessor.py @@ -0,0 +1,180 @@ +"""Basic Preprocessor.""" + +from tqdm import tqdm + +from . 
import units +from matchzoo import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .build_vocab_unit import build_vocab_unit +from .build_unit_from_data_pack import build_unit_from_data_pack +from .chain_transform import chain_transform +from handlers.output_handler import FileHandler +from typing import List +import torch +import itertools, os +from matchzoo.preprocessors.units import Unit + +tqdm.pandas() + + +class BoWPreprocessor(BasePreprocessor): + """ + Bag of word preprocessor. Fit is same as Basic Processor but transform will transform text into bag of words + + :param fixed_length_left: Integer, maximize length of :attr:`left` in the + data_pack. + :param fixed_length_right: Integer, maximize length of :attr:`right` in the + data_pack. + :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can + be 'df', 'cf', and 'idf'. + :param filter_low_freq: Float, lower bound value used by + :class:`FrequenceFilterUnit`. + :param filter_high_freq: Float, upper bound value used by + :class:`FrequenceFilterUnit`. + :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data('train') + >>> test_data = mz.datasets.toy.load_data('test') + >>> preprocessor = mz.preprocessors.BoWPreprocessor( + ... fixed_length_left=10, + ... fixed_length_right=20, + ... filter_mode='df', + ... filter_low_freq=2, + ... filter_high_freq=1000, + ... remove_stop_words=True + ... ) + >>> preprocessor = preprocessor.fit(train_data, verbose=0) + >>> preprocessor.context['input_shapes'] + [(10,), (20,)] + >>> preprocessor.context['vocab_size'] + 225 + >>> processed_train_data = preprocessor.transform(train_data, + ... verbose=0) + >>> type(processed_train_data) + + >>> test_data_transformed = preprocessor.transform(test_data, + ... verbose=0) + >>> type(test_data_transformed) + + + """ + + def __init__(self, fixed_length_left: int = 30, + fixed_length_right: int = 30, + filter_mode: str = 'df', + filter_low_freq: float = 2, + filter_high_freq: float = float('inf'), + remove_stop_words: bool = False, + right_visual_features_pth: str = None, + fixed_num_images_right: int = 1): + """Initialization.""" + super().__init__() + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._left_fixedlength_unit = units.FixedLength( + self._fixed_length_left, + pad_mode='post' + ) + self._right_fixedlength_unit = units.FixedLength( + self._fixed_length_right, + pad_mode='post' + ) + self._filter_unit = units.FrequencyFilter( + low=filter_low_freq, + high=filter_high_freq, + mode=filter_mode + ) + self._units = self._default_units() + self._images_unit = ImagesUnit(right_visual_features_pth, fixed_num_images_right) + if remove_stop_words: + self._units.append(units.stop_removal.StopRemoval()) + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param data_pack: data_pack to be preprocessed. + :param verbose: Verbosity. + :return: class:`BasicPreprocessor` instance. 
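A usage sketch of this preprocessor's image-aware constructor arguments (the feature file, the `train_data` DataPack and its `images_right` column are hypothetical, not taken from this diff):

import matchzoo as mz

# `train_data` is assumed to be a DataPack whose right table carries an
# `images_right` column of image paths matching the pre-extracted features.
prep = mz.preprocessors.BoWPreprocessor(
    fixed_length_left=10,
    fixed_length_right=20,
    right_visual_features_pth='features/evidence_images.pt',  # a torch.save'd (tensor, paths) pair
    fixed_num_images_right=3,
)
train_processed = prep.fit_transform(train_data, verbose=0)  # the vocabulary is built from the right side only

Note that `ImagesUnit` below loads the feature file via os.path.join("..", right_visual_features_pth), i.e. relative to the parent of the working directory.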
+ """ + data_pack = data_pack.apply_on_text(chain_transform(self._units), + verbose=verbose) + fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, + data_pack, + flatten=False, + mode='right', + verbose=verbose) + data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, + mode='right', verbose=verbose) + self._context['filter_unit'] = fitted_filter_unit + + vocab_unit = build_vocab_unit(data_pack, verbose=verbose, mode="right") # only rely on the right side + self._context['vocab_unit'] = vocab_unit + + vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding + self._context['vocab_size'] = vocab_size + self._context['embedding_input_dim'] = vocab_size + self._context['input_shapes'] = [(self._fixed_length_left,), + (self._fixed_length_right,)] + + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create fixed length representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. + """ + data_pack = data_pack.copy() + data_pack.apply_on_text(chain_transform(self._units), inplace=True, + verbose=verbose) + + data_pack.apply_on_text(self._context['filter_unit'].transform, + mode='right', inplace=True, verbose=verbose) + + def convert_to_bow(input_: List[str]): + """the list of tokens will be converted to """ + vocab_unit = self._context['vocab_unit'] + ans = [0.0] * self._context['vocab_size'] + for token in input_: + index = vocab_unit._state['term_index'][token] + ans[index] = 1.0 + return ans + + data_pack.apply_on_text(convert_to_bow, mode='both', inplace=True, verbose=verbose) + data_pack.right['images_right'] = data_pack.right["images_right"].progress_apply(self._images_unit.transform) + return data_pack + + +class ImagesUnit(Unit): + def __init__(self, visual_features_pth: str, max_len_images: int): + """ + + Parameters + ---------- + visual_features_pth: str the path to pre-extracted features from images + max_len_images: str the maxinum number of images used + """ + self.fat_tensor, paths = torch.load(os.path.join("..", visual_features_pth)) + self.mapper = dict(zip(paths, range(len(paths)))) # indices here are rows of `fat_tensor` + assert len(paths) == self.fat_tensor.size(0) # ensure consistency + self.pad = [0.0] * 4096 # for padding!!! + self.max_len_images = max_len_images + + def transform(self, images: str) -> list: + """ + Process input data from raw terms to list of tokens. + + :param images: a list of images path. + + :return tokens: tokenized tokens as a list. + """ + images = [self.fat_tensor[self.mapper[p]].numpy().tolist() for p in images[:self.max_len_images]] + if len(images) < self.max_len_images: # padding + images.extend([self.pad for _ in range(self.max_len_images - len(images))]) + return list(itertools.chain.from_iterable(images)) \ No newline at end of file diff --git a/matchzoo/preprocessors/build_unit_from_data_pack.py b/matchzoo/preprocessors/build_unit_from_data_pack.py new file mode 100644 index 0000000..906a655 --- /dev/null +++ b/matchzoo/preprocessors/build_unit_from_data_pack.py @@ -0,0 +1,38 @@ +"""Build unit from data pack.""" + +from tqdm import tqdm + +import matchzoo as mz +from .units import StatefulUnit + + +def build_unit_from_data_pack( + unit: StatefulUnit, + data_pack: mz.DataPack, mode: str = 'both', + flatten: bool = True, verbose: int = 1 +) -> StatefulUnit: + """ + Build a :class:`StatefulUnit` from a :class:`DataPack` object. 
+ + :param unit: :class:`StatefulUnit` object to be built. + :param data_pack: The input :class:`DataPack` object. + :param mode: One of 'left', 'right', and 'both', to determine the source + data for building the :class:`VocabularyUnit`. + :param flatten: Flatten the datapack or not. `True` to organize the + :class:`DataPack` text as a list, and `False` to organize + :class:`DataPack` text as a list of list. + :param verbose: Verbosity. + :return: A built :class:`StatefulUnit` object. + + """ + corpus = [] + if flatten: + data_pack.apply_on_text(corpus.extend, mode=mode, verbose=verbose) + else: + data_pack.apply_on_text(corpus.append, mode=mode, verbose=verbose) + if verbose: + description = 'Building ' + unit.__class__.__name__ + \ + ' from a datapack.' + corpus = tqdm(corpus, desc=description) + unit.fit(corpus) + return unit diff --git a/matchzoo/preprocessors/build_vocab_unit.py b/matchzoo/preprocessors/build_vocab_unit.py new file mode 100644 index 0000000..3d9442d --- /dev/null +++ b/matchzoo/preprocessors/build_vocab_unit.py @@ -0,0 +1,30 @@ +from matchzoo.data_pack import DataPack +from .units import Vocabulary +from .build_unit_from_data_pack import build_unit_from_data_pack + + +def build_vocab_unit( + data_pack: DataPack, + mode: str = 'both', + verbose: int = 1 +) -> Vocabulary: + """ + Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. + + The `data_pack` should be preprocessed forehand, and each item in + `text_left` and `text_right` columns of the `data_pack` should be a list + of tokens. + + :param data_pack: The :class:`DataPack` to build vocabulary upon. + :param mode: One of 'left', 'right', and 'both', to determine the source + data for building the :class:`VocabularyUnit`. + :param verbose: Verbosity. + :return: A built vocabulary unit. + + """ + return build_unit_from_data_pack( + unit=Vocabulary(), + data_pack=data_pack, + mode=mode, + flatten=True, verbose=verbose + ) diff --git a/matchzoo/preprocessors/cdssm_preprocessor.py b/matchzoo/preprocessors/cdssm_preprocessor.py new file mode 100644 index 0000000..edeac4e --- /dev/null +++ b/matchzoo/preprocessors/cdssm_preprocessor.py @@ -0,0 +1,125 @@ +"""CDSSM Preprocessor.""" + +from tqdm import tqdm + +from . import units +from .chain_transform import chain_transform +from matchzoo import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .build_vocab_unit import build_vocab_unit + +tqdm.pandas() + + +class CDSSMPreprocessor(BasePreprocessor): + """CDSSM Model preprocessor.""" + + def __init__(self, + fixed_length_left: int = 10, + fixed_length_right: int = 40, + with_word_hashing: bool = True): + """ + CDSSM Model preprocessor. + + The word hashing step could eats up a lot of memory. To workaround + this problem, set `with_word_hashing` to `False` and use a + :class:`matchzoo.DynamicDataGenerator` with a + :class:`matchzoo.preprocessor.units.WordHashing`. + + TODO: doc here. + + :param with_word_hashing: Include a word hashing step if `True`. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data() + >>> test_data = mz.datasets.toy.load_data(stage='test') + >>> cdssm_preprocessor = mz.preprocessors.CDSSMPreprocessor() + >>> train_data_processed = cdssm_preprocessor.fit_transform( + ... train_data, verbose=0 + ... ) + >>> type(train_data_processed) + + >>> test_data_transformed = cdssm_preprocessor.transform(test_data, + ... 
verbose=0) + >>> type(test_data_transformed) + + + """ + super().__init__() + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._left_fixedlength_unit = units.FixedLength( + self._fixed_length_left, + pad_value='0', pad_mode='post' + ) + self._right_fixedlength_unit = units.FixedLength( + self._fixed_length_right, + pad_value='0', pad_mode='post' + ) + self._with_word_hashing = with_word_hashing + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param verbose: Verbosity. + :param data_pack: Data_pack to be preprocessed. + :return: class:`CDSSMPreprocessor` instance. + """ + fit_units = self._default_units() + [units.NgramLetter()] + func = chain_transform(fit_units) + data_pack = data_pack.apply_on_text(func, verbose=verbose) + vocab_unit = build_vocab_unit(data_pack, verbose=verbose) + + self._context['vocab_unit'] = vocab_unit + vocab_size = len(vocab_unit.state['term_index']) + 1 + self._context['input_shapes'] = [ + (self._fixed_length_left, vocab_size), + (self._fixed_length_right, vocab_size) + ] + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create `letter-ngram` representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. + """ + data_pack = data_pack.copy() + func = chain_transform(self._default_units()) + data_pack.apply_on_text(func, inplace=True, verbose=verbose) + data_pack.apply_on_text(self._left_fixedlength_unit.transform, + mode='left', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._right_fixedlength_unit.transform, + mode='right', inplace=True, verbose=verbose) + post_units = [units.NgramLetter(reduce_dim=False)] + if self._with_word_hashing: + term_index = self._context['vocab_unit'].state['term_index'] + post_units.append(units.WordHashing(term_index)) + data_pack.apply_on_text(chain_transform(post_units), + inplace=True, verbose=verbose) + return data_pack + + @classmethod + def _default_units(cls) -> list: + """Prepare needed process units.""" + return [ + units.Tokenize(), + units.Lowercase(), + units.PuncRemoval(), + units.StopRemoval(), + ] + + @property + def with_word_hashing(self): + """`with_word_hashing` getter.""" + return self._with_word_hashing + + @with_word_hashing.setter + def with_word_hashing(self, value): + """`with_word_hashing` setter.""" + self._with_word_hashing = value diff --git a/matchzoo/preprocessors/chain_transform.py b/matchzoo/preprocessors/chain_transform.py new file mode 100644 index 0000000..ceb1e87 --- /dev/null +++ b/matchzoo/preprocessors/chain_transform.py @@ -0,0 +1,24 @@ +"""Wrapper function organizes a number of transform functions.""" +import typing +import functools + +from .units.unit import Unit + + +def chain_transform(units: typing.List[Unit]) -> typing.Callable: + """ + Compose unit transformations into a single function. + + :param units: List of :class:`matchzoo.StatelessUnit`. 
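A quick sketch of the composition (not part of the original docstring; it reuses the `Tokenize` and `Lowercase` units that appear elsewhere in this diff):

from matchzoo.preprocessors import units
from matchzoo.preprocessors.chain_transform import chain_transform

transform = chain_transform([units.Tokenize(), units.Lowercase()])
transform('The Quick Brown Fox')  # -> ['the', 'quick', 'brown', 'fox'] (roughly; Tokenize delegates to nltk)
transform.__name__                # 'chain_transform of Tokenize => Lowercase'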
+ """ + + @functools.wraps(chain_transform) + def wrapper(arg): + """Wrapper function of transformations composition.""" + for unit in units: + arg = unit.transform(arg) + return arg + + unit_names = ' => '.join(unit.__class__.__name__ for unit in units) + wrapper.__name__ += ' of ' + unit_names + return wrapper diff --git a/matchzoo/preprocessors/char_man_elmo_preprocessor.py b/matchzoo/preprocessors/char_man_elmo_preprocessor.py new file mode 100644 index 0000000..ec4fda0 --- /dev/null +++ b/matchzoo/preprocessors/char_man_elmo_preprocessor.py @@ -0,0 +1,269 @@ +"""Basic Preprocessor.""" + +from tqdm import tqdm + +from . import units +from matchzoo import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .build_vocab_unit import build_vocab_unit +from .build_unit_from_data_pack import build_unit_from_data_pack +from .chain_transform import chain_transform +from handlers.output_handler import FileHandler +from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor +from matchzoo.preprocessors.units import Unit +from .units import Vocabulary +tqdm.pandas() + + +class CharManElmoPreprocessor(BasicPreprocessor): + """ + Baisc preprocessor helper for fact-checking with external evidences for my + + :param fixed_length_left: Integer, maximize length of :attr:`left` in the + data_pack. + :param fixed_length_right: Integer, maximize length of :attr:`right` in the + data_pack. + :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can + be 'df', 'cf', and 'idf'. + :param filter_low_freq: Float, lower bound value used by + :class:`FrequenceFilterUnit`. + :param filter_high_freq: Float, upper bound value used by + :class:`FrequenceFilterUnit`. + :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data('train') + >>> test_data = mz.datasets.toy.load_data('test') + >>> preprocessor = mz.preprocessors.BasicPreprocessor( + ... fixed_length_left=10, + ... fixed_length_right=20, + ... filter_mode='df', + ... filter_low_freq=2, + ... filter_high_freq=1000, + ... remove_stop_words=True + ... ) + >>> preprocessor = preprocessor.fit(train_data, verbose=0) + >>> preprocessor.context['input_shapes'] + [(10,), (20,)] + >>> preprocessor.context['vocab_size'] + 225 + >>> processed_train_data = preprocessor.transform(train_data, + ... verbose=0) + >>> type(processed_train_data) + + >>> test_data_transformed = preprocessor.transform(test_data, + ... 
verbose=0) + >>> type(test_data_transformed) + + + """ + + def __init__(self, fixed_length_left: int = 30, + fixed_length_right: int = 30, + fixed_length_left_src: int = 30, + fixed_length_right_src: int = 30, + filter_mode: str = 'df', + filter_low_freq: float = 2, + filter_high_freq: float = float('inf'), + remove_stop_words: bool = False): + """Initialization.""" + super().__init__() + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._fixed_length_left_src = fixed_length_left_src + self._fixed_length_right_src = fixed_length_right_src + self._left_fixedlength_unit = units.FixedLength( + self._fixed_length_left, + pad_mode='post' + ) + self._right_fixedlength_unit = units.FixedLength( + self._fixed_length_right, + pad_mode='post' + ) + # for padding character level of left_source and right_source + self._left_char_src_fixedlength_unit = units.FixedLength(self._fixed_length_left_src, pad_mode='post') + self._right_char_src_fixedlength_unit = units.FixedLength(self._fixed_length_right_src, pad_mode='post') + + self.char_unit = units.ngram_letter.NgramLetter(ngram=1, reduce_dim=True) + self._units = [SplitTokenize()] + if remove_stop_words: + self._units.append(units.stop_removal.StopRemoval()) + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param data_pack: data_pack to be preprocessed. + :param verbose: Verbosity. + :return: class:`BasicPreprocessor` instance. + """ + data_pack = data_pack.apply_on_text(chain_transform(self._units), verbose=verbose) + # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, + # data_pack, + # flatten=False, + # mode='right', + # verbose=verbose) + # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, + # mode='right', verbose=verbose) + # self._context['filter_unit'] = fitted_filter_unit + + vocab_unit = build_vocab_unit(data_pack, verbose=verbose) + self._context['vocab_unit'] = vocab_unit + + vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding + self._context['vocab_size'] = vocab_size + self._context['embedding_input_dim'] = vocab_size + self._context['input_shapes'] = [(self._fixed_length_left,), + (self._fixed_length_right,)] + + claim_source_unit = build_entity_unit(column="claim_source", data_pack=data_pack, mode="left") + article_source_unit = build_entity_unit(column="evidence_source", data_pack=data_pack, mode="right") + self._context['claim_source_unit'] = claim_source_unit + self._context['article_source_unit'] = article_source_unit + + char_source_unit = build_ngram_unit(left_column="claim_source", right_column="evidence_source", + data_pack=data_pack, mode="both") + self._context['char_source_unit'] = char_source_unit + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create fixed length representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. 
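To make the source handling concrete, here is a sketch of what the fitted entity and character vocabularies do to one source string (the example values are hypothetical):

from matchzoo.preprocessors.units import Vocabulary

entity_vocab = Vocabulary()
entity_vocab.fit(['cnn.com', 'bbc.co.uk'])           # fit on raw source strings, as build_entity_unit does
entity_vocab.transform(['cnn.com'])                  # -> a one-element list holding that source's term index

char_vocab = Vocabulary()
char_vocab.fit(list('cnn.com') + list('bbc.co.uk'))  # one term per character, as build_ngram_unit does
char_vocab.transform(list('cnn'))                    # -> one index per character; later padded/truncated to
                                                     #    fixed_length_left_src / fixed_length_right_src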
+ """ + data_pack = data_pack.copy() + + def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) + + def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) + + def map_src2char(entity: str): + return self._context['char_source_unit'].transform(list(entity)) + + data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) + data_pack.left["char_claim_source"] = data_pack.left["char_claim_source"].progress_apply(map_src2char) + data_pack.left["char_claim_source"] = data_pack.left["char_claim_source"].progress_apply( + self._left_char_src_fixedlength_unit.transform) + + data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) + data_pack.right["char_evidence_source"] = data_pack.right["char_evidence_source"].progress_apply(map_src2char) + data_pack.right["char_evidence_source"] = data_pack.right["char_evidence_source"].progress_apply( + self._right_char_src_fixedlength_unit.transform) + + data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) + + # data_pack.apply_on_text(self._context['filter_unit'].transform, + # mode='right', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._context['vocab_unit'].transform, + mode='both', inplace=True, verbose=verbose) + data_pack.append_text_length(inplace=True, verbose=verbose) + data_pack.apply_on_text(self._left_fixedlength_unit.transform, + mode='left', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._right_fixedlength_unit.transform, + mode='right', inplace=True, verbose=verbose) + + max_len_left = self._fixed_length_left + max_len_right = self._fixed_length_right + + data_pack.left['length_left'] = \ + data_pack.left['length_left'].apply( + lambda val: min(val, max_len_left)) + + data_pack.right['length_right'] = \ + data_pack.right['length_right'].apply( + lambda val: min(val, max_len_right)) + + return data_pack + + +class SplitTokenize(Unit): + """Process unit for text tokenization.""" + + def transform(self, input_: str) -> list: + """ + Process input data from raw terms to list of tokens. + + :param input_: raw textual input. + + :return tokens: tokenized tokens as a list. + """ + return input_.split() + + +def build_entity_unit( + column: str, + data_pack: DataPack, + mode: str = 'both', + verbose: int = 1 +) -> Vocabulary: + """ + Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. + + The `data_pack` should be preprocessed forehand, and each item in + `text_left` and `text_right` columns of the `data_pack` should be a list + of tokens. + + :param column: `str` the selected column to build units + :param data_pack: The :class:`DataPack` to build vocabulary upon. + :param mode: One of 'left', 'right', and 'both', to determine the source + data for building the :class:`VocabularyUnit`. + :param verbose: Verbosity. + :return: A built vocabulary unit. + + """ + unit = Vocabulary() + corpus = [] + def func(entity: str): corpus.append(entity.strip()) + assert mode in ["left", "right"] + if mode == "left": + data_pack.left[column].progress_apply(func) + elif mode == "right": + data_pack.right[column].progress_apply(func) + else: + raise NotImplemented("Not coded for both columns") + + if verbose: + description = 'Building Entities ' + unit.__class__.__name__ + ' from a datapack.' 
+ corpus = tqdm(corpus, desc=description) + unit.fit(corpus) + return unit + + +def build_ngram_unit(left_column: str, right_column: str, data_pack: DataPack, mode: str = 'both', verbose: int = 1): + """ + Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. + + The `data_pack` should be preprocessed forehand, and each item in + `text_left` and `text_right` columns of the `data_pack` should be a list + of tokens. + + :param column: `str` the selected column to build units + :param data_pack: The :class:`DataPack` to build vocabulary upon. + :param mode: One of 'left', 'right', and 'both', to determine the source + data for building the :class:`VocabularyUnit`. + :param verbose: Verbosity. + :return: A built vocabulary unit. + + """ + unit = Vocabulary() + corpus = [] + + def func(entity: str): + assert type(entity) == str + entity = entity.strip() + for c in entity: corpus.append(c) + + assert mode == "both" + data_pack.left[left_column].progress_apply(func) + data_pack.right[right_column].progress_apply(func) + + if verbose: + description = 'Building Characters ' + unit.__class__.__name__ + ' from a datapack.' + corpus = tqdm(corpus, desc=description) + unit.fit(corpus) + return unit diff --git a/matchzoo/preprocessors/char_man_preprocessor.py b/matchzoo/preprocessors/char_man_preprocessor.py new file mode 100644 index 0000000..779fd69 --- /dev/null +++ b/matchzoo/preprocessors/char_man_preprocessor.py @@ -0,0 +1,257 @@ +"""Basic Preprocessor.""" + +from tqdm import tqdm + +from . import units +from matchzoo import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .build_vocab_unit import build_vocab_unit +from .build_unit_from_data_pack import build_unit_from_data_pack +from .chain_transform import chain_transform +from handlers.output_handler import FileHandler +from .units import Vocabulary +from .units import StatefulUnit +tqdm.pandas() + + +class CharManPreprocessor(BasePreprocessor): + """ + Preprocessor for model Character Multiperspective Attention Network (CharMAN) + Char-MAN preprocessor helper which has source embeddings. Add char embeddings for characters + + :param fixed_length_left: Integer, maximize length of :attr:`left` in the + data_pack. + :param fixed_length_right: Integer, maximize length of :attr:`right` in the + data_pack. + :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can + be 'df', 'cf', and 'idf'. + :param filter_low_freq: Float, lower bound value used by + :class:`FrequenceFilterUnit`. + :param filter_high_freq: Float, upper bound value used by + :class:`FrequenceFilterUnit`. + :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data('train') + >>> test_data = mz.datasets.toy.load_data('test') + >>> preprocessor = mz.preprocessors.BasicPreprocessor( + ... fixed_length_left=10, + ... fixed_length_right=20, + ... filter_mode='df', + ... filter_low_freq=2, + ... filter_high_freq=1000, + ... remove_stop_words=True + ... ) + >>> preprocessor = preprocessor.fit(train_data, verbose=0) + >>> preprocessor.context['input_shapes'] + [(10,), (20,)] + >>> preprocessor.context['vocab_size'] + 225 + >>> processed_train_data = preprocessor.transform(train_data, + ... verbose=0) + >>> type(processed_train_data) + + >>> test_data_transformed = preprocessor.transform(test_data, + ... 
verbose=0) + >>> type(test_data_transformed) + + + """ + + def __init__(self, fixed_length_left: int = 30, + fixed_length_right: int = 30, + fixed_length_left_src: int = 30, + fixed_length_right_src: int = 30, + filter_mode: str = 'df', + filter_low_freq: float = 2, + filter_high_freq: float = float('inf'), + remove_stop_words: bool = False): + """Initialization.""" + super().__init__() + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._fixed_length_left_src = fixed_length_left_src + self._fixed_length_right_src = fixed_length_right_src + self._left_fixedlength_unit = units.FixedLength( + self._fixed_length_left, + pad_mode='post' + ) + self._right_fixedlength_unit = units.FixedLength( + self._fixed_length_right, + pad_mode='post' + ) + # for padding character level of left_source and right_source + self._left_char_src_fixedlength_unit = units.FixedLength(self._fixed_length_left_src, pad_mode='post') + self._right_char_src_fixedlength_unit = units.FixedLength(self._fixed_length_right_src, pad_mode='post') + + self.char_unit = units.ngram_letter.NgramLetter(ngram=1, reduce_dim=True) + self._units = self._default_units() + if remove_stop_words: + self._units.append(units.stop_removal.StopRemoval()) + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param data_pack: data_pack to be preprocessed. + :param verbose: Verbosity. + :return: class:`BasicPreprocessor` instance. + """ + data_pack = data_pack.apply_on_text(chain_transform(self._units), + verbose=verbose) + # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, + # data_pack, + # flatten=False, + # mode='right', + # verbose=verbose) + # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, + # mode='right', verbose=verbose) + # self._context['filter_unit'] = fitted_filter_unit + + vocab_unit = build_vocab_unit(data_pack, verbose=verbose) + self._context['vocab_unit'] = vocab_unit + + vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding + self._context['vocab_size'] = vocab_size + self._context['embedding_input_dim'] = vocab_size + self._context['input_shapes'] = [(self._fixed_length_left,), + (self._fixed_length_right,)] + + claim_source_unit = build_entity_unit(column = "claim_source", data_pack = data_pack, mode = "left") + article_source_unit = build_entity_unit(column = "evidence_source", data_pack = data_pack, mode = "right") + self._context['claim_source_unit'] = claim_source_unit + self._context['article_source_unit'] = article_source_unit + + char_source_unit = build_ngram_unit(left_column = "claim_source", right_column="evidence_source", + data_pack = data_pack, mode = "both") + self._context['char_source_unit'] = char_source_unit + + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create fixed length representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. 
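The character-level source columns are padded or truncated to `fixed_length_left_src` / `fixed_length_right_src` by the `FixedLength` units created in `__init__` above; a minimal sketch of that unit, assuming MatchZoo's standard `FixedLength` behaviour (the values are illustrative):

from matchzoo.preprocessors import units

pad_chars = units.FixedLength(10, pad_mode='post')
pad_chars.transform([4, 9, 9, 2])  # -> ten entries: the four character ids followed by six pad values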
+ """ + data_pack = data_pack.copy() + + def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) + + def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) + + def map_src2char(entity: str): + return self._context['char_source_unit'].transform(list(entity)) + + data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) + data_pack.left["char_claim_source"] = data_pack.left["char_claim_source"].progress_apply(map_src2char) + data_pack.left["char_claim_source"] = data_pack.left["char_claim_source"].progress_apply( + self._left_char_src_fixedlength_unit.transform) + + data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) + data_pack.right["char_evidence_source"] = data_pack.right["char_evidence_source"].progress_apply(map_src2char) + data_pack.right["char_evidence_source"] = data_pack.right["char_evidence_source"].progress_apply( + self._right_char_src_fixedlength_unit.transform) + + data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) + + # data_pack.apply_on_text(self._context['filter_unit'].transform, + # mode='right', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._context['vocab_unit'].transform, + mode='both', inplace=True, verbose=verbose) + data_pack.append_text_length(inplace=True, verbose=verbose) + data_pack.apply_on_text(self._left_fixedlength_unit.transform, + mode='left', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._right_fixedlength_unit.transform, + mode='right', inplace=True, verbose=verbose) + + max_len_left = self._fixed_length_left + max_len_right = self._fixed_length_right + + data_pack.left['length_left'] = \ + data_pack.left['length_left'].apply( + lambda val: min(val, max_len_left)) + + data_pack.right['length_right'] = \ + data_pack.right['length_right'].apply( + lambda val: min(val, max_len_right)) + + return data_pack + + +def build_entity_unit( + column: str, + data_pack: DataPack, + mode: str = 'both', + verbose: int = 1 +) -> Vocabulary: + """ + Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. + + The `data_pack` should be preprocessed forehand, and each item in + `text_left` and `text_right` columns of the `data_pack` should be a list + of tokens. + + :param column: `str` the selected column to build units + :param data_pack: The :class:`DataPack` to build vocabulary upon. + :param mode: One of 'left', 'right', and 'both', to determine the source + data for building the :class:`VocabularyUnit`. + :param verbose: Verbosity. + :return: A built vocabulary unit. + + """ + unit = Vocabulary() + corpus = [] + def func(entity: str): corpus.append(entity.strip()) + assert mode in ["left", "right"] + if mode == "left": + data_pack.left[column].progress_apply(func) + elif mode == "right": + data_pack.right[column].progress_apply(func) + else: + raise NotImplemented("Not coded for both columns") + + if verbose: + description = 'Building Entities ' + unit.__class__.__name__ + ' from a datapack.' + corpus = tqdm(corpus, desc=description) + unit.fit(corpus) + return unit + + +def build_ngram_unit(left_column: str, right_column: str, data_pack: DataPack, mode: str = 'both', verbose: int = 1): + """ + Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. + + The `data_pack` should be preprocessed forehand, and each item in + `text_left` and `text_right` columns of the `data_pack` should be a list + of tokens. 
+ + :param column: `str` the selected column to build units + :param data_pack: The :class:`DataPack` to build vocabulary upon. + :param mode: One of 'left', 'right', and 'both', to determine the source + data for building the :class:`VocabularyUnit`. + :param verbose: Verbosity. + :return: A built vocabulary unit. + + """ + unit = Vocabulary() + corpus = [] + + def func(entity: str): + assert type(entity) == str + entity = entity.strip() + for c in entity: corpus.append(c) + + assert mode == "both" + data_pack.left[left_column].progress_apply(func) + data_pack.right[right_column].progress_apply(func) + + if verbose: + description = 'Building Characters ' + unit.__class__.__name__ + ' from a datapack.' + corpus = tqdm(corpus, desc=description) + unit.fit(corpus) + return unit diff --git a/matchzoo/preprocessors/char_ngram_preprocessor.py b/matchzoo/preprocessors/char_ngram_preprocessor.py new file mode 100644 index 0000000..22087fc --- /dev/null +++ b/matchzoo/preprocessors/char_ngram_preprocessor.py @@ -0,0 +1,95 @@ +"""Basic Preprocessor.""" + +from tqdm import tqdm + +from . import units +from matchzoo import DataPack +from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor +from .build_vocab_unit import build_vocab_unit +from .build_unit_from_data_pack import build_unit_from_data_pack +from .chain_transform import chain_transform +from handlers.output_handler import FileHandler + +tqdm.pandas() + + +class CharNGramPreprocessor(BasicPreprocessor): + """ + Baisc preprocessor helper. + + :param fixed_length_left: Integer, maximize length of :attr:`left` in the + data_pack. + :param fixed_length_right: Integer, maximize length of :attr:`right` in the + data_pack. + :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can + be 'df', 'cf', and 'idf'. + :param filter_low_freq: Float, lower bound value used by + :class:`FrequenceFilterUnit`. + :param filter_high_freq: Float, upper bound value used by + :class:`FrequenceFilterUnit`. + :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data('train') + >>> test_data = mz.datasets.toy.load_data('test') + >>> preprocessor = mz.preprocessors.BasicPreprocessor( + ... fixed_length_left=10, + ... fixed_length_right=20, + ... filter_mode='df', + ... filter_low_freq=2, + ... filter_high_freq=1000, + ... remove_stop_words=True + ... ) + >>> preprocessor = preprocessor.fit(train_data, verbose=0) + >>> preprocessor.context['input_shapes'] + [(10,), (20,)] + >>> preprocessor.context['vocab_size'] + 225 + >>> processed_train_data = preprocessor.transform(train_data, + ... verbose=0) + >>> type(processed_train_data) + + >>> test_data_transformed = preprocessor.transform(test_data, + ... 
verbose=0) + >>> type(test_data_transformed) + + + """ + + def __init__(self, fixed_length_left: int = 30, + fixed_length_right: int = 30, + filter_mode: str = 'df', + filter_low_freq: float = 2, + filter_high_freq: float = float('inf'), + remove_stop_words: bool = False): + """Initialization.""" + # super().__init__() + super(BasicPreprocessor, self).__init__() + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._left_fixedlength_unit = units.FixedLength( + self._fixed_length_left, + pad_mode='post' + ) + self._right_fixedlength_unit = units.FixedLength( + self._fixed_length_right, + pad_mode='post' + ) + self._filter_unit = units.FrequencyFilter( + low=filter_low_freq, + high=filter_high_freq, + mode=filter_mode + ) + self._units = self._default_units() + # if remove_stop_words: + # self._units.append(units.stop_removal.StopRemoval()) + + def _default_units(cls) -> list: + return [ + units.Tokenize(), + units.Lowercase(), + units.PuncRemoval(), + units.StopRemoval(), + units.NgramLetter(), + ] diff --git a/matchzoo/preprocessors/declare_preprocessor.py b/matchzoo/preprocessors/declare_preprocessor.py new file mode 100644 index 0000000..3bca49f --- /dev/null +++ b/matchzoo/preprocessors/declare_preprocessor.py @@ -0,0 +1,202 @@ +"""Basic Preprocessor.""" + +from tqdm import tqdm + +from . import units +from matchzoo import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .build_vocab_unit import build_vocab_unit +from .build_unit_from_data_pack import build_unit_from_data_pack +from .chain_transform import chain_transform +from handlers.output_handler import FileHandler +from .units import Vocabulary +from .units import StatefulUnit +tqdm.pandas() + + +class DeClarePreprocessor(BasePreprocessor): + """ + Declare preprocessor helper which has source embeddings. + + :param fixed_length_left: Integer, maximize length of :attr:`left` in the + data_pack. + :param fixed_length_right: Integer, maximize length of :attr:`right` in the + data_pack. + :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can + be 'df', 'cf', and 'idf'. + :param filter_low_freq: Float, lower bound value used by + :class:`FrequenceFilterUnit`. + :param filter_high_freq: Float, upper bound value used by + :class:`FrequenceFilterUnit`. + :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data('train') + >>> test_data = mz.datasets.toy.load_data('test') + >>> preprocessor = mz.preprocessors.BasicPreprocessor( + ... fixed_length_left=10, + ... fixed_length_right=20, + ... filter_mode='df', + ... filter_low_freq=2, + ... filter_high_freq=1000, + ... remove_stop_words=True + ... ) + >>> preprocessor = preprocessor.fit(train_data, verbose=0) + >>> preprocessor.context['input_shapes'] + [(10,), (20,)] + >>> preprocessor.context['vocab_size'] + 225 + >>> processed_train_data = preprocessor.transform(train_data, + ... verbose=0) + >>> type(processed_train_data) + + >>> test_data_transformed = preprocessor.transform(test_data, + ... 
verbose=0) + >>> type(test_data_transformed) + + + """ + + def __init__(self, fixed_length_left: int = 30, + fixed_length_right: int = 30, + filter_mode: str = 'df', + filter_low_freq: float = 2, + filter_high_freq: float = float('inf'), + remove_stop_words: bool = False): + """Initialization.""" + super().__init__() + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._left_fixedlength_unit = units.FixedLength( + self._fixed_length_left, + pad_mode='post' + ) + self._right_fixedlength_unit = units.FixedLength( + self._fixed_length_right, + pad_mode='post' + ) + # self._filter_unit = units.FrequencyFilter( + # low=filter_low_freq, + # high=filter_high_freq, + # mode=filter_mode + # ) + self._units = self._default_units() + if remove_stop_words: + self._units.append(units.stop_removal.StopRemoval()) + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param data_pack: data_pack to be preprocessed. + :param verbose: Verbosity. + :return: class:`BasicPreprocessor` instance. + """ + data_pack = data_pack.apply_on_text(chain_transform(self._units), + verbose=verbose) + # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, + # data_pack, + # flatten=False, + # mode='right', + # verbose=verbose) + # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, + # mode='right', verbose=verbose) + # self._context['filter_unit'] = fitted_filter_unit + + vocab_unit = build_vocab_unit(data_pack, verbose=verbose) + self._context['vocab_unit'] = vocab_unit + + vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding + self._context['vocab_size'] = vocab_size + self._context['embedding_input_dim'] = vocab_size + self._context['input_shapes'] = [(self._fixed_length_left,), + (self._fixed_length_right,)] + + claim_source_unit = build_entity_unit(column = "claim_source", data_pack = data_pack, mode = "left") + article_source_unit = build_entity_unit(column = "evidence_source", data_pack = data_pack, mode = "right") + self._context['claim_source_unit'] = claim_source_unit + self._context['article_source_unit'] = article_source_unit + + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create fixed length representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. 
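The claim and evidence source columns are mapped through the entity vocabularies fitted above; a small usage sketch of the `build_entity_unit` helper defined at the bottom of this file (the `train_pack` DataPack and the column value are hypothetical):

from matchzoo.preprocessors.declare_preprocessor import build_entity_unit

# `train_pack` is assumed to have a 'claim_source' column in its left table
claim_source_unit = build_entity_unit(column='claim_source', data_pack=train_pack, mode='left')
claim_source_unit.transform(['cnn.com'])  # -> [term index of 'cnn.com' in the fitted vocabulary]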
+ """ + data_pack = data_pack.copy() + + def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) + + def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) + + data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) + data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) + data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) + + # data_pack.apply_on_text(self._context['filter_unit'].transform, + # mode='right', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._context['vocab_unit'].transform, + mode='both', inplace=True, verbose=verbose) + data_pack.append_text_length(inplace=True, verbose=verbose) + data_pack.apply_on_text(self._left_fixedlength_unit.transform, + mode='left', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._right_fixedlength_unit.transform, + mode='right', inplace=True, verbose=verbose) + + max_len_left = self._fixed_length_left + max_len_right = self._fixed_length_right + + data_pack.left['length_left'] = \ + data_pack.left['length_left'].apply( + lambda val: min(val, max_len_left)) + + data_pack.right['length_right'] = \ + data_pack.right['length_right'].apply( + lambda val: min(val, max_len_right)) + + return data_pack + + +def build_entity_unit( + column: str, + data_pack: DataPack, + mode: str = 'both', + verbose: int = 1 +) -> Vocabulary: + """ + Build a :class:`preprocessor.units.Vocabulary` given `data_pack`. + + The `data_pack` should be preprocessed forehand, and each item in + `text_left` and `text_right` columns of the `data_pack` should be a list + of tokens. + + :param column: `str` the selected column to build units + :param data_pack: The :class:`DataPack` to build vocabulary upon. + :param mode: One of 'left', 'right', and 'both', to determine the source + data for building the :class:`VocabularyUnit`. + :param verbose: Verbosity. + :return: A built vocabulary unit. + + """ + unit = Vocabulary() + corpus = [] + def func(entity: str): corpus.append(entity.strip()) + assert mode in ["left", "right"] + if mode == "left": + data_pack.left[column].progress_apply(func) + elif mode == "right": + data_pack.right[column].progress_apply(func) + else: + raise NotImplemented("Not coded for both columns") + + if verbose: + description = 'Building ' + unit.__class__.__name__ + ' from a datapack.' + corpus = tqdm(corpus, desc=description) + unit.fit(corpus) + return unit diff --git a/matchzoo/preprocessors/dssm_preprocessor.py b/matchzoo/preprocessors/dssm_preprocessor.py new file mode 100644 index 0000000..fb2ebab --- /dev/null +++ b/matchzoo/preprocessors/dssm_preprocessor.py @@ -0,0 +1,124 @@ +"""DSSM Preprocessor.""" + +from tqdm import tqdm + +from matchzoo.data_pack import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .chain_transform import chain_transform +from .build_vocab_unit import build_vocab_unit +from . import units + +tqdm.pandas() + + +class DSSMPreprocessor(BasePreprocessor): + """DSSM Model preprocessor.""" + + def __init__(self, with_word_hashing: bool = True): + """ + DSSM Model preprocessor. + + The word hashing step could eats up a lot of memory. To workaround + this problem, set `with_word_hashing` to `False` and use a + :class:`matchzoo.DynamicDataGenerator` with a + :class:`matchzoo.preprocessor.units.WordHashing`. 
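For instance, the deferred hashing could look like this (a sketch, not part of the original docstring; `train_data` is a hypothetical DataPack):

import matchzoo as mz
from matchzoo.preprocessors import units

prep = mz.preprocessors.DSSMPreprocessor(with_word_hashing=False)
train_processed = prep.fit_transform(train_data, verbose=0)
term_index = prep.context['vocab_unit'].state['term_index']
hashing = units.WordHashing(term_index)  # apply later, e.g. batch by batch inside a data generator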
+ + :param with_word_hashing: Include a word hashing step if `True`. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data() + >>> test_data = mz.datasets.toy.load_data(stage='test') + >>> dssm_preprocessor = mz.preprocessors.DSSMPreprocessor() + >>> train_data_processed = dssm_preprocessor.fit_transform( + ... train_data, verbose=0 + ... ) + >>> type(train_data_processed) + + >>> test_data_transformed = dssm_preprocessor.transform(test_data, + ... verbose=0) + >>> type(test_data_transformed) + + + """ + super().__init__() + self._with_word_hashing = with_word_hashing + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param verbose: Verbosity. + :param data_pack: data_pack to be preprocessed. + :return: class:`DSSMPreprocessor` instance. + """ + DEBUG = False + if DEBUG: + func2 = chain_transform(self.old_units()) + data_packx = data_pack.apply_on_text(func2, verbose = verbose) + # transform text, after tokenizing, remove stop words and blah blah + vocab_unit2 = build_vocab_unit(data_packx, verbose = verbose) + vocab_size_without_using_letter_ngram = len(vocab_unit2.state['term_index']) + 1 + print("Vocab size without using letter_ngram", vocab_size_without_using_letter_ngram) + + func = chain_transform(self._default_units()) + data_pack = data_pack.apply_on_text(func, verbose=verbose) + vocab_unit = build_vocab_unit(data_pack, verbose=verbose) + + self._context['vocab_unit'] = vocab_unit + vocab_size = len(vocab_unit.state['term_index']) + 1 + if DEBUG: + print("Vocab size using letter_ngram", vocab_size) + self._context['vocab_size'] = vocab_size + self._context['embedding_input_dim'] = vocab_size + self._context['input_shapes'] = [(vocab_size,), (vocab_size,)] + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create `tri-letter` representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. + """ + data_pack = data_pack.copy() + units_ = self._default_units() + assert len(units_) == 5, "Must have 5 pre-processing step in DSSM " + if self._with_word_hashing: + term_index = self._context['vocab_unit'].state['term_index'] + units_.append(units.WordHashing(term_index)) + func = chain_transform(units_) + data_pack.apply_on_text(func, inplace=True, verbose=verbose) + return data_pack + + @classmethod + def _default_units(cls) -> list: + """Prepare needed process units.""" + return [ + units.Tokenize(), + units.Lowercase(), + units.PuncRemoval(), + units.StopRemoval(), + units.NgramLetter(), + ] + + @property + def with_word_hashing(self): + """`with_word_hashing` getter.""" + return self._with_word_hashing + + @with_word_hashing.setter + def with_word_hashing(self, value): + """`with_word_hashing` setter.""" + self._with_word_hashing = value + + def old_units(self) -> list: + """Prepare needed process units.""" + return [ + units.Tokenize(), + units.Lowercase(), + units.PuncRemoval(), + units.StopRemoval() + ] \ No newline at end of file diff --git a/matchzoo/preprocessors/elmo_basic_preprocessor.py b/matchzoo/preprocessors/elmo_basic_preprocessor.py new file mode 100644 index 0000000..aae3f01 --- /dev/null +++ b/matchzoo/preprocessors/elmo_basic_preprocessor.py @@ -0,0 +1,168 @@ +"""Basic Preprocessor.""" + +from tqdm import tqdm + +from . 
import units +from matchzoo import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .build_vocab_unit import build_vocab_unit +from .build_unit_from_data_pack import build_unit_from_data_pack +from .chain_transform import chain_transform +from handlers.output_handler import FileHandler +from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor +from matchzoo.preprocessors.units import Unit +tqdm.pandas() + + +class ElmoPreprocessor(BasicPreprocessor): + """ + Baisc preprocessor helper. + + :param fixed_length_left: Integer, maximize length of :attr:`left` in the + data_pack. + :param fixed_length_right: Integer, maximize length of :attr:`right` in the + data_pack. + :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can + be 'df', 'cf', and 'idf'. + :param filter_low_freq: Float, lower bound value used by + :class:`FrequenceFilterUnit`. + :param filter_high_freq: Float, upper bound value used by + :class:`FrequenceFilterUnit`. + :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data('train') + >>> test_data = mz.datasets.toy.load_data('test') + >>> preprocessor = mz.preprocessors.BasicPreprocessor( + ... fixed_length_left=10, + ... fixed_length_right=20, + ... filter_mode='df', + ... filter_low_freq=2, + ... filter_high_freq=1000, + ... remove_stop_words=True + ... ) + >>> preprocessor = preprocessor.fit(train_data, verbose=0) + >>> preprocessor.context['input_shapes'] + [(10,), (20,)] + >>> preprocessor.context['vocab_size'] + 225 + >>> processed_train_data = preprocessor.transform(train_data, + ... verbose=0) + >>> type(processed_train_data) + + >>> test_data_transformed = preprocessor.transform(test_data, + ... verbose=0) + >>> type(test_data_transformed) + + + """ + + def __init__(self, fixed_length_left: int = 30, + fixed_length_right: int = 30, + filter_mode: str = 'df', + filter_low_freq: float = 2, + filter_high_freq: float = float('inf'), + remove_stop_words: bool = False): + """Initialization.""" + super().__init__() + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._left_fixedlength_unit = units.FixedLength( + self._fixed_length_left, + pad_mode='post' + ) + self._right_fixedlength_unit = units.FixedLength( + self._fixed_length_right, + pad_mode='post' + ) + # self._filter_unit = units.FrequencyFilter( + # low=filter_low_freq, + # high=filter_high_freq, + # mode=filter_mode + # ) + self._units = [SplitTokenize()] + # self._char_left = units.AllenCharUnit(self._fixed_length_left) + # self._char_right = units.AllenCharUnit(self._fixed_length_right) + if remove_stop_words: + self._units.append(units.stop_removal.StopRemoval()) + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param data_pack: data_pack to be preprocessed. + :param verbose: Verbosity. + :return: class:`BasicPreprocessor` instance. 
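Unlike `BasicPreprocessor`, this class replaces the default unit chain with `SplitTokenize` (defined at the bottom of this file), so it assumes the text was tokenized upstream and only needs whitespace splitting. A one-line sketch:

from matchzoo.preprocessors.elmo_basic_preprocessor import SplitTokenize

SplitTokenize().transform('barack obama visited hanoi')  # -> ['barack', 'obama', 'visited', 'hanoi']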
+ """ + data_pack = data_pack.apply_on_text(chain_transform(self._units), verbose=verbose) + # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, + # data_pack, + # flatten=False, + # mode='right', + # verbose=verbose) + # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, + # mode='right', verbose=verbose) + # self._context['filter_unit'] = fitted_filter_unit + + vocab_unit = build_vocab_unit(data_pack, verbose=verbose) + self._context['vocab_unit'] = vocab_unit + + vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding + self._context['vocab_size'] = vocab_size + self._context['embedding_input_dim'] = vocab_size + self._context['input_shapes'] = [(self._fixed_length_left,), + (self._fixed_length_right,)] + + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create fixed length representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. + """ + data_pack = data_pack.copy() + data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) + + # data_pack.apply_on_text(self._context['filter_unit'].transform, + # mode='right', inplace=True, verbose=verbose) + # data_pack.apply_on_text(self._char_left.transform, mode='left', inplace=True, verbose=verbose, rename="char_left") + # data_pack.apply_on_text(self._char_right.transform, mode='right', inplace=True, verbose=verbose, rename="char_right") + + data_pack.apply_on_text(self._context['vocab_unit'].transform, + mode='both', inplace=True, verbose=verbose) + data_pack.append_text_length(inplace=True, verbose=verbose) + data_pack.apply_on_text(self._left_fixedlength_unit.transform, + mode='left', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._right_fixedlength_unit.transform, + mode='right', inplace=True, verbose=verbose) + + max_len_left = self._fixed_length_left + max_len_right = self._fixed_length_right + + data_pack.left['length_left'] = \ + data_pack.left['length_left'].apply( + lambda val: min(val, max_len_left)) + + data_pack.right['length_right'] = \ + data_pack.right['length_right'].apply( + lambda val: min(val, max_len_right)) + return data_pack + + +class SplitTokenize(Unit): + """Process unit for text tokenization.""" + + def transform(self, input_: str) -> list: + """ + Process input data from raw terms to list of tokens. + + :param input_: raw textual input. + + :return tokens: tokenized tokens as a list. + """ + return input_.split() diff --git a/matchzoo/preprocessors/fact_checking_elmo_preprocessor.py b/matchzoo/preprocessors/fact_checking_elmo_preprocessor.py new file mode 100644 index 0000000..bf4b6e5 --- /dev/null +++ b/matchzoo/preprocessors/fact_checking_elmo_preprocessor.py @@ -0,0 +1,173 @@ +"""Basic Preprocessor.""" + +from tqdm import tqdm + +from . 
import units +from matchzoo import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .build_vocab_unit import build_vocab_unit +from .build_unit_from_data_pack import build_unit_from_data_pack +from .chain_transform import chain_transform +from handlers.output_handler import FileHandler +from matchzoo.preprocessors.basic_preprocessor import BasicPreprocessor +from matchzoo.preprocessors.units import Unit +from matchzoo.preprocessors.declare_preprocessor import build_entity_unit +tqdm.pandas() + + +class FactCheckingElmoPreprocessor(BasicPreprocessor): + """ + Baisc preprocessor helper for fact-checking with external evidences for my + + :param fixed_length_left: Integer, maximize length of :attr:`left` in the + data_pack. + :param fixed_length_right: Integer, maximize length of :attr:`right` in the + data_pack. + :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can + be 'df', 'cf', and 'idf'. + :param filter_low_freq: Float, lower bound value used by + :class:`FrequenceFilterUnit`. + :param filter_high_freq: Float, upper bound value used by + :class:`FrequenceFilterUnit`. + :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data('train') + >>> test_data = mz.datasets.toy.load_data('test') + >>> preprocessor = mz.preprocessors.BasicPreprocessor( + ... fixed_length_left=10, + ... fixed_length_right=20, + ... filter_mode='df', + ... filter_low_freq=2, + ... filter_high_freq=1000, + ... remove_stop_words=True + ... ) + >>> preprocessor = preprocessor.fit(train_data, verbose=0) + >>> preprocessor.context['input_shapes'] + [(10,), (20,)] + >>> preprocessor.context['vocab_size'] + 225 + >>> processed_train_data = preprocessor.transform(train_data, + ... verbose=0) + >>> type(processed_train_data) + + >>> test_data_transformed = preprocessor.transform(test_data, + ... verbose=0) + >>> type(test_data_transformed) + + + """ + + def __init__(self, fixed_length_left: int = 30, + fixed_length_right: int = 30, + filter_mode: str = 'df', + filter_low_freq: float = 2, + filter_high_freq: float = float('inf'), + remove_stop_words: bool = False): + """Initialization.""" + super().__init__() + self._fixed_length_left = fixed_length_left + self._fixed_length_right = fixed_length_right + self._left_fixedlength_unit = units.FixedLength( + self._fixed_length_left, + pad_mode='post' + ) + self._right_fixedlength_unit = units.FixedLength( + self._fixed_length_right, + pad_mode='post' + ) + + self._units = [SplitTokenize()] + if remove_stop_words: + self._units.append(units.stop_removal.StopRemoval()) + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param data_pack: data_pack to be preprocessed. + :param verbose: Verbosity. + :return: class:`BasicPreprocessor` instance. 
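After fitting, the context holds both the shared token vocabulary and the two source vocabularies built below; a sketch of the resulting keys (the `train_pack` DataPack is hypothetical and must provide `claim_source` / `evidence_source` columns):

import matchzoo as mz

prep = mz.preprocessors.FactCheckingElmoPreprocessor(fixed_length_left=30, fixed_length_right=100)
prep.fit(train_pack, verbose=0)
sorted(prep.context.keys())
# roughly: ['article_source_unit', 'claim_source_unit', 'embedding_input_dim',
#           'input_shapes', 'vocab_size', 'vocab_unit']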
+ """ + data_pack = data_pack.apply_on_text(chain_transform(self._units), verbose=verbose) + # fitted_filter_unit = build_unit_from_data_pack(self._filter_unit, + # data_pack, + # flatten=False, + # mode='right', + # verbose=verbose) + # data_pack = data_pack.apply_on_text(fitted_filter_unit.transform, + # mode='right', verbose=verbose) + # self._context['filter_unit'] = fitted_filter_unit + + vocab_unit = build_vocab_unit(data_pack, verbose=verbose) + self._context['vocab_unit'] = vocab_unit + + vocab_size = len(vocab_unit.state['term_index']) # + 1 # +1 for padding + self._context['vocab_size'] = vocab_size + self._context['embedding_input_dim'] = vocab_size + self._context['input_shapes'] = [(self._fixed_length_left,), + (self._fixed_length_right,)] + + claim_source_unit = build_entity_unit(column="claim_source", data_pack=data_pack, mode="left") + article_source_unit = build_entity_unit(column="evidence_source", data_pack=data_pack, mode="right") + self._context['claim_source_unit'] = claim_source_unit + self._context['article_source_unit'] = article_source_unit + + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create fixed length representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. + """ + data_pack = data_pack.copy() + + def map_claim_source(entity: str): return self._context['claim_source_unit'].transform([entity]) + + def map_evidence_source(entity: str): return self._context['article_source_unit'].transform([entity]) + + data_pack.left["claim_source"] = data_pack.left["claim_source"].progress_apply(map_claim_source) + data_pack.right["evidence_source"] = data_pack.right["evidence_source"].progress_apply(map_evidence_source) + data_pack.apply_on_text(chain_transform(self._units), inplace=True, verbose=verbose) + + # data_pack.apply_on_text(self._context['filter_unit'].transform, + # mode='right', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._context['vocab_unit'].transform, + mode='both', inplace=True, verbose=verbose) + data_pack.append_text_length(inplace=True, verbose=verbose) + data_pack.apply_on_text(self._left_fixedlength_unit.transform, + mode='left', inplace=True, verbose=verbose) + data_pack.apply_on_text(self._right_fixedlength_unit.transform, + mode='right', inplace=True, verbose=verbose) + + max_len_left = self._fixed_length_left + max_len_right = self._fixed_length_right + + data_pack.left['length_left'] = \ + data_pack.left['length_left'].apply( + lambda val: min(val, max_len_left)) + + data_pack.right['length_right'] = \ + data_pack.right['length_right'].apply( + lambda val: min(val, max_len_right)) + + return data_pack + + +class SplitTokenize(Unit): + """Process unit for text tokenization.""" + + def transform(self, input_: str) -> list: + """ + Process input data from raw terms to list of tokens. + + :param input_: raw textual input. + + :return tokens: tokenized tokens as a list. + """ + return input_.split() diff --git a/matchzoo/preprocessors/mz_pretrained_preprocessor.py b/matchzoo/preprocessors/mz_pretrained_preprocessor.py new file mode 100644 index 0000000..d897a8e --- /dev/null +++ b/matchzoo/preprocessors/mz_pretrained_preprocessor.py @@ -0,0 +1,250 @@ + +from tqdm import tqdm + +from . 
import units +from matchzoo import DataPack +from matchzoo.engine.base_preprocessor import BasePreprocessor +from .build_vocab_unit import build_vocab_unit +from .build_unit_from_data_pack import build_unit_from_data_pack +from .chain_transform import chain_transform +from handlers.output_handler import FileHandler +from pytorch_transformers import PreTrainedTokenizer +from pytorch_transformers.utils_glue import _truncate_seq_pair +from typing import List, Tuple +import pandas as pd +tqdm.pandas() + + +class PreTrainedModelsProcessor(PreTrainedTokenizer): + """ + a preprocessor for transform DataPack. + + :param fixed_length_left: Integer, maximize length of :attr:`left` in the + data_pack. + :param fixed_length_right: Integer, maximize length of :attr:`right` in the + data_pack. + :param filter_mode: String, mode used by :class:`FrequenceFilterUnit`, Can + be 'df', 'cf', and 'idf'. + :param filter_low_freq: Float, lower bound value used by + :class:`FrequenceFilterUnit`. + :param filter_high_freq: Float, upper bound value used by + :class:`FrequenceFilterUnit`. + :param remove_stop_words: Bool, use :class:`StopRemovalUnit` unit or not. + + + """ + + def __init__(self, max_seq_length: int, fixed_length_left: int = -1, + fixed_length_right: int = -1, + filter_mode: str = 'df', + filter_low_freq: float = 2, + filter_high_freq: float = float('inf'), + remove_stop_words: bool = False, + + tokenizer: PreTrainedTokenizer = None): + """Initialization. We may need to store vocab path file, number of tokens, blah blah. + """ + FileHandler.myprint("Query truncated to " + str(fixed_length_left) + + " Doc truncated to " + str(fixed_length_right)) + super().__init__() + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + assert fixed_length_left > 0 and fixed_length_right > 0 + self.fixed_length_left = fixed_length_left + self.fixed_length_right = fixed_length_right + assert self.fixed_length_left + self.fixed_length_right < self.max_seq_length, \ + "Left + right should be smaller than max length" + + + def fit(self, data_pack: pd.DataFrame, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param data_pack: data_pack to be preprocessed. + :param verbose: Verbosity. + :return: class:`BasicPreprocessor` instance. + """ + raise NotImplementedError("Not coded yet") + + def transform(self, data_pack: pd.DataFrame, verbose: int = 1) -> Tuple[pd.DataFrame, List]: + """ + Apply transformation on data, create fixed length representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. 
+ """ + + + # data_pack.append_text_length(inplace = True, verbose = verbose) + # we need to split each text_left to an array of tokens, then we can convert them to ids + converted_features = self._convert_examples_to_features(data_pack, label_list = [0, 1], max_seq_length = self.max_seq_length, + tokenizer = self.tokenizer, output_mode = "classification") + + # data_pack.apply_on_text(str.split, mode = 'left', inplace = True, verbose = verbose) + # data_pack.apply_on_text(self.tokenizer.convert_tokens_to_ids, + # mode = 'left', inplace = True, verbose = verbose) + + # data_pack.apply_on_text(str.split, mode = 'right', inplace = True, verbose = verbose) + # data_pack.apply_on_text(self.tokenizer.convert_tokens_to_ids, + # mode = 'right', inplace = True, verbose = verbose) + + # max_len_left = self._fixed_length_left + # max_len_right = self._fixed_length_right + # + # data_pack.left['length_left'] = \ + # data_pack.left['length_left'].apply( + # lambda val: min(val, max_len_left)) + # + # data_pack.right['length_right'] = \ + # data_pack.right['length_right'].apply( + # lambda val: min(val, max_len_right)) + return data_pack, converted_features + + + + def _convert_examples_to_features(self, examples: pd.DataFrame, label_list, max_seq_length, + tokenizer, output_mode, + cls_token_at_end = False, + cls_token = '[CLS]', + cls_token_segment_id = 1, + sep_token = '[SEP]', + sep_token_extra = False, + pad_on_left = False, + pad_token = 0, + pad_token_segment_id = 0, + sequence_a_segment_id = 0, + sequence_b_segment_id = 1, + mask_padding_with_zero = True): + """ Loads a data file into a list of `InputBatch`s + `cls_token_at_end` define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) + """ + + label_map = {label: i for i, label in enumerate(label_list)} + # from tqdm import tqdm + features = [] + ex_index = -1 + FileHandler.myprint("Processing text_left and text_right to make it a full sequence for BERT........") + assert type(examples) == pd.DataFrame + for q_id, text_a, doc_id, text_b, label in zip(examples["id_left"], examples["text_left"], + examples["id_right"], examples["text_right"], examples["label"]): + ex_index += 1 + if ex_index % 10000 == 0: FileHandler.myprint("Processed xample %d of %d" % (ex_index, len(examples))) + tokens_a = text_a.split() + tokens_a = tokens_a[:self.fixed_length_left] + tokens_b = None + assert len(text_b) != 0, "Length of documents must be not zero!" + if text_b: + tokens_b = text_b.split() + tokens_b = tokens_b[: self.fixed_length_right] + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa. + special_tokens_count = 4 if sep_token_extra else 3 + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) + else: + # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. + special_tokens_count = 3 if sep_token_extra else 2 + if len(tokens_a) > max_seq_length - special_tokens_count: + tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . 
[SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = tokens_a + [sep_token] + if sep_token_extra: + # roberta uses an extra separator b/w pairs of sentences + tokens += [sep_token] + segment_ids = [sequence_a_segment_id] * len(tokens) + + if tokens_b: + tokens += tokens_b + [sep_token] + segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) + + if cls_token_at_end: + tokens = tokens + [cls_token] + segment_ids = segment_ids + [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + segment_ids = [cls_token_segment_id] + segment_ids + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + padding_length = max_seq_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + else: + input_ids = input_ids + ([pad_token] * padding_length) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if output_mode == "classification": + label_id = label_map[label] + elif output_mode == "regression": + label_id = float(label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + FileHandler.myprint("*** Example ***") + # FileHandler.myprint("guid: %s" % (example.guid)) + FileHandler.myprint("tokens: %s" % " ".join( + [str(x) for x in tokens])) + FileHandler.myprint("input_ids: %s" % " ".join([str(x) for x in input_ids])) + FileHandler.myprint("input_mask: %s" % " ".join([str(x) for x in input_mask])) + FileHandler.myprint("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + FileHandler.myprint("label: %s (id = %d)" % (label, label_id)) + + features.append( + InputFeatures(left_id = q_id, right_id = doc_id, + text_left = text_a, text_right = text_b, + input_ids = input_ids, + input_mask = input_mask, + segment_ids = segment_ids, + label_id = label_id)) + return features + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, left_id: int, right_id: int, + text_left: str, text_right: str, + input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.left_id = left_id + self.right_id = right_id + self.text_left = text_left + self.text_right = text_right \ No newline at end of file diff --git a/matchzoo/preprocessors/naive_preprocessor.py 
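Illustrative sketch (not part of the patch): _convert_examples_to_features above lays a query/document pair out as [CLS] A [SEP] B [SEP] with segment ids and a padding mask. The toy below reproduces that layout with made-up integer ids instead of a real WordPiece tokenizer, following the BERT convention from the comment block (segment 0 for the [CLS]/A part, 1 for B, post-padding with id 0).

tokens_a = "is this jacksonville".split()
tokens_b = "no it is not".split()
max_seq_length = 12                       # token budget = 12 - 3 special tokens ([CLS], [SEP], [SEP])

tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

toy_vocab = {tok: i + 1 for i, tok in enumerate(dict.fromkeys(tokens))}   # toy id map, 0 = pad
input_ids = [toy_vocab[tok] for tok in tokens]
input_mask = [1] * len(input_ids)         # 1 = real token, 0 = padding

pad = max_seq_length - len(input_ids)
input_ids += [0] * pad
input_mask += [0] * pad
segment_ids += [0] * pad

assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length
print(tokens)
print(input_ids)
print(segment_ids)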
b/matchzoo/preprocessors/naive_preprocessor.py new file mode 100644 index 0000000..139da4e --- /dev/null +++ b/matchzoo/preprocessors/naive_preprocessor.py @@ -0,0 +1,61 @@ +"""Naive Preprocessor.""" + +from tqdm import tqdm + +from matchzoo.engine.base_preprocessor import BasePreprocessor +from matchzoo import DataPack +from .chain_transform import chain_transform +from .build_vocab_unit import build_vocab_unit +from . import units + +tqdm.pandas() + + +class NaivePreprocessor(BasePreprocessor): + """ + Naive preprocessor. + + Example: + >>> import matchzoo as mz + >>> train_data = mz.datasets.toy.load_data() + >>> test_data = mz.datasets.toy.load_data(stage='test') + >>> preprocessor = mz.preprocessors.NaivePreprocessor() + >>> train_data_processed = preprocessor.fit_transform(train_data, + ... verbose=0) + >>> type(train_data_processed) + + >>> test_data_transformed = preprocessor.transform(test_data, + ... verbose=0) + >>> type(test_data_transformed) + + + """ + + def fit(self, data_pack: DataPack, verbose: int = 1): + """ + Fit pre-processing context for transformation. + + :param data_pack: data_pack to be preprocessed. + :param verbose: Verbosity. + :return: class:`NaivePreprocessor` instance. + """ + func = chain_transform(self._default_units()) + data_pack = data_pack.apply_on_text(func, verbose=verbose) + vocab_unit = build_vocab_unit(data_pack, verbose=verbose) + self._context['vocab_unit'] = vocab_unit + return self + + def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack: + """ + Apply transformation on data, create `tri-letter` representation. + + :param data_pack: Inputs to be preprocessed. + :param verbose: Verbosity. + + :return: Transformed data as :class:`DataPack` object. + """ + units_ = self._default_units() + units_.append(self._context['vocab_unit']) + units_.append(units.FixedLength(text_length=30, pad_mode='post')) + func = chain_transform(units_) + return data_pack.apply_on_text(func, verbose=verbose) diff --git a/matchzoo/preprocessors/tfidf_preprocessor.py b/matchzoo/preprocessors/tfidf_preprocessor.py new file mode 100644 index 0000000..896aa77 --- /dev/null +++ b/matchzoo/preprocessors/tfidf_preprocessor.py @@ -0,0 +1,41 @@ +import matchzoo +from typing import List, Dict, Tuple +import torch_utils +import collections +import numpy as np + +class TFIDF: + + idf_dict = {} + idf_char_ngram = {} + + @classmethod + def init(cls, corpus: List[List[str]], char_ngram_copus: List[List[str]] = []): + """docid, value: list of words """ + stats = cls._idf(corpus) + cls.idf_dict = stats + if char_ngram_copus: + cls.idf_char_ngram = cls._idf(char_ngram_copus) + + @classmethod + def get_term_idf(cls): + return cls.idf_dict + + @classmethod + def get_char_ngram_idf(cls): + return cls.idf_char_ngram + + @classmethod + def _df(cls, list_of_tokens: list) -> dict: + stats = collections.Counter() + for tokens in list_of_tokens: + stats.update(set(tokens)) + return stats + + @classmethod + def _idf(cls, list_of_tokens: list) -> dict: + num_docs = len(list_of_tokens) + stats = cls._df(list_of_tokens) + for key, val in stats.most_common(): + stats[key] = np.log((1.0 + num_docs) / (1.0 + val)) + 1.0 + return stats \ No newline at end of file diff --git a/matchzoo/preprocessors/units/__init__.py b/matchzoo/preprocessors/units/__init__.py new file mode 100644 index 0000000..c34bed0 --- /dev/null +++ b/matchzoo/preprocessors/units/__init__.py @@ -0,0 +1,21 @@ +from .unit import Unit +from .digit_removal import DigitRemoval +from .fixed_length import FixedLength +from 
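Illustrative sketch (not part of the patch): the TFIDF helper shown above computes smoothed idf weights over a tokenized corpus. The snippet below re-runs the same `_df`/`_idf` arithmetic on a toy corpus without importing the module itself, since its `torch_utils` import is project-specific.

import collections
import numpy as np

corpus = [['claim', 'evidence', 'evidence'], ['evidence', 'source']]

df = collections.Counter()
for doc in corpus:
    df.update(set(doc))                                  # document frequency

num_docs = len(corpus)
idf = {t: np.log((1.0 + num_docs) / (1.0 + n)) + 1.0 for t, n in df.items()}
print(idf)   # 'evidence' appears in both documents, so it gets the smallest weight (1.0)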
.frequency_filter import FrequencyFilter +from .lemmatization import Lemmatization +from .lowercase import Lowercase +from .matching_histogram import MatchingHistogram +from .ngram_letter import NgramLetter +from .punc_removal import PuncRemoval +from .stateful_unit import StatefulUnit +from .stemming import Stemming +from .stop_removal import StopRemoval +from .tokenize import Tokenize +from .vocabulary import Vocabulary +from .word_hashing import WordHashing +# from .allen_char_unit import AllenCharUnit + + +def list_available() -> list: + from matchzoo.utils import list_recursive_concrete_subclasses + return list_recursive_concrete_subclasses(Unit) diff --git a/matchzoo/preprocessors/units/allen_char_unit.py b/matchzoo/preprocessors/units/allen_char_unit.py new file mode 100644 index 0000000..000999f --- /dev/null +++ b/matchzoo/preprocessors/units/allen_char_unit.py @@ -0,0 +1,60 @@ +from .unit import Unit +from allennlp.modules.elmo import batch_to_ids + + +class AllenCharUnit(Unit): + """ + Process unit for n-letter generation. + + Triletter is used in :class:`DSSMModel`. + This processor is expected to execute before `Vocab` + has been created. + + Examples: + >>> triletter = AllenCharUnit() + >>> rv = triletter.transform(['hello', 'word']) + >>> len(rv) + 9 + >>> rv + ['#he', 'hel', 'ell', 'llo', 'lo#', '#wo', 'wor', 'ord', 'rd#'] + >>> triletter = AllenCharUnit(reduce_dim=False) + >>> rv = triletter.transform(['hello', 'word']) + >>> len(rv) + 2 + >>> rv + [['#he', 'hel', 'ell', 'llo', 'lo#'], ['#wo', 'wor', 'ord', 'rd#']] + + """ + + def __init__(self, max_len: int, pad_mode='post'): + """ + Class initialization. + + :param ngram: By default use 3-gram (tri-letter). + :param reduce_dim: Reduce to 1-D list for sentence representation. + """ + self._max_len = max_len + self._pad_mode = pad_mode + + def transform(self, input_: list) -> list: + """ + Transform token into letter. + + For example, `word` should be represented as `w`, `o`, `r`, `d` + + :param input_: list of tokens to be transformed. + + :return n_letters: generated n_letters. + """ + fixed_tokens = [[0] * 50 for _ in range(self._max_len)] + input_ = [input_[:self._max_len]] # to satisfy the f*** function + ans = batch_to_ids(input_)[0].tolist() # List[List[int]] len <= self.max_len, each element is 50 numbers. + # padding 0 to ans + if self._pad_mode == 'post': + fixed_tokens[:len(ans)] = ans + elif self._pad_mode == 'pre': + fixed_tokens[-len(ans):] = ans + else: + raise ValueError('{} is not a vaild ' + 'pad mode.'.format(self._pad_mode)) + return fixed_tokens diff --git a/matchzoo/preprocessors/units/digit_removal.py b/matchzoo/preprocessors/units/digit_removal.py new file mode 100644 index 0000000..bff5f08 --- /dev/null +++ b/matchzoo/preprocessors/units/digit_removal.py @@ -0,0 +1,15 @@ +from .unit import Unit + + +class DigitRemoval(Unit): + """Process unit to remove digits.""" + + def transform(self, input_: list) -> list: + """ + Remove digits from list of tokens. + + :param input_: list of tokens to be filtered. + + :return tokens: tokens of tokens without digits. + """ + return [token for token in input_ if not token.isdigit()] diff --git a/matchzoo/preprocessors/units/fixed_length.py b/matchzoo/preprocessors/units/fixed_length.py new file mode 100644 index 0000000..d1425f0 --- /dev/null +++ b/matchzoo/preprocessors/units/fixed_length.py @@ -0,0 +1,79 @@ +import typing + +import numpy as np + +from .unit import Unit + + +class FixedLength(Unit): + """ + FixedLengthUnit Class. 
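Illustrative sketch (not part of the patch): the docstring of AllenCharUnit above is inherited from NgramLetter and does not describe this unit; transform actually returns a max_len x 50 matrix of ELMo character ids, padded or truncated to max_len tokens. Because allennlp's batch_to_ids is an external dependency, it is stubbed here with dummy rows so only the padding logic is exercised.

max_len, pad_mode = 4, 'post'
ans = [[7] * 50, [9] * 50]                     # stand-in for batch_to_ids(...)[0].tolist()

fixed_tokens = [[0] * 50 for _ in range(max_len)]
if pad_mode == 'post':
    fixed_tokens[:len(ans)] = ans
else:                                          # 'pre'
    fixed_tokens[-len(ans):] = ans

print(len(fixed_tokens), [row[0] for row in fixed_tokens])   # 4 [7, 9, 0, 0]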
+ + Process unit to get the fixed length text. + + Examples: + >>> from matchzoo.preprocessors.units import FixedLength + >>> fixedlen = FixedLength(3) + >>> fixedlen.transform(list(range(1, 6))) == [3, 4, 5] + True + >>> fixedlen.transform(list(range(1, 3))) == [0, 1, 2] + True + + """ + + def __init__( + self, + text_length: int, + pad_value: typing.Union[int, str] = 0, + pad_mode: str = 'pre', + truncate_mode: str = 'pre' + ): + """ + Class initialization. + + :param text_length: fixed length of the text. + :param pad_value: if text length is smaller than :attr:`text_length`, + filling text with :attr:`pad_value`. + :param pad_mode: String, `pre` or `post`: + pad either before or after each sequence. + :param truncate_mode: String, `pre` or `post`: + remove values from sequences larger than :attr:`text_length`, + either at the beginning or at the end of the sequences. + """ + self._text_length = text_length + self._pad_value = pad_value + self._pad_mode = pad_mode + self._truncate_mode = truncate_mode + + def transform(self, input_: list) -> list: + """ + Transform list of tokenized tokens into the fixed length text. + + :param input_: list of tokenized tokens. + + :return tokens: list of tokenized tokens in fixed length. + """ + # padding process can not handle empty list as input + if len(input_) == 0: + input_ = [self._pad_value] + np_tokens = np.array(input_) + fixed_tokens = np.full([self._text_length], self._pad_value, + dtype=np_tokens.dtype) + + if self._truncate_mode == 'pre': + trunc_tokens = input_[-self._text_length:] + elif self._truncate_mode == 'post': + trunc_tokens = input_[:self._text_length] + else: + raise ValueError('{} is not a vaild ' + 'truncate mode.'.format(self._truncate_mode)) + + if self._pad_mode == 'post': + fixed_tokens[:len(trunc_tokens)] = trunc_tokens + elif self._pad_mode == 'pre': + fixed_tokens[-len(trunc_tokens):] = trunc_tokens + else: + raise ValueError('{} is not a vaild ' + 'pad mode.'.format(self._pad_mode)) + + return fixed_tokens.tolist() diff --git a/matchzoo/preprocessors/units/frequency_filter.py b/matchzoo/preprocessors/units/frequency_filter.py new file mode 100644 index 0000000..b2fdcd9 --- /dev/null +++ b/matchzoo/preprocessors/units/frequency_filter.py @@ -0,0 +1,96 @@ +import collections +import typing + +import numpy as np + +from .stateful_unit import StatefulUnit + + +class FrequencyFilter(StatefulUnit): + """ + Frequency filter unit. + + :param low: Lower bound, inclusive. + :param high: Upper bound, exclusive. + :param mode: One of `tf` (term frequency), `df` (document frequency), + and `idf` (inverse document frequency). + + Examples:: + >>> import matchzoo as mz + + To filter based on term frequency (tf): + >>> tf_filter = mz.preprocessors.units.FrequencyFilter( + ... low=2, mode='tf') + >>> tf_filter.fit([['A', 'B', 'B'], ['C', 'C', 'C']]) + >>> tf_filter.transform(['A', 'B', 'C']) + ['B', 'C'] + + To filter based on document frequency (df): + >>> tf_filter = mz.preprocessors.units.FrequencyFilter( + ... low=2, mode='df') + >>> tf_filter.fit([['A', 'B'], ['B', 'C']]) + >>> tf_filter.transform(['A', 'B', 'C']) + ['B'] + + To filter based on inverse document frequency (idf): + >>> idf_filter = mz.preprocessors.units.FrequencyFilter( + ... 
low=1.2, mode='idf') + >>> idf_filter.fit([['A', 'B'], ['B', 'C', 'D']]) + >>> idf_filter.transform(['A', 'B', 'C']) + ['A', 'C'] + + """ + + def __init__(self, low: float = 0, high: float = float('inf'), + mode: str = 'df'): + """Frequency filter unit.""" + super().__init__() + self._low = low + self._high = high + self._mode = mode + + def fit(self, list_of_tokens: typing.List[typing.List[str]]): + """Fit `list_of_tokens` by calculating `mode` states.""" + valid_terms = set() + if self._mode == 'tf': + stats = self._tf(list_of_tokens) + elif self._mode == 'df': + stats = self._df(list_of_tokens) + elif self._mode == 'idf': + stats = self._idf(list_of_tokens) + else: + raise ValueError("%s is not a valid filtering mode." + "Mode must be one of `tf`, `df`, and `idf`." % (self._mode)) + + for k, v in stats.items(): + if self._low <= v < self._high: + valid_terms.add(k) + + self._state[self._mode] = valid_terms + + def transform(self, input_: list) -> list: + """Transform a list of tokens by filtering out unwanted words.""" + valid_terms = self._state[self._mode] + return list(filter(lambda token: token in valid_terms, input_)) + + @classmethod + def _tf(cls, list_of_tokens: list) -> dict: + stats = collections.Counter() + for tokens in list_of_tokens: + stats.update(tokens) + return stats + + @classmethod + def _df(cls, list_of_tokens: list) -> dict: + stats = collections.Counter() + for tokens in list_of_tokens: + stats.update(set(tokens)) + return stats + + @classmethod + def _idf(cls, list_of_tokens: list) -> dict: + num_docs = len(list_of_tokens) + stats = cls._df(list_of_tokens) + for key, val in stats.most_common(): + stats[key] = np.log((1 + num_docs) / (1 + val)) + 1 + return stats diff --git a/matchzoo/preprocessors/units/lemmatization.py b/matchzoo/preprocessors/units/lemmatization.py new file mode 100644 index 0000000..23c05b0 --- /dev/null +++ b/matchzoo/preprocessors/units/lemmatization.py @@ -0,0 +1,18 @@ +import nltk + +from .unit import Unit + + +class Lemmatization(Unit): + """Process unit for token lemmatization.""" + + def transform(self, input_: list) -> list: + """ + Lemmatization a sequence of tokens. + + :param input_: list of tokens to be lemmatized. + + :return tokens: list of lemmatizd tokens. + """ + lemmatizer = nltk.WordNetLemmatizer() + return [lemmatizer.lemmatize(token, pos='v') for token in input_] diff --git a/matchzoo/preprocessors/units/lowercase.py b/matchzoo/preprocessors/units/lowercase.py new file mode 100644 index 0000000..1dabb67 --- /dev/null +++ b/matchzoo/preprocessors/units/lowercase.py @@ -0,0 +1,15 @@ +from .unit import Unit + + +class Lowercase(Unit): + """Process unit for text lower case.""" + + def transform(self, input_: list) -> list: + """ + Convert list of tokens to lower case. + + :param input_: list of tokens. + + :return tokens: lower-cased list of tokens. + """ + return [token.lower() for token in input_] diff --git a/matchzoo/preprocessors/units/matching_histogram.py b/matchzoo/preprocessors/units/matching_histogram.py new file mode 100644 index 0000000..3746bad --- /dev/null +++ b/matchzoo/preprocessors/units/matching_histogram.py @@ -0,0 +1,60 @@ +import numpy as np + +from .unit import Unit + + +class MatchingHistogram(Unit): + """ + MatchingHistogramUnit Class. + + :param bin_size: The number of bins of the matching histogram. + :param embedding_matrix: The word embedding matrix applied to calculate + the matching histogram. + :param normalize: Boolean, normalize the embedding or not. 
+ :param mode: The type of the historgram, it should be one of 'CH', 'NG', + or 'LCH'. + + Examples: + >>> embedding_matrix = np.array([[1.0, -1.0], [1.0, 2.0], [1.0, 3.0]]) + >>> text_left = [0, 1] + >>> text_right = [1, 2] + >>> histogram = MatchingHistogram(3, embedding_matrix, True, 'CH') + >>> histogram.transform([text_left, text_right]) + [[3.0, 1.0, 1.0], [1.0, 2.0, 2.0]] + + """ + + def __init__(self, bin_size: int = 30, embedding_matrix=None, + normalize=True, mode: str = 'LCH'): + """The constructor.""" + self._hist_bin_size = bin_size + self._embedding_matrix = embedding_matrix + if normalize: + self._normalize_embedding() + self._mode = mode + + def _normalize_embedding(self): + """Normalize the embedding matrix.""" + l2_norm = np.sqrt( + (self._embedding_matrix * self._embedding_matrix).sum(axis=1) + ) + self._embedding_matrix = \ + self._embedding_matrix / l2_norm[:, np.newaxis] + + def transform(self, input_: list) -> list: + """Transform the input text.""" + text_left, text_right = input_ + matching_hist = np.ones((len(text_left), self._hist_bin_size), + dtype=np.float32) + embed_left = self._embedding_matrix[text_left] + embed_right = self._embedding_matrix[text_right] + matching_matrix = embed_left.dot(np.transpose(embed_right)) + for (i, j), value in np.ndenumerate(matching_matrix): + bin_index = int((value + 1.) / 2. * (self._hist_bin_size - 1.)) + matching_hist[i][bin_index] += 1.0 + if self._mode == 'NH': + matching_sum = matching_hist.sum(axis=1) + matching_hist = matching_hist / matching_sum[:, np.newaxis] + elif self._mode == 'LCH': + matching_hist = np.log(matching_hist) + return matching_hist.tolist() diff --git a/matchzoo/preprocessors/units/ngram_letter.py b/matchzoo/preprocessors/units/ngram_letter.py new file mode 100644 index 0000000..a957f33 --- /dev/null +++ b/matchzoo/preprocessors/units/ngram_letter.py @@ -0,0 +1,60 @@ +from .unit import Unit + + +class NgramLetter(Unit): + """ + Process unit for n-letter generation. + + Triletter is used in :class:`DSSMModel`. + This processor is expected to execute before `Vocab` + has been created. + + Examples: + >>> triletter = NgramLetter() + >>> rv = triletter.transform(['hello', 'word']) + >>> len(rv) + 9 + >>> rv + ['#he', 'hel', 'ell', 'llo', 'lo#', '#wo', 'wor', 'ord', 'rd#'] + >>> triletter = NgramLetter(reduce_dim=False) + >>> rv = triletter.transform(['hello', 'word']) + >>> len(rv) + 2 + >>> rv + [['#he', 'hel', 'ell', 'llo', 'lo#'], ['#wo', 'wor', 'ord', 'rd#']] + + """ + + def __init__(self, ngram: int = 3, reduce_dim: bool = True): + """ + Class initialization. + + :param ngram: By default use 3-gram (tri-letter). + :param reduce_dim: Reduce to 1-D list for sentence representation. + """ + self._ngram = ngram + self._reduce_dim = reduce_dim + + def transform(self, input_: list) -> list: + """ + Transform token into tri-letter. + + For example, `word` should be represented as `#wo`, + `wor`, `ord` and `rd#`. + + :param input_: list of tokens to be transformed. + + :return n_letters: generated n_letters. 
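Illustrative sketch (not part of the patch): MatchingHistogram.transform above buckets cosine similarities in [-1, 1] into `bin_size` bins before the optional NH/LCH normalisation. A quick standalone check of that bin mapping:

bin_size = 3
for value in (-1.0, 0.0, 0.5, 1.0):
    print(value, int((value + 1.) / 2. * (bin_size - 1.)))
# -1.0 -> bin 0, 0.0 -> bin 1, 0.5 -> bin 1, 1.0 -> bin 2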
+ """ + n_letters = [] + for token in input_: + token = '#' + token + '#' + token_ngram = [] + while len(token) >= self._ngram: + token_ngram.append(token[:self._ngram]) + token = token[1:] + if self._reduce_dim: + n_letters.extend(token_ngram) + else: + n_letters.append(token_ngram) + return n_letters diff --git a/matchzoo/preprocessors/units/punc_removal.py b/matchzoo/preprocessors/units/punc_removal.py new file mode 100644 index 0000000..302a470 --- /dev/null +++ b/matchzoo/preprocessors/units/punc_removal.py @@ -0,0 +1,20 @@ +import re + +from .unit import Unit + + +class PuncRemoval(Unit): + """Process unit for remove punctuations.""" + + _MATCH_PUNC = re.compile(r'[^\w\s]') + + def transform(self, input_: list) -> list: + """ + Remove punctuations from list of tokens. + + :param input_: list of toekns. + + :return rv: tokens without punctuation. + """ + return [token for token in input_ if + not self._MATCH_PUNC.search(token)] diff --git a/matchzoo/preprocessors/units/stateful_unit.py b/matchzoo/preprocessors/units/stateful_unit.py new file mode 100644 index 0000000..9f8b3fc --- /dev/null +++ b/matchzoo/preprocessors/units/stateful_unit.py @@ -0,0 +1,21 @@ +import abc +import typing + +from .unit import Unit + + +class StatefulUnit(Unit, metaclass=abc.ABCMeta): + """Process unit do persive state (i.e. need fit).""" + + def __init__(self): + """Initialization.""" + self._state = {} + + @property + def state(self): + """Get current state.""" + return self._state + + @abc.abstractmethod + def fit(self, input: typing.Any): + """Abstract base method, need to be implemented in subclass.""" diff --git a/matchzoo/preprocessors/units/stemming.py b/matchzoo/preprocessors/units/stemming.py new file mode 100644 index 0000000..83bf4eb --- /dev/null +++ b/matchzoo/preprocessors/units/stemming.py @@ -0,0 +1,32 @@ +import nltk + +from .unit import Unit + + +class Stemming(Unit): + """ + Process unit for token stemming. + + :param stemmer: stemmer to use, `porter` or `lancaster`. + """ + + def __init__(self, stemmer='porter'): + """Initialization.""" + self.stemmer = stemmer + + def transform(self, input_: list) -> list: + """ + Reducing inflected words to their word stem, base or root form. + + :param input_: list of string to be stemmed. + """ + if self.stemmer == 'porter': + porter_stemmer = nltk.stem.PorterStemmer() + return [porter_stemmer.stem(token) for token in input_] + elif self.stemmer == 'lancaster' or self.stemmer == 'krovetz': + lancaster_stemmer = nltk.stem.lancaster.LancasterStemmer() + return [lancaster_stemmer.stem(token) for token in input_] + else: + raise ValueError( + 'Not supported supported stemmer type: {}'.format( + self.stemmer)) diff --git a/matchzoo/preprocessors/units/stop_removal.py b/matchzoo/preprocessors/units/stop_removal.py new file mode 100644 index 0000000..ad5ff23 --- /dev/null +++ b/matchzoo/preprocessors/units/stop_removal.py @@ -0,0 +1,45 @@ +import nltk + +from .unit import Unit + + +class StopRemoval(Unit): + """ + Process unit to remove stop words. + + Example: + >>> unit = StopRemoval() + >>> unit.transform(['a', 'the', 'test']) + ['test'] + >>> type(unit.stopwords) + + """ + + def __init__(self, lang: str = 'english'): + """Initialization.""" + self._lang = lang + self._stop = nltk.corpus.stopwords.words(self._lang) + + def transform(self, input_: list) -> list: + """ + Remove stopwords from list of tokenized tokens. + + :param input_: list of tokenized tokens. + :param lang: language code for stopwords. 
+ + :return tokens: list of tokenized tokens without stopwords. + """ + return [token + for token + in input_ + if token not in self._stop] + + @property + def stopwords(self) -> list: + """ + Get stopwords based on language. + + :params lang: language code. + :return: list of stop words. + """ + return self._stop diff --git a/matchzoo/preprocessors/units/tokenize.py b/matchzoo/preprocessors/units/tokenize.py new file mode 100644 index 0000000..1aeb2e6 --- /dev/null +++ b/matchzoo/preprocessors/units/tokenize.py @@ -0,0 +1,17 @@ +import nltk + +from .unit import Unit + + +class Tokenize(Unit): + """Process unit for text tokenization.""" + + def transform(self, input_: str) -> list: + """ + Process input data from raw terms to list of tokens. + + :param input_: raw textual input. + + :return tokens: tokenized tokens as a list. + """ + return nltk.word_tokenize(input_) diff --git a/matchzoo/preprocessors/units/unit.py b/matchzoo/preprocessors/units/unit.py new file mode 100644 index 0000000..4dd3390 --- /dev/null +++ b/matchzoo/preprocessors/units/unit.py @@ -0,0 +1,10 @@ +import abc +import typing + + +class Unit(metaclass=abc.ABCMeta): + """Process unit do not persive state (i.e. do not need fit).""" + + @abc.abstractmethod + def transform(self, input_: typing.Any): + """Abstract base method, need to be implemented in subclass.""" diff --git a/matchzoo/preprocessors/units/vocabulary.py b/matchzoo/preprocessors/units/vocabulary.py new file mode 100644 index 0000000..c2caace --- /dev/null +++ b/matchzoo/preprocessors/units/vocabulary.py @@ -0,0 +1,121 @@ +from .stateful_unit import StatefulUnit + + +class Vocabulary(StatefulUnit): + # """ + # Vocabulary class. MatchZoo-Tensorflow + # + # Examples: + # >>> vocab = Vocabulary() + # >>> vocab.fit(['A', 'B', 'C', 'D', 'E']) + # >>> term_index = vocab.state['term_index'] + # >>> term_index # doctest: +SKIP + # {'E': 1, 'C': 2, 'D': 3, 'A': 4, 'B': 5} + # >>> index_term = vocab.state['index_term'] + # >>> index_term # doctest: +SKIP + # {1: 'C', 2: 'A', 3: 'E', 4: 'B', 5: 'D'} + # + # >>> term_index['out-of-vocabulary-term'] + # 0 + # >>> index_term[0] + # '' + # >>> index_term[42] + # Traceback (most recent call last): + # ... + # KeyError: 42 + # >>> a_index = term_index['A'] + # >>> c_index = term_index['C'] + # >>> vocab.transform(['C', 'A', 'C']) == [c_index, a_index, c_index] + # True + # >>> vocab.transform(['C', 'A', 'OOV']) == [c_index, a_index, 0] + # True + # >>> indices = vocab.transform(list('ABCDDZZZ')) + # >>> ''.join(vocab.state['index_term'][i] for i in indices) + # 'ABCDD' + # + # """ + """ + Vocabulary class. MatchZoo-PyTorch + + :param pad_value: The string value for the padding position. + :param oov_value: The string value for the out-of-vocabulary terms. + + Examples: + >>> vocab = Vocabulary(pad_value='[PAD]', oov_value='[OOV]') + >>> vocab.fit(['A', 'B', 'C', 'D', 'E']) + >>> term_index = vocab.state['term_index'] + >>> term_index # doctest: +SKIP + {'[PAD]': 0, '[OOV]': 1, 'D': 2, 'A': 3, 'B': 4, 'C': 5, 'E': 6} + >>> index_term = vocab.state['index_term'] + >>> index_term # doctest: +SKIP + {0: '[PAD]', 1: '[OOV]', 2: 'D', 3: 'A', 4: 'B', 5: 'C', 6: 'E'} + + >>> term_index['out-of-vocabulary-term'] + 1 + >>> index_term[0] + '[PAD]' + >>> index_term[42] + Traceback (most recent call last): + ... 
+ KeyError: 42 + >>> a_index = term_index['A'] + >>> c_index = term_index['C'] + >>> vocab.transform(['C', 'A', 'C']) == [c_index, a_index, c_index] + True + >>> vocab.transform(['C', 'A', '[OOV]']) == [c_index, a_index, 1] + True + >>> indices = vocab.transform(list('ABCDDZZZ')) + >>> ' '.join(vocab.state['index_term'][i] for i in indices) + 'A B C D D [OOV] [OOV] [OOV]' + + """ + def __init__(self, pad_value: str = '', oov_value: str = ''): + """Vocabulary unit initializer.""" + super().__init__() + self._pad = pad_value + self._oov = oov_value + self._state['term_index'] = self.TermIndex() + self._state['index_term'] = dict() + + # self._context['term_index'] = self.TermIndex() + # self._context['index_term'] = dict() + + # class IndexTerm(dict): + # """Map index to term.""" + # + # def __missing__(self, key): + # """Map out-of-vocabulary indices to empty string.""" + # if key == 0: + # return '' + # else: + # raise KeyError(key) + # + # class TermIndex(dict): + # """Map term to index.""" + # + # def __missing__(self, key): + # """Map out-of-vocabulary terms to index 0.""" + # return 0 + class TermIndex(dict): + """Map term to index.""" + + def __missing__(self, key): + """Map out-of-vocabulary terms to index 1.""" + return 1 + + def fit(self, tokens: list): + """Build a :class:`TermIndex` and a :class:`IndexTerm`.""" + # self._state['term_index'] = self.TermIndex() + # self._state['index_term'] = self.IndexTerm() + self._state['term_index'][self._pad] = 0 + self._state['term_index'][self._oov] = 1 + self._state['index_term'][0] = self._pad + self._state['index_term'][1] = self._oov + terms = sorted(list(set(tokens))) # the order will be changed!!!!! here!!!!. Therefore, I sorted these tokens. + for index, term in enumerate(terms): + self._state['term_index'][term] = index + 2 + self._state['index_term'][index + 2] = term + + def transform(self, input_: list) -> list: + """Transform a list of tokens to corresponding indices.""" + return [self._state['term_index'][token] for token in input_] diff --git a/matchzoo/preprocessors/units/word_hashing.py b/matchzoo/preprocessors/units/word_hashing.py new file mode 100644 index 0000000..10281e2 --- /dev/null +++ b/matchzoo/preprocessors/units/word_hashing.py @@ -0,0 +1,70 @@ +import collections + +import numpy as np + +from .unit import Unit + + +class WordHashing(Unit): + """ + Word-hashing layer for DSSM-based models. + + The input of :class:`WordHashingUnit` should be a list of word + sub-letter list extracted from one document. The output of is + the word-hashing representation of this document. + + :class:`NgramLetterUnit` and :class:`VocabularyUnit` are two + essential prerequisite of :class:`WordHashingUnit`. + + Examples: + >>> letters = [['#te', 'tes','est', 'st#'], ['oov']] + >>> word_hashing = WordHashing( + ... term_index={'': 0,'st#': 1, '#te': 2, 'est': 3, 'tes': 4}) + >>> hashing = word_hashing.transform(letters) + >>> hashing[0] + [0.0, 1.0, 1.0, 1.0, 1.0, 0.0] + >>> hashing[1] + [1.0, 0.0, 0.0, 0.0, 0.0, 0.0] + + """ + + def __init__( + self, + term_index: dict, + ): + """ + Class initialization. + + :param term_index: term-index mapping generated by + :class:`VocabularyUnit`. + :param dim_triletter: dimensionality of tri_leltters. + """ + self._term_index = term_index + + def transform(self, input_: list) -> list: + """ + Transform list of :attr:`letters` into word hashing layer. + + :param input_: list of `tri_letters` generated by + :class:`NgramLetterUnit`. + :return: Word hashing representation of `tri-letters`. 
+ """ + if any([isinstance(elem, list) for elem in input_]): + # The input shape for CDSSM is + # [[word1 ngram, ngram], [word2, ngram, ngram], ...]. + hashing = np.zeros((len(input_), len(self._term_index) + 1)) + for idx, word in enumerate(input_): + counted_letters = collections.Counter(word) + for key, value in counted_letters.items(): + letter_id = self._term_index.get(key, 0) + hashing[idx, letter_id] = value + else: + # The input shape for DSSM model [ngram, ngram, ...]. + hashing = np.zeros((len(self._term_index) + 1)) + counted_letters = collections.Counter(input_) + ''' Ben's doubt [Resolved] self._term_index is a dictionary of ngrams.''' + for key, value in counted_letters.items(): + letter_id = self._term_index.get(key, 0) + hashing[letter_id] = value + + return hashing.tolist() diff --git a/matchzoo/tasks/__init__.py b/matchzoo/tasks/__init__.py new file mode 100644 index 0000000..3911b0d --- /dev/null +++ b/matchzoo/tasks/__init__.py @@ -0,0 +1,2 @@ +from .classification import Classification +from .ranking import Ranking diff --git a/matchzoo/tasks/classification.py b/matchzoo/tasks/classification.py new file mode 100644 index 0000000..3acd443 --- /dev/null +++ b/matchzoo/tasks/classification.py @@ -0,0 +1,59 @@ +"""Classification task.""" + +from matchzoo.engine.base_task import BaseTask + + +class Classification(BaseTask): + """Classification task. + + Examples: + >>> classification_task = Classification(num_classes=2) + >>> classification_task.metrics = ['precision'] + >>> classification_task.num_classes + 2 + >>> classification_task.output_shape + (2,) + >>> classification_task.output_dtype + + >>> print(classification_task) + Classification Task with 2 classes + + """ + + def __init__(self, num_classes: int = 2, **kwargs): + """Classification task.""" + super().__init__(**kwargs) + if not isinstance(num_classes, int): + raise TypeError("Number of classes must be an integer.") + if num_classes < 2: + raise ValueError("Number of classes can't be smaller than 2") + self._num_classes = num_classes + + @property + def num_classes(self) -> int: + """:return: number of classes to classify.""" + return self._num_classes + + @classmethod + def list_available_losses(cls) -> list: + """:return: a list of available losses.""" + return ['categorical_crossentropy'] + + @classmethod + def list_available_metrics(cls) -> list: + """:return: a list of available metrics.""" + return ['acc'] + + @property + def output_shape(self) -> tuple: + """:return: output shape of a single sample of the task.""" + return self._num_classes, + + @property + def output_dtype(self): + """:return: target data type, expect `int` as output.""" + return int + + def __str__(self): + """:return: Task name as string.""" + return 'Classification Task with %s classes' % self._num_classes diff --git a/matchzoo/tasks/ranking.py b/matchzoo/tasks/ranking.py new file mode 100644 index 0000000..ff4cfa1 --- /dev/null +++ b/matchzoo/tasks/ranking.py @@ -0,0 +1,43 @@ +"""Ranking task.""" + +from matchzoo.engine import base_task + + +class Ranking(base_task.BaseTask): + """Ranking Task. 
+ + Examples: + >>> ranking_task = Ranking() + >>> ranking_task.metrics = ['map', 'ndcg'] + >>> ranking_task.output_shape + (1,) + >>> ranking_task.output_dtype + + >>> print(ranking_task) + Ranking Task + + """ + + @classmethod + def list_available_losses(cls) -> list: + """:return: a list of available losses.""" + return ['mse'] + + @classmethod + def list_available_metrics(cls) -> list: + """:return: a list of available metrics.""" + return ['map'] + + @property + def output_shape(self) -> tuple: + """:return: output shape of a single sample of the task.""" + return 1, + + @property + def output_dtype(self): + """:return: target data type, expect `float` as output.""" + return float + + def __str__(self): + """:return: Task name as string.""" + return 'Ranking Task' diff --git a/matchzoo/utils/__init__.py b/matchzoo/utils/__init__.py new file mode 100644 index 0000000..3766756 --- /dev/null +++ b/matchzoo/utils/__init__.py @@ -0,0 +1,4 @@ +from .one_hot import one_hot +from .get_file import get_file +from .tensor_type import TensorType +from .list_recursive_subclasses import list_recursive_concrete_subclasses diff --git a/matchzoo/utils/average_meter.py b/matchzoo/utils/average_meter.py new file mode 100644 index 0000000..b393779 --- /dev/null +++ b/matchzoo/utils/average_meter.py @@ -0,0 +1,40 @@ +"""Average meter.""" + + +class AverageMeter(object): + """ + Computes and stores the average and current value. + + Examples: + >>> am = AverageMeter() + >>> am.update(1) + >>> am.avg + 1.0 + >>> am.update(val=2.5, n=2) + >>> am.avg + 2.0 + + """ + + def __init__(self): + """Average meter constructor.""" + self.reset() + + def reset(self): + """Reset AverageMeter.""" + self._val = 0. + self._avg = 0. + self._sum = 0. + self._count = 0. + + def update(self, val, n=1): + """Update value.""" + self._val = val + self._sum += val * n + self._count += n + self._avg = self._sum / self._count + + @property + def avg(self): + """Get avg.""" + return self._avg diff --git a/matchzoo/utils/early_stopping.py b/matchzoo/utils/early_stopping.py new file mode 100644 index 0000000..d606536 --- /dev/null +++ b/matchzoo/utils/early_stopping.py @@ -0,0 +1,80 @@ +"""Early stopping.""" + +import typing + +import torch +import numpy as np + + +class EarlyStopping: + """ + EarlyStopping stops training if no improvement after a given patience. + + :param patience: Number fo events to wait if no improvement and then + stop the training. + :param should_decrease: The way to judge the best so far. + :param key: Key of metric to be compared. 
+ """ + + def __init__( + self, + patience: typing.Optional[int] = None, + should_decrease: bool = None, + key: typing.Any = None + ): + """Early stopping Constructor.""" + self._patience = patience + self._key = key + self._best_so_far = 0 + self._epochs_with_no_improvement = 0 + self._is_best_so_far = False + self._early_stop = False + + def state_dict(self) -> typing.Dict[str, typing.Any]: + """A `Trainer` can use this to serialize the state.""" + return { + 'patience': self._patience, + 'best_so_far': self._best_so_far, + 'is_best_so_far': self._is_best_so_far, + 'epochs_with_no_improvement': self._epochs_with_no_improvement, + } + + def load_state_dict( + self, + state_dict: typing.Dict[str, typing.Any] + ) -> None: + """Hydrate a early stopping from a serialized state.""" + self._patience = state_dict["patience"] + self._is_best_so_far = state_dict["is_best_so_far"] + self._best_so_far = state_dict["best_so_far"] + self._epochs_with_no_improvement = \ + state_dict["epochs_with_no_improvement"] + + def update(self, result: list): + """Call function.""" + score = result[self._key] + if score > self._best_so_far: + self._best_so_far = score + self._is_best_so_far = True + self._epochs_with_no_improvement = 0 + else: + self._is_best_so_far = False + self._epochs_with_no_improvement += 1 + + @property + def best_so_far(self) -> bool: + """Returns best so far.""" + return self._best_so_far + + @property + def is_best_so_far(self) -> bool: + """Returns true if it is the best so far.""" + return self._is_best_so_far + + @property + def should_stop_early(self) -> bool: + """Returns true if improvement has stopped for long enough.""" + if not self._patience: + return False + else: + return self._epochs_with_no_improvement >= self._patience diff --git a/matchzoo/utils/get_file.py b/matchzoo/utils/get_file.py new file mode 100644 index 0000000..0c67489 --- /dev/null +++ b/matchzoo/utils/get_file.py @@ -0,0 +1,349 @@ +"""Download file.""" +import typing +# from pathlib import Path + +import os +import hashlib +import shutil +import sys +import tarfile +import time +import zipfile +import collections +import six +from six.moves.urllib.error import HTTPError +from six.moves.urllib.error import URLError +from six.moves.urllib.request import urlretrieve +import numpy as np + +import matchzoo + + +class Progbar(object): + """ + Displays a progress bar. + + :param target: Total number of steps expected, None if unknown. + :param width: Progress bar width on screen. + :param verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) + :param stateful_metrics: Iterable of string names of metrics that + should *not* be averaged over time. Metrics in this list + will be displayed as-is. All others will be averaged + by the progbar before display. + :param interval: Minimum visual progress update interval (in seconds). 
+ """ + + def __init__( + self, + target, + width=30, + verbose=1, + interval=0.05, + ): + """Init.""" + self.target = target + self.width = width + self.verbose = verbose + self.interval = interval + + self._dynamic_display = ((hasattr(sys.stdout, + 'isatty') and sys.stdout.isatty() + ) or 'ipykernel' in sys.modules) + self._total_width = 0 + self._seen_so_far = 0 + self._start = time.time() + self._last_update = 0 + + def update(self, current): + """Updates the progress bar.""" + self._seen_so_far = current + + now = time.time() + info = ' - {0:.0f}s'.format(now - self._start) + if self.verbose == 1: + if (now - self._last_update < self.interval and self.target is not + None and current < self.target): + return + + prev_total_width = self._total_width + if self._dynamic_display: + sys.stdout.write('\b' * prev_total_width) + sys.stdout.write('\r') + else: + sys.stdout.write('\n') + + if self.target is not None: + numdigits = int(np.floor(np.log10(self.target))) + 1 + bar = '{2:{0:d}d}/{1} ['.format( + numdigits, self.target, current) + prog = float(current) / self.target + prog_width = int(self.width * prog) + if prog_width > 0: + bar += ('=' * (prog_width - 1)) + if current < self.target: + bar += '>' + else: + bar += '=' + bar += ('.' * (self.width - prog_width)) + bar += ']' + else: + bar = '{0:7d}/Unknown'.format(current) + + self._total_width = len(bar) + sys.stdout.write(bar) + + if current: + time_per_unit = (now - self._start) / current + else: + time_per_unit = 0 + if self.target is not None and current < self.target: + eta = int(time_per_unit * (self.target - current)) + if eta > 3600: + eta_format = ('{0:d}:{1:02d}:{2:02d}'.format( + eta // 3600, (eta % 3600) // 60, eta % 60)) + elif eta > 60: + eta_format = '{0:d}:{1:02d}'.format(eta // 60, eta % 60) + else: + eta_format = '{0:d}s'.format(eta) + + info = ' - ETA: {0}'.format(eta_format) + else: + if time_per_unit >= 1: + info += ' {0:.0f}s/step'.format(time_per_unit) + elif time_per_unit >= 1e-3: + info += ' {0:.0f}ms/step'.format(time_per_unit * 1e3) + else: + info += ' {0:.0f}us/step'.format(time_per_unit * 1e6) + + self._total_width += len(info) + if prev_total_width > self._total_width: + info += (' ' * (prev_total_width - self._total_width)) + + if self.target is not None and current >= self.target: + info += '\n' + + sys.stdout.write(info) + sys.stdout.flush() + + elif self.verbose == 2: + if self.target is None or current >= self.target: + info += '\n' + sys.stdout.write(info) + sys.stdout.flush() + + self._last_update = now + + +def _extract_archive(file_path, path='.', archive_format='auto'): + """ + Extracts an archive if it matches tar, tar.gz, tar.bz, or zip formats. + + :param file_path: path to the archive file + :param path: path to extract the archive file + :param archive_format: Archive format to try for extracting the file. + Options are 'auto', 'tar', 'zip', and None. + 'tar' includes tar, tar.gz, and tar.bz files. + The default 'auto' is ['tar', 'zip']. + None or an empty list will return no matches found. + + :return: True if a match was found and an archive extraction was completed, + False otherwise. 
+ """ + if archive_format is None: + return False + if archive_format == 'auto': + archive_format = ['tar', 'zip'] + if isinstance(archive_format, six.string_types): + archive_format = [archive_format] + + for archive_type in archive_format: + if archive_type == 'tar': + open_fn = tarfile.open + is_match_fn = tarfile.is_tarfile + if archive_type == 'zip': + open_fn = zipfile.ZipFile + is_match_fn = zipfile.is_zipfile + + if is_match_fn(file_path): + with open_fn(file_path) as archive: + try: + archive.extractall(path) + except (tarfile.TarError, RuntimeError, + KeyboardInterrupt): + if os.path.exists(path): + if os.path.isfile(path): + os.remove(path) + else: + shutil.rmtree(path) + raise + return True + return False + + +def get_file( + fname: str = None, + origin: str = None, + untar: bool = False, + extract: bool = False, + md5_hash: typing.Any = None, + file_hash: typing.Any = None, + hash_algorithm: str = 'auto', + archive_format: str = 'auto', + cache_subdir: str = 'data', + cache_dir: str = matchzoo.USER_DATA_DIR, + verbose: int = 1 +) -> str: + """ + Downloads a file from a URL if it not already in the cache. + + By default the file at the url `origin` is downloaded to the + cache_dir `~/.matchzoo/datasets`, placed in the cache_subdir `data`, + and given the filename `fname`. The final location of a file + `example.txt` would therefore be `~/.matchzoo/datasets/data/example.txt`. + + Files in tar, tar.gz, tar.bz, and zip formats can also be extracted. + Passing a hash will verify the file after download. The command line + programs `shasum` and `sha256sum` can compute the hash. + + :param fname: Name of the file. If an absolute path `/path/to/file.txt` is + specified the file will be saved at that location. + :param origin: Original URL of the file. + :param untar: Deprecated in favor of 'extract'. Boolean, whether the file + should be decompressed. + :param md5_hash: Deprecated in favor of 'file_hash'. md5 hash of the file + for verification. + :param file_hash: The expected hash string of the file after download. + The sha256 and md5 hash algorithms are both supported. + :param cache_subdir: Subdirectory under the cache dir where the file is + saved. If an absolute path `/path/to/folder` is specified the file + will be saved at that location. + :param hash_algorithm: Select the hash algorithm to verify the file. + options are 'md5', 'sha256', and 'auto'. The default 'auto' detects + the hash algorithm in use. + :papram extract: True tries extracting the file as an Archive, like tar + or zip. + :param archive_format: Archive format to try for extracting the file. + Options are 'auto', 'tar', 'zip', and None. + 'tar' includes tar, tar.gz, and tar.bz files. + The default 'auto' is ['tar', 'zip']. + None or an empty list will return no matches found. + :param cache_dir: Location to store cached files, when None it defaults to + the [matchzoo.USER_DATA_DIR](~/.matchzoo/datasets). + :param verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) + + :return: Path to the downloaded file. 
+ """ + if md5_hash is not None and file_hash is None: + file_hash = md5_hash + hash_algorithm = 'md5' + datadir_base = os.path.expanduser(cache_dir) + if not os.access(datadir_base, os.W_OK): + datadir_base = os.path.join('/tmp', '.matchzoo') + datadir = os.path.join(datadir_base, cache_subdir) + if not os.path.exists(datadir): + os.makedirs(datadir) + + if untar: + untar_fpath = os.path.join(datadir, fname) + fpath = untar_fpath + '.tar.gz' + else: + fpath = os.path.join(datadir, fname) + + download = False + if os.path.exists(fpath): + if file_hash is not None: + if not validate_file(fpath, file_hash, algorithm=hash_algorithm): + print('A local file was found, but it seems to be ' + 'incomplete or outdated because the file hash ' + 'does not match the original value of file_hash.' + ' We will re-download the data.') + download = True + else: + download = True + + if download: + print('Downloading data from', origin) + + class ProgressTracker(object): + progbar = None + + def dl_progress(count, block_size, total_size): + if ProgressTracker.progbar is None: + if total_size == -1: + total_size = None + ProgressTracker.progbar = Progbar( + target=total_size, verbose=verbose) + else: + ProgressTracker.progbar.update(count * block_size) + + error_msg = 'URL fetch failure on {} : {} -- {}' + try: + try: + urlretrieve(origin, fpath, dl_progress) + except HTTPError as e: + raise Exception(error_msg.format(origin, e.code, e.msg)) + except URLError as e: + raise Exception(error_msg.format(origin, e.errno, e.reason)) + except (Exception, KeyboardInterrupt): + if os.path.exists(fpath): + os.remove(fpath) + raise + ProgressTracker.progbar = None + + if untar: + if not os.path.exists(untar_fpath): + _extract_archive(fpath, datadir, archive_format='tar') + return untar_fpath + + if extract: + _extract_archive(fpath, datadir, archive_format) + + return fpath + + +def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535): + """ + Validates a file against a sha256 or md5 hash. + + :param fpath: path to the file being validated + :param file_hash: The expected hash string of the file. + The sha256 and md5 hash algorithms are both supported. + :param algorithm: Hash algorithm, one of 'auto', 'sha256', or 'md5'. + The default 'auto' detects the hash algorithm in use. + :param chunk_size: Bytes to read at a time, important for large files. + + :return: Whether the file is valid. + """ + if ((algorithm == 'sha256') or (algorithm == 'auto' and len( + file_hash) == 64)): + hasher = 'sha256' + else: + hasher = 'md5' + + if str(_hash_file(fpath, hasher, chunk_size)) == str(file_hash): + return True + else: + return False + + +def _hash_file(fpath, algorithm='sha256', chunk_size=65535): + """ + Calculates a file sha256 or md5 hash. + + :param fpath: path to the file being validated + :param algorithm: hash algorithm, one of 'auto', 'sha256', or 'md5'. + The default 'auto' detects the hash algorithm in use. + :param chunk_size: Bytes to read at a time, important for large files. + + :return: The file hash. 
+ """ + if algorithm == 'sha256': + hasher = hashlib.sha256() + else: + hasher = hashlib.md5() + + with open(fpath, 'rb') as fpath_file: + for chunk in iter(lambda: fpath_file.read(chunk_size), b''): + hasher.update(chunk) + + return hasher.hexdigest() diff --git a/matchzoo/utils/list_recursive_subclasses.py b/matchzoo/utils/list_recursive_subclasses.py new file mode 100644 index 0000000..9ce5390 --- /dev/null +++ b/matchzoo/utils/list_recursive_subclasses.py @@ -0,0 +1,17 @@ +import inspect + + +def list_recursive_concrete_subclasses(base): + """List all concrete subclasses of `base` recursively.""" + return _filter_concrete(_bfs(base)) + + +def _filter_concrete(classes): + return list(filter(lambda c: not inspect.isabstract(c), classes)) + + +def _bfs(base): + return base.__subclasses__() + sum([ + _bfs(subclass) + for subclass in base.__subclasses__() + ], []) diff --git a/matchzoo/utils/one_hot.py b/matchzoo/utils/one_hot.py new file mode 100644 index 0000000..7e3a043 --- /dev/null +++ b/matchzoo/utils/one_hot.py @@ -0,0 +1,9 @@ +"""One hot vectors.""" +import numpy as np + + +def one_hot(indices: int, num_classes: int) -> np.ndarray: + """:return: A one-hot encoded vector.""" + vec = np.zeros((num_classes,), dtype=np.int64) + vec[indices] = 1 + return vec diff --git a/matchzoo/utils/parse.py b/matchzoo/utils/parse.py new file mode 100644 index 0000000..086321c --- /dev/null +++ b/matchzoo/utils/parse.py @@ -0,0 +1,277 @@ +import typing + +import torch +from torch import nn +from torch import optim + +import matchzoo +from matchzoo.engine.base_metric import ( + BaseMetric, #RankingMetric, ClassificationMetric +) + +activation = nn.ModuleDict([ + ['relu', nn.ReLU()], + ['hardtanh', nn.Hardtanh()], + ['relu6', nn.ReLU6()], + ['sigmoid', nn.Sigmoid()], + ['tanh', nn.Tanh()], + ['softmax', nn.Softmax()], + ['softmax2d', nn.Softmax2d()], + ['logsoftmax', nn.LogSoftmax()], + ['elu', nn.ELU()], + ['selu', nn.SELU()], + # ['celu', nn.CELU()], + ['hardshrink', nn.Hardshrink()], + ['leakyrelu', nn.LeakyReLU()], + ['logsigmoid', nn.LogSigmoid()], + ['softplus', nn.Softplus()], + ['softshrink', nn.Softshrink()], + ['prelu', nn.PReLU()], + ['softsign', nn.Softsign()], + ['softmin', nn.Softmin()], + ['tanhshrink', nn.Tanhshrink()], + ['rrelu', nn.RReLU()], + ['glu', nn.GLU()], +]) + +loss = nn.ModuleDict([ + ['l1', nn.L1Loss()], + ['nll', nn.NLLLoss()], + ['kldiv', nn.KLDivLoss()], + ['mse', nn.MSELoss()], + ['bce', nn.BCELoss()], + ['bce_with_logits', nn.BCEWithLogitsLoss()], + ['cosine_embedding', nn.CosineEmbeddingLoss()], + # ['ctc', nn.CTCLoss()], + ['hinge_embedding', nn.HingeEmbeddingLoss()], + ['margin_ranking', nn.MarginRankingLoss()], + ['multi_label_margin', nn.MultiLabelMarginLoss()], + ['multi_label_soft_margin', nn.MultiLabelSoftMarginLoss()], + ['multi_margin', nn.MultiMarginLoss()], + ['smooth_l1', nn.SmoothL1Loss()], + ['soft_margin', nn.SoftMarginLoss()], + ['cross_entropy', nn.CrossEntropyLoss()], + ['triplet_margin', nn.TripletMarginLoss()], + ['poisson_nll', nn.PoissonNLLLoss()] +]) + +optimizer = dict({ + 'adadelta': optim.Adadelta, + 'adagrad': optim.Adagrad, + 'adam': optim.Adam, + 'sparse_adam': optim.SparseAdam, + 'adamax': optim.Adamax, + 'asgd': optim.ASGD, + 'lbfgs': optim.LBFGS, + 'rmsprop': optim.RMSprop, + 'rprop': optim.Rprop, + 'sgd': optim.SGD +}) + + +def _parse( + identifier: typing.Union[str, typing.Type[nn.Module], nn.Module], + dictionary: nn.ModuleDict, + target: str +) -> nn.Module: + """ + Parse loss and activation. 
+ + :param identifier: activation identifier, one of + - String: name of a activation + - Torch Modele subclass + - Torch Module instance (it will be returned unchanged). + :param dictionary: nn.ModuleDict instance. Map string identifier to + nn.Module instance. + :return: A :class:`nn.Module` instance + """ + if isinstance(identifier, str): + if identifier in dictionary: + return dictionary[identifier] + else: + raise ValueError( + 'Could not interpret %s identifier: ' % target + str(identifier) + ) + elif isinstance(identifier, nn.Module): + return identifier + elif issubclass(identifier, nn.Module): + return identifier() + else: + raise ValueError( + 'Could not interpret %s identifier: ' % (target) + str(identifier) + ) + + +def parse_activation( + identifier: typing.Union[str, typing.Type[nn.Module], nn.Module] +) -> nn.Module: + """ + Retrieves a torch Module instance. + + :param identifier: activation identifier, one of + - String: name of a activation + - Torch Modele subclass + - Torch Module instance (it will be returned unchanged). + :return: A :class:`nn.Module` instance + + Examples:: + >>> from torch import nn + >>> from matchzoo.utils import parse_activation + + Use `str` as activation: + >>> activation = parse_activation('relu') + >>> type(activation) + + + Use :class:`torch.nn.Module` subclasses as activation: + >>> type(parse_activation(nn.ReLU)) + + + Use :class:`torch.nn.Module` instances as activation: + >>> type(parse_activation(nn.ReLU())) + + + """ + + return _parse(identifier, activation, 'activation') + + +def parse_loss( + identifier: typing.Union[str, typing.Type[nn.Module], nn.Module], + task: typing.Optional[str] = None +) -> nn.Module: + """ + Retrieves a torch Module instance. + + :param identifier: loss identifier, one of + - String: name of a loss + - Torch Module subclass + - Torch Module instance (it will be returned unchanged). + :param task: Task type for determining specific loss. + :return: A :class:`nn.Module` instance + + Examples:: + >>> from torch import nn + >>> from matchzoo.utils import parse_loss + + Use `str` as loss: + >>> loss = parse_loss('mse') + >>> type(loss) + + + Use :class:`torch.nn.Module` subclasses as loss: + >>> type(parse_loss(nn.MSELoss)) + + + Use :class:`torch.nn.Module` instances as loss: + >>> type(parse_loss(nn.MSELoss())) + + + """ + return _parse(identifier, loss, 'loss') + + +def _parse_metric( + metric: typing.Union[str, typing.Type[BaseMetric], BaseMetric], + Metrix: typing.Type[BaseMetric] +) -> BaseMetric: + """ + Parse metric. + + :param metrc: Input metric in any form. + :param Metrix: Base Metric class. Either + :class:`matchzoo.engine.base_metric.RankingMetric` or + :class:`matchzoo.engine.base_metric.ClassificationMetric`. + :return: A :class:`BaseMetric` instance + """ + if isinstance(metric, str): + metric = metric.lower() # ignore case + for subclass in Metrix.__subclasses__(): + if metric == subclass.ALIAS or metric in subclass.ALIAS: + return subclass() + elif isinstance(metric, Metrix): + return metric + elif issubclass(metric, Metrix): + return metric() + raise ValueError('%s can not be used in current task.' % metric) + + +def parse_metric( + metric: typing.Union[str, typing.Type[BaseMetric], BaseMetric], + task: str +) -> BaseMetric: + """ + Parse input metric in any form into a :class:`BaseMetric` instance. + + :param metric: Input metric in any form. + :param task: Task type for determining specific metric. 
+ :return: A :class:`BaseMetric` instance + + Examples:: + >>> from matchzoo import metrics + >>> from matchzoo.utils import parse_metric + + Use `str` as MatchZoo metrics: + >>> mz_metric = parse_metric('map', 'ranking') + >>> type(mz_metric) + + + Use :class:`matchzoo.engine.BaseMetric` subclasses as MatchZoo metrics: + >>> type(parse_metric(metrics.AveragePrecision, 'ranking')) + + + Use :class:`matchzoo.engine.BaseMetric` instances as MatchZoo metrics: + >>> type(parse_metric(metrics.AveragePrecision(), 'ranking')) + + + """ + if task is None: + raise ValueError( + 'Should specify one `BaseTask`.' + ) + if task == 'ranking': + return _parse_metric(metric, RankingMetric) + if task == 'classification': + return _parse_metric(metric, ClassificationMetric) + else: + raise ValueError( + 'Should be a Ranking or Classification task.' + ) + + +def parse_optimizer( + identifier: typing.Union[str, typing.Type[optim.Optimizer]], +) -> optim.Optimizer: + """ + Parse input metric in any form into a :class:`Optimizer` class. + + :param optimizer: Input optimizer in any form. + :return: A :class:`Optimizer` class + + Examples:: + >>> from torch import optim + >>> from matchzoo.utils import parse_optimizer + + Use `str` as optimizer: + >>> parse_optimizer('adam') + + + Use :class:`torch.optim.Optimizer` subclasses as optimizer: + >>> parse_optimizer(optim.Adam) + + + """ + if isinstance(identifier, str): + identifier = identifier.lower() # ignore case + if identifier in optimizer: + return optimizer[identifier] + else: + raise ValueError( + 'Could not interpret optimizer identifier: ' + str(identifier) + ) + elif issubclass(identifier, optim.Optimizer): + return identifier + else: + raise ValueError( + 'Could not interpret optimizer identifier: ' + str(identifier) + ) diff --git a/matchzoo/utils/tensor_type.py b/matchzoo/utils/tensor_type.py new file mode 100644 index 0000000..8153cd2 --- /dev/null +++ b/matchzoo/utils/tensor_type.py @@ -0,0 +1,4 @@ +"""Define Keras tensor type.""" +import typing + +TensorType = typing.Any diff --git a/matchzoo/utils/timer.py b/matchzoo/utils/timer.py new file mode 100644 index 0000000..003770b --- /dev/null +++ b/matchzoo/utils/timer.py @@ -0,0 +1,38 @@ +"""Timer.""" + +import time + + +class Timer(object): + """Computes elapsed time.""" + + def __init__(self): + """Timer constructor.""" + self.reset() + + def reset(self): + """Reset timer.""" + self.running = True + self.total = 0 + self.start = time.time() + + def resume(self): + """Resume.""" + if not self.running: + self.running = True + self.start = time.time() + return self + + def stop(self): + """Stop.""" + if self.running: + self.running = False + self.total += time.time() - self.start + return self + + @property + def time(self): + """Return time.""" + if self.running: + return self.total + time.time() - self.start + return self.total diff --git a/matchzoo/version.py b/matchzoo/version.py new file mode 100644 index 0000000..b7c3fd2 --- /dev/null +++ b/matchzoo/version.py @@ -0,0 +1,3 @@ +"""Matchzoo version file.""" + +__version__ = '2.1.0' diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py new file mode 100644 index 0000000..a40ca5f --- /dev/null +++ b/pytorch_transformers/__init__.py @@ -0,0 +1,75 @@ +__version__ = "1.2.0" +# Work around to update TensorFlow's absl.logging threshold which alters the +# default Python logging output behavior when present. 
+# see: https://github.com/abseil/abseil-py/issues/99 +# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 +try: + import absl.logging + absl.logging.set_verbosity('info') + absl.logging.set_stderrthreshold('info') + absl.logging._warn_preinit_stderr = False +except: + pass + +# Tokenizer +from .tokenization_utils import (PreTrainedTokenizer) +from .tokenization_auto import AutoTokenizer +from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer +from .tokenization_openai import OpenAIGPTTokenizer +from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) +from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE +from .tokenization_xlm import XLMTokenizer +from .tokenization_roberta import RobertaTokenizer +from .tokenization_distilbert import DistilBertTokenizer + +# Configurations +from .configuration_utils import PretrainedConfig +from .configuration_auto import AutoConfig +from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP + +# Modeling +from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) +from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, + AutoModelWithLMHead) + +from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, + BertForMaskedLM, BertForNextSentencePrediction, + BertForSequenceClassification, BertForMultipleChoice, + BertForTokenClassification, BertForQuestionAnswering, + load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, + OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, + load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, + GPT2LMHeadModel, GPT2DoubleHeadsModel, + load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, + XLNetForSequenceClassification, XLNetForQuestionAnswering, + load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_xlm import (XLMPreTrainedModel , XLMModel, + XLMWithLMHeadModel, XLMForSequenceClassification, + XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) +from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, + DistilBertForSequenceClassification, DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, SearchDistilBert) + +# Optimization +from .optimization import (AdamW, 
ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, + WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) + +# Files and general utilities +from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, + cached_path, add_start_docstrings, add_end_docstrings, + WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME) diff --git a/pytorch_transformers/__main__.py b/pytorch_transformers/__main__.py new file mode 100644 index 0000000..b047fa7 --- /dev/null +++ b/pytorch_transformers/__main__.py @@ -0,0 +1,128 @@ +# coding: utf8 +def main(): + import sys + if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: + print( + "Should be used as one of: \n" + ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" + ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" + ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" + ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" + ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" + ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") + else: + if sys.argv[1] == "bert": + try: + from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch + except ImportError: + print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + if len(sys.argv) != 5: + # pylint: disable=line-too-long + print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") + else: + PYTORCH_DUMP_OUTPUT = sys.argv.pop() + TF_CONFIG = sys.argv.pop() + TF_CHECKPOINT = sys.argv.pop() + convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) + elif sys.argv[1] == "gpt": + from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch + if len(sys.argv) < 4 or len(sys.argv) > 5: + # pylint: disable=line-too-long + print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") + else: + OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] + PYTORCH_DUMP_OUTPUT = sys.argv[3] + if len(sys.argv) == 5: + OPENAI_GPT_CONFIG = sys.argv[4] + else: + OPENAI_GPT_CONFIG = "" + convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, + OPENAI_GPT_CONFIG, + PYTORCH_DUMP_OUTPUT) + elif sys.argv[1] == "transfo_xl": + try: + from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch + except ImportError: + print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + if len(sys.argv) < 4 or len(sys.argv) > 5: + # pylint: disable=line-too-long + print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") + else: + if 'ckpt' in sys.argv[2].lower(): + TF_CHECKPOINT = sys.argv[2] + TF_DATASET_FILE = "" + else: + TF_DATASET_FILE = sys.argv[2] + TF_CHECKPOINT = "" + PYTORCH_DUMP_OUTPUT = sys.argv[3] + if len(sys.argv) == 5: + TF_CONFIG = sys.argv[4] + else: + TF_CONFIG = "" + convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) + elif sys.argv[1] == "gpt2": + try: + from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch + except ImportError: + print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + if len(sys.argv) < 4 or len(sys.argv) > 5: + # pylint: disable=line-too-long + print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") + else: + TF_CHECKPOINT = sys.argv[2] + PYTORCH_DUMP_OUTPUT = sys.argv[3] + if len(sys.argv) == 5: + TF_CONFIG = sys.argv[4] + else: + TF_CONFIG = "" + convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) + elif sys.argv[1] == "xlnet": + try: + from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch + except ImportError: + print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + if len(sys.argv) < 5 or len(sys.argv) > 6: + # pylint: disable=line-too-long + print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") + else: + TF_CHECKPOINT = sys.argv[2] + TF_CONFIG = sys.argv[3] + PYTORCH_DUMP_OUTPUT = sys.argv[4] + if len(sys.argv) == 6: + FINETUNING_TASK = sys.argv[5] + else: + FINETUNING_TASK = None + + convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, + TF_CONFIG, + PYTORCH_DUMP_OUTPUT, + FINETUNING_TASK) + elif sys.argv[1] == "xlm": + from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch + + if len(sys.argv) != 4: + # pylint: disable=line-too-long + print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") + else: + XLM_CHECKPOINT_PATH = sys.argv[2] + PYTORCH_DUMP_OUTPUT = sys.argv[3] + + convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) + +if __name__ == '__main__': + main() diff --git a/pytorch_transformers/configuration_auto.py b/pytorch_transformers/configuration_auto.py new file mode 100644 index 0000000..9e35f85 --- /dev/null +++ b/pytorch_transformers/configuration_auto.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Model class. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging + +from .configuration_bert import BertConfig +from .configuration_openai import OpenAIGPTConfig +from .configuration_gpt2 import GPT2Config +from .configuration_transfo_xl import TransfoXLConfig +from .configuration_xlnet import XLNetConfig +from .configuration_xlm import XLMConfig +from .configuration_roberta import RobertaConfig +from .configuration_distilbert import DistilBertConfig + +logger = logging.getLogger(__name__) + + +class AutoConfig(object): + r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class + that will be instantiated as one of the configuration classes of the library + when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method take care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The base model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertConfig (DistilBERT model) + - contains `bert`: BertConfig (Bert model) + - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) + - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) + - contains `xlnet`: XLNetConfig (XLNet model) + - contains `xlm`: XLMConfig (XLM model) + - contains `roberta`: RobertaConfig (RoBERTa model) + + This class cannot be instantiated using `__init__()` (throw an error). + """ + def __init__(self): + raise EnvironmentError("AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" Instantiate a one of the configuration classes of the library + from a pre-trained model configuration. + + The configuration class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertConfig (DistilBERT model) + - contains `bert`: BertConfig (Bert model) + - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) + - contains `gpt2`: GPT2Config (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) + - contains `xlnet`: XLNetConfig (XLNet model) + - contains `xlm`: XLMConfig (XLM model) + - contains `roberta`: RobertaConfig (RoBERTa model) + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. 
+ + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. + - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + + - If False, then this function returns just the final configuration object. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + + Examples:: + + config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') + config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, + foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. 
Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) diff --git a/pytorch_transformers/configuration_bert.py b/pytorch_transformers/configuration_bert.py new file mode 100644 index 0000000..7fff3e5 --- /dev/null +++ b/pytorch_transformers/configuration_bert.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", +} + + +class BertConfig(PretrainedConfig): + r""" + :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a + `BertModel`. + + + Arguments: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. 
+ num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + """ + pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs): + super(BertConfig, self).__init__(**kwargs) + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") diff --git a/pytorch_transformers/configuration_distilbert.py b/pytorch_transformers/configuration_distilbert.py new file mode 100644 index 0000000..b8929ee --- /dev/null +++ b/pytorch_transformers/configuration_distilbert.py @@ -0,0 +1,89 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
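+# A minimal usage sketch (file names are hypothetical): configuration classes such as
+# BertConfig accept either an integer vocabulary size or a path to a JSON config file
+# as their first argument, and can round-trip through `to_json_file`:
+#
+#     config = BertConfig(vocab_size_or_config_json_file=30522, num_hidden_layers=6)
+#     config.to_json_file('my_bert_config.json')
+#     reloaded = BertConfig('my_bert_config.json')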
+""" DistilBERT model configuration """ +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" +} + + +class DistilBertConfig(PretrainedConfig): + pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30522, + max_position_embeddings=512, + sinusoidal_pos_embds=True, + n_layers=6, + n_heads=12, + dim=768, + hidden_dim=4*768, + dropout=0.1, + attention_dropout=0.1, + activation='gelu', + initializer_range=0.02, + tie_weights_=True, + qa_dropout=0.1, + seq_classif_dropout=0.2, + **kwargs): + super(DistilBertConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.max_position_embeddings = max_position_embeddings + self.sinusoidal_pos_embds = sinusoidal_pos_embds + self.n_layers = n_layers + self.n_heads = n_heads + self.dim = dim + self.hidden_dim = hidden_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation = activation + self.initializer_range = initializer_range + self.tie_weights_ = tie_weights_ + self.qa_dropout = qa_dropout + self.seq_classif_dropout = seq_classif_dropout + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + @property + def hidden_size(self): + return self.dim + + @property + def num_attention_heads(self): + return self.n_heads + + @property + def num_hidden_layers(self): + return self.n_layers diff --git a/pytorch_transformers/configuration_gpt2.py b/pytorch_transformers/configuration_gpt2.py new file mode 100644 index 0000000..c83d9e8 --- /dev/null +++ b/pytorch_transformers/configuration_gpt2.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" OpenAI GPT-2 configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"} + +class GPT2Config(PretrainedConfig): + """Configuration class to store the configuration of a `GPT2Model`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__( + self, + vocab_size_or_config_json_file=50257, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + + num_labels=1, + summary_type='cls_index', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): + """Constructs GPT2Config. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + super(GPT2Config, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/configuration_openai.py b/pytorch_transformers/configuration_openai.py new file mode 100644 index 0000000..b27df56 --- /dev/null +++ b/pytorch_transformers/configuration_openai.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" OpenAI GPT configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" +} + +class OpenAIGPTConfig(PretrainedConfig): + """ + Configuration class to store the configuration of a `OpenAIGPTModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. + n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. 
+ afn: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + layer_norm_epsilon: epsilon to use in the layer norm layers + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + predict_special_tokens: should we predict special tokens (when the model has a LM head) + """ + pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__( + self, + vocab_size_or_config_json_file=40478, + n_positions=512, + n_ctx=512, + n_embd=768, + n_layer=12, + n_head=12, + afn="gelu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + predict_special_tokens=True, + + num_labels=1, + summary_type='cls_index', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): + """Constructs OpenAIGPTConfig. + """ + super(OpenAIGPTConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.afn = afn + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/configuration_roberta.py b/pytorch_transformers/configuration_roberta.py new file mode 100644 index 0000000..b92d6a9 --- /dev/null +++ b/pytorch_transformers/configuration_roberta.py @@ -0,0 +1,35 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" RoBERTa configuration """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging + +from .configuration_bert import BertConfig + +logger = logging.getLogger(__name__) + +ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", +} + + +class RobertaConfig(BertConfig): + pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/pytorch_transformers/configuration_transfo_xl.py b/pytorch_transformers/configuration_transfo_xl.py new file mode 100644 index 0000000..2e966ee --- /dev/null +++ b/pytorch_transformers/configuration_transfo_xl.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Transformer XL configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", +} + +class TransfoXLConfig(PretrainedConfig): + """Configuration class to store the configuration of a `TransfoXLModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. + cutoffs: cutoffs for the adaptive softmax + d_model: Dimensionality of the model's hidden states. + d_embed: Dimensionality of the embeddings + d_head: Dimensionality of the model's heads. + div_val: divident value for adapative input and softmax + pre_lnorm: apply LayerNorm to the input instead of the output + d_inner: Inner dimension in FF + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + tgt_len: number of tokens to predict + ext_len: length of the extended context + mem_len: length of the retained previous heads + same_length: use the same attn length for all tokens + proj_share_all_but_first: True to share all but first projs, False not to share. 
+ attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. + clamp_len: use the same pos embeddings after clamp_len + sample_softmax: number of samples in sampled softmax + adaptive: use adaptive softmax + tie_weight: tie the word embedding and softmax weights + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention probabilities. + untie_r: untie relative position biases + embd_pdrop: The dropout ratio for the embeddings. + init: parameter initializer to use + init_range: parameters initialized by U(-init_range, init_range). + proj_init_std: parameters initialized by N(0, init_std) + init_std: parameters initialized by N(0, init_std) + """ + pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=267735, + cutoffs=[20000, 40000, 200000], + d_model=1024, + d_embed=1024, + n_head=16, + d_head=64, + d_inner=4096, + div_val=4, + pre_lnorm=False, + n_layer=18, + tgt_len=128, + ext_len=0, + mem_len=1600, + clamp_len=1000, + same_length=True, + proj_share_all_but_first=True, + attn_type=0, + sample_softmax=-1, + adaptive=True, + tie_weight=True, + dropout=0.1, + dropatt=0.0, + untie_r=True, + init="normal", + init_range=0.01, + proj_init_std=0.01, + init_std=0.02, + **kwargs): + """Constructs TransfoXLConfig. + """ + super(TransfoXLConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_token = vocab_size_or_config_json_file + self.cutoffs = [] + self.cutoffs.extend(cutoffs) + self.tie_weight = tie_weight + if proj_share_all_but_first: + self.tie_projs = [False] + [True] * len(self.cutoffs) + else: + self.tie_projs = [False] + [False] * len(self.cutoffs) + self.d_model = d_model + self.d_embed = d_embed + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.pre_lnorm = pre_lnorm + self.n_layer = n_layer + self.n_head = n_head + self.tgt_len = tgt_len + self.ext_len = ext_len + self.mem_len = mem_len + self.same_length = same_length + self.attn_type = attn_type + self.clamp_len = clamp_len + self.sample_softmax = sample_softmax + self.adaptive = adaptive + self.dropout = dropout + self.dropatt = dropatt + self.untie_r = untie_r + self.init = init + self.init_range = init_range + self.proj_init_std = proj_init_std + self.init_std = init_std + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def max_position_embeddings(self): + return self.tgt_len + self.ext_len + self.mem_len + + @property + def vocab_size(self): + return self.n_token + + @vocab_size.setter + def vocab_size(self, value): + self.n_token = value + + @property + def hidden_size(self): + return self.d_model + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/configuration_utils.py b/pytorch_transformers/configuration_utils.py new file mode 100644 index 0000000..7efc735 --- /dev/null +++ 
b/pytorch_transformers/configuration_utils.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Configuration base class and utilities.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import copy +import json +import logging +import os +from io import open + +from .file_utils import cached_path, CONFIG_NAME + +logger = logging.getLogger(__name__) + +class PretrainedConfig(object): + r""" Base class for all configuration classes. + Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. + + Note: + A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. + It only affects the model's configuration. + + Class attributes (overridden by derived classes): + - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values. + + Parameters: + ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. + ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens) + ``output_attentions``: boolean, default `False`. Should the model returns attentions weights. + ``output_hidden_states``: string, default `False`. Should the model returns all hidden-states. + ``torchscript``: string, default `False`. Is the model used with Torchscript. + """ + pretrained_config_archive_map = {} + + def __init__(self, **kwargs): + self.finetuning_task = kwargs.pop('finetuning_task', None) + self.num_labels = kwargs.pop('num_labels', 2) + self.output_attentions = kwargs.pop('output_attentions', False) + self.output_hidden_states = kwargs.pop('output_hidden_states', False) + self.torchscript = kwargs.pop('torchscript', False) + self.pruned_heads = kwargs.pop('pruned_heads', {}) + + def save_pretrained(self, save_directory): + """ Save a configuration object to the directory `save_directory`, so that it + can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method. + """ + assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. 
+ + Parameters: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. + + - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. + - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + + - If False, then this function returns just the final configuration object. + - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. + + Examples:: + + # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a + # derived class: BertConfig + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = BertConfig.from_pretrained('./test/saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') + config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, + foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) + return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) + + if pretrained_model_name_or_path in cls.pretrained_config_archive_map: + config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + else: + config_file = pretrained_model_name_or_path + # redirect to the cache, if necessary + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + except EnvironmentError as e: + if pretrained_model_name_or_path in cls.pretrained_config_archive_map: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(cls.pretrained_config_archive_map.keys()), + config_file)) + raise e + if resolved_config_file == config_file: + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + + # Load config + config = cls.from_json_file(resolved_config_file) + + if hasattr(config, 'pruned_heads'): + config.pruned_heads = dict((int(key), set(value)) for key, value in config.pruned_heads.items()) + + # Update config with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info("Model config %s", config) + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def from_dict(cls, json_object): + """Constructs a `Config` from a Python dictionary of parameters.""" + config = cls(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with 
open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) diff --git a/pytorch_transformers/configuration_xlm.py b/pytorch_transformers/configuration_xlm.py new file mode 100644 index 0000000..ab251c8 --- /dev/null +++ b/pytorch_transformers/configuration_xlm.py @@ -0,0 +1,184 @@ +# coding=utf-8 +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" XLM configuration """ +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", +} + + +class XLMConfig(PretrainedConfig): + """Configuration class to store the configuration of a `XLMModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`. + d_model: Size of the encoder layers and the pooler layer. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + d_inner: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + ff_activation: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + untie_r: untie relative position biases + attn_type: 'bi' for XLM, 'uni' for Transformer-XL + + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). 
+ initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + + dropout: float, dropout rate. + dropatt: float, dropout rate on attention probabilities. + init: str, the initialization scheme, either "normal" or "uniform". + init_range: float, initialize the parameters with a uniform distribution + in [-init_range, init_range]. Only effective when init="uniform". + init_std: float, initialize the parameters with a normal distribution + with mean 0 and stddev init_std. Only effective when init="normal". + mem_len: int, the number of tokens to cache. + reuse_len: int, the number of tokens in the currect batch to be cached + and reused in the future. + bi_data: bool, whether to use bidirectional input pipeline. + Usually set to True during pretraining and False during finetuning. + clamp_len: int, clamp all relative distances larger than clamp_len. + -1 means no clamping. + same_length: bool, whether to use the same attention length for each token. + """ + pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30145, + emb_dim=2048, + n_layers=12, + n_heads=16, + dropout=0.1, + attention_dropout=0.1, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=1, + use_lang_emb=True, + max_position_embeddings=512, + embed_init_std=2048 ** -0.5, + layer_norm_eps=1e-12, + init_std=0.02, + bos_index=0, + eos_index=1, + pad_index=2, + unk_index=3, + mask_index=5, + is_encoder=True, + + finetuning_task=None, + num_labels=2, + summary_type='first', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + start_n_top=5, + end_n_top=5, + **kwargs): + """Constructs XLMConfig. 
+ """ + super(XLMConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_words = vocab_size_or_config_json_file + self.emb_dim = emb_dim + self.n_layers = n_layers + self.n_heads = n_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.causal = causal + self.asm = asm + self.n_langs = n_langs + self.use_lang_emb = use_lang_emb + self.layer_norm_eps = layer_norm_eps + self.bos_index = bos_index + self.eos_index = eos_index + self.pad_index = pad_index + self.unk_index = unk_index + self.mask_index = mask_index + self.is_encoder = is_encoder + self.max_position_embeddings = max_position_embeddings + self.embed_init_std = embed_init_std + self.init_std = init_std + self.finetuning_task = finetuning_task + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_proj_to_labels = summary_proj_to_labels + self.summary_first_dropout = summary_first_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def vocab_size(self): + return self.n_words + + @vocab_size.setter + def vocab_size(self, value): + self.n_words = value + + @property + def hidden_size(self): + return self.emb_dim + + @property + def num_attention_heads(self): + return self.n_heads + + @property + def num_hidden_layers(self): + return self.n_layers diff --git a/pytorch_transformers/configuration_xlnet.py b/pytorch_transformers/configuration_xlnet.py new file mode 100644 index 0000000..204d44a --- /dev/null +++ b/pytorch_transformers/configuration_xlnet.py @@ -0,0 +1,172 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
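A short usage sketch (illustrative, not part of this patch) for the configuration API above, assuming the module path ``pytorch_transformers.configuration_xlm`` introduced in this diff; it round-trips an ``XLMConfig`` through JSON and exercises the kwargs-override and ``return_unused_kwargs`` behaviour documented in ``PretrainedConfig.from_pretrained``, plus the standardized property aliases defined just above::

    import os
    import tempfile

    from pytorch_transformers.configuration_xlm import XLMConfig

    # Build a config from keyword arguments, then round-trip it through JSON
    # using the serialization helpers defined on PretrainedConfig.
    config = XLMConfig(emb_dim=1024, n_layers=6)
    with tempfile.TemporaryDirectory() as tmp_dir:
        json_path = os.path.join(tmp_dir, "my_xlm_config.json")  # hypothetical file name
        config.to_json_file(json_path)

        # Reload and override an existing attribute via kwargs; unknown keys are
        # handed back when return_unused_kwargs=True, as documented above.
        reloaded, unused = XLMConfig.from_pretrained(
            json_path, causal=True, foo=False, return_unused_kwargs=True)
        assert reloaded.causal is True
        assert unused == {"foo": False}

        # The standardized properties map onto the XLM-specific attribute names.
        assert reloaded.hidden_size == reloaded.emb_dim
        assert reloaded.vocab_size == reloaded.n_words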
+""" XLNet configuration """ +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", + 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", +} + + +class XLNetConfig(PretrainedConfig): + """Configuration class to store the configuration of a ``XLNetModel``. + + Args: + vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. + d_model: Size of the encoder layers and the pooler layer. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + d_inner: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + ff_activation: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + untie_r: untie relative position biases + attn_type: 'bi' for XLNet, 'uni' for Transformer-XL + + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention + probabilities. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + + dropout: float, dropout rate. + dropatt: float, dropout rate on attention probabilities. + init: str, the initialization scheme, either "normal" or "uniform". + init_range: float, initialize the parameters with a uniform distribution + in [-init_range, init_range]. Only effective when init="uniform". + init_std: float, initialize the parameters with a normal distribution + with mean 0 and stddev init_std. Only effective when init="normal". + mem_len: int, the number of tokens to cache. + reuse_len: int, the number of tokens in the currect batch to be cached + and reused in the future. + bi_data: bool, whether to use bidirectional input pipeline. + Usually set to True during pretraining and False during finetuning. + clamp_len: int, clamp all relative distances larger than clamp_len. + -1 means no clamping. + same_length: bool, whether to use the same attention length for each token. + finetuning_task: name of the glue task on which the model was fine-tuned if any + """ + pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=32000, + d_model=1024, + n_layer=24, + n_head=16, + d_inner=4096, + ff_activation="gelu", + untie_r=True, + attn_type="bi", + + initializer_range=0.02, + layer_norm_eps=1e-12, + + dropout=0.1, + mem_len=None, + reuse_len=None, + bi_data=False, + clamp_len=-1, + same_length=False, + + finetuning_task=None, + num_labels=2, + summary_type='last', + summary_use_proj=True, + summary_activation='tanh', + summary_last_dropout=0.1, + start_n_top=5, + end_n_top=5, + **kwargs): + """Constructs XLNetConfig. 
+ """ + super(XLNetConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_token = vocab_size_or_config_json_file + self.d_model = d_model + self.n_layer = n_layer + self.n_head = n_head + assert d_model % n_head == 0 + self.d_head = d_model // n_head + self.ff_activation = ff_activation + self.d_inner = d_inner + self.untie_r = untie_r + self.attn_type = attn_type + + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + self.dropout = dropout + self.mem_len = mem_len + self.reuse_len = reuse_len + self.bi_data = bi_data + self.clamp_len = clamp_len + self.same_length = same_length + + self.finetuning_task = finetuning_task + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_last_dropout = summary_last_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def max_position_embeddings(self): + return -1 + + @property + def vocab_size(self): + return self.n_token + + @vocab_size.setter + def vocab_size(self, value): + self.n_token = value + + @property + def hidden_size(self): + return self.d_model + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py new file mode 100644 index 0000000..eb5b300 --- /dev/null +++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py @@ -0,0 +1,75 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
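As a quick illustration (a sketch, not part of the diff) of the invariants enforced by ``XLNetConfig.__init__`` above, which asserts that ``d_model`` is divisible by ``n_head`` and derives the per-head width, together with the standardized property aliases::

    from pytorch_transformers.configuration_xlnet import XLNetConfig

    # __init__ above checks d_model % n_head == 0 and derives d_head from the pair.
    config = XLNetConfig(d_model=768, n_layer=12, n_head=12)
    assert config.d_head == config.d_model // config.n_head == 64

    # Standardized aliases shared with the other configuration classes:
    assert config.hidden_size == config.d_model
    assert config.num_attention_heads == config.n_head
    assert config.num_hidden_layers == config.n_layer
    assert config.vocab_size == config.n_token       # alias with a setter
    assert config.max_position_embeddings == -1      # XLNet imposes no fixed length

    # XLNetConfig(d_model=100, n_head=16) would fail the divisibility assertion.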
+"""Convert OpenAI GPT checkpoint.""" + +from __future__ import absolute_import, division, print_function + +import argparse +from io import open + +import torch + +from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, + GPT2Config, + GPT2Model, + load_tf_weights_in_gpt2) + +import logging +logging.basicConfig(level=logging.INFO) + + +def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): + # Construct model + if gpt2_config_file == "": + config = GPT2Config() + else: + config = GPT2Config.from_json_file(gpt2_config_file) + model = GPT2Model(config) + + # Load weights from numpy + load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) + + # Save pytorch-model + pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(pytorch_config_dump_path)) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--gpt2_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path to the TensorFlow checkpoint path.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + parser.add_argument("--gpt2_config_file", + default = "", + type = str, + help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" + "This specifies the model architecture.") + args = parser.parse_args() + convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, + args.gpt2_config_file, + args.pytorch_dump_folder_path) diff --git a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py new file mode 100644 index 0000000..5eecdd9 --- /dev/null +++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py @@ -0,0 +1,75 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
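A hedged usage sketch for the GPT-2 converter defined above; the checkpoint and dump paths are hypothetical, and reloading the dump with ``from_pretrained`` assumes the library's usual directory-loading behaviour (the dump folder holds exactly ``pytorch_model.bin`` and ``config.json``, which is what directory loading expects)::

    import os

    from pytorch_transformers import GPT2Model
    from pytorch_transformers.convert_gpt2_checkpoint_to_pytorch import (
        convert_gpt2_checkpoint_to_pytorch)

    tf_checkpoint = "/path/to/gpt2/model.ckpt"   # hypothetical TF checkpoint
    dump_dir = "/path/to/gpt2_pytorch_dump"      # hypothetical output folder

    # The script does not create the output folder itself.
    os.makedirs(dump_dir, exist_ok=True)

    # An empty config path falls back to the default GPT2Config, as in the script.
    convert_gpt2_checkpoint_to_pytorch(tf_checkpoint, "", dump_dir)

    # The folder now contains pytorch_model.bin and config.json, so it can be
    # reloaded with the usual directory-based from_pretrained call.
    model = GPT2Model.from_pretrained(dump_dir)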
+"""Convert OpenAI GPT checkpoint.""" + +from __future__ import absolute_import, division, print_function + +import argparse +from io import open + +import torch + +from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, + OpenAIGPTConfig, + OpenAIGPTModel, + load_tf_weights_in_openai_gpt) + +import logging +logging.basicConfig(level=logging.INFO) + + +def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): + # Construct model + if openai_config_file == "": + config = OpenAIGPTConfig() + else: + config = OpenAIGPTConfig.from_json_file(openai_config_file) + model = OpenAIGPTModel(config) + + # Load weights from numpy + load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) + + # Save pytorch-model + pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(pytorch_config_dump_path)) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--openai_checkpoint_folder_path", + default = None, + type = str, + required = True, + help = "Path to the TensorFlow checkpoint path.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + parser.add_argument("--openai_config_file", + default = "", + type = str, + help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" + "This specifies the model architecture.") + args = parser.parse_args() + convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, + args.openai_config_file, + args.pytorch_dump_folder_path) diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py new file mode 100644 index 0000000..15fd6bf --- /dev/null +++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py @@ -0,0 +1,130 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
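The OpenAI GPT script above repeats the GPT-2 one almost verbatim: build a config (from JSON or defaults), instantiate the model, run the matching ``load_tf_weights_in_*`` function, then dump the state dict and config JSON. A possible refactoring (purely illustrative, not part of this patch) factors those shared steps behind the ``(Config, Model, load_fn)`` triple::

    import os

    import torch

    from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                      GPT2Config, GPT2Model, load_tf_weights_in_gpt2,
                                      OpenAIGPTConfig, OpenAIGPTModel,
                                      load_tf_weights_in_openai_gpt)


    def convert_tf_checkpoint(config_cls, model_cls, load_fn,
                              tf_checkpoint_path, config_file, dump_folder):
        """Generic form of the scripts above: config -> model -> TF weights -> dump."""
        config = config_cls.from_json_file(config_file) if config_file else config_cls()
        model = model_cls(config)
        load_fn(model, config, tf_checkpoint_path)

        os.makedirs(dump_folder, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(dump_folder, WEIGHTS_NAME))
        with open(os.path.join(dump_folder, CONFIG_NAME), "w", encoding="utf-8") as f:
            f.write(config.to_json_string())


    # Hypothetical invocations mirroring the two scripts:
    # convert_tf_checkpoint(GPT2Config, GPT2Model, load_tf_weights_in_gpt2,
    #                       "/path/to/gpt2/model.ckpt", "", "/path/to/gpt2_dump")
    # convert_tf_checkpoint(OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt,
    #                       "/path/to/openai-gpt/", "", "/path/to/openai_gpt_dump")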
+ +"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" + +import os +import argparse +import torch +import numpy as np +import tensorflow as tf +from pytorch_transformers import BertModel + + +def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): + + """ + :param model:BertModel Pytorch model instance to be converted + :param ckpt_dir: Tensorflow model directory + :param model_name: model name + :return: + + Currently supported HF models: + Y BertModel + N BertForMaskedLM + N BertForPreTraining + N BertForMultipleChoice + N BertForNextSentencePrediction + N BertForSequenceClassification + N BertForQuestionAnswering + """ + + tensors_to_transpose = ( + "dense.weight", + "attention.self.query", + "attention.self.key", + "attention.self.value" + ) + + var_map = ( + ('layer.', 'layer_'), + ('word_embeddings.weight', 'word_embeddings'), + ('position_embeddings.weight', 'position_embeddings'), + ('token_type_embeddings.weight', 'token_type_embeddings'), + ('.', '/'), + ('LayerNorm/weight', 'LayerNorm/gamma'), + ('LayerNorm/bias', 'LayerNorm/beta'), + ('weight', 'kernel') + ) + + if not os.path.isdir(ckpt_dir): + os.makedirs(ckpt_dir) + + state_dict = model.state_dict() + + def to_tf_var_name(name:str): + for patt, repl in iter(var_map): + name = name.replace(patt, repl) + return 'bert/{}'.format(name) + + def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): + tf_dtype = tf.dtypes.as_dtype(tensor.dtype) + tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) + session.run(tf.variables_initializer([tf_var])) + session.run(tf_var) + return tf_var + + tf.reset_default_graph() + with tf.Session() as session: + for var_name in state_dict: + tf_name = to_tf_var_name(var_name) + torch_tensor = state_dict[var_name].numpy() + if any([x in var_name for x in tensors_to_transpose]): + torch_tensor = torch_tensor.T + tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) + tf.keras.backend.set_value(tf_var, torch_tensor) + tf_weight = session.run(tf_var) + print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) + + saver = tf.train.Saver(tf.trainable_variables()) + saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) + + +def main(raw_args=None): + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", + type=str, + required=True, + help="model name e.g. 
bert-base-uncased") + parser.add_argument("--cache_dir", + type=str, + default=None, + required=False, + help="Directory containing pytorch model") + parser.add_argument("--pytorch_model_path", + type=str, + required=True, + help="/path/to/.bin") + parser.add_argument("--tf_cache_dir", + type=str, + required=True, + help="Directory in which to save tensorflow model") + args = parser.parse_args(raw_args) + + model = BertModel.from_pretrained( + pretrained_model_name_or_path=args.model_name, + state_dict=torch.load(args.pytorch_model_path), + cache_dir=args.cache_dir + ) + + convert_pytorch_checkpoint_to_tf( + model=model, + ckpt_dir=args.tf_cache_dir, + model_name=args.model_name + ) + + +if __name__ == "__main__": + main() diff --git a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py new file mode 100644 index 0000000..9f74254 --- /dev/null +++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py @@ -0,0 +1,180 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert RoBERTa checkpoint.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import logging +import numpy as np +import torch + +from fairseq.models.roberta import RobertaModel as FairseqRobertaModel +from fairseq.modules import TransformerSentenceEncoderLayer +from pytorch_transformers import (BertConfig, BertEncoder, + BertIntermediate, BertLayer, + BertModel, BertOutput, + BertSelfAttention, + BertSelfOutput) +from pytorch_transformers import (RobertaEmbeddings, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaModel) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +SAMPLE_TEXT = 'Hello world! cécé herlolip' + + +def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head): + """ + Copy/paste/tweak roberta's weights to our BERT structure. + """ + roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) + roberta.eval() # disable dropout + config = BertConfig( + vocab_size_or_config_json_file=50265, + hidden_size=roberta.args.encoder_embed_dim, + num_hidden_layers=roberta.args.encoder_layers, + num_attention_heads=roberta.args.encoder_attention_heads, + intermediate_size=roberta.args.encoder_ffn_embed_dim, + max_position_embeddings=514, + type_vocab_size=1, + layer_norm_eps=1e-5, # PyTorch default used in fairseq + ) + if classification_head: + config.num_labels = roberta.args.num_classes + print("Our BERT config:", config) + + model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config) + model.eval() + + # Now let's copy all the weights. 
+ # Embeddings + roberta_sent_encoder = roberta.model.decoder.sentence_encoder + model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight + model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight + model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. + model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight + model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias + + for i in range(config.num_hidden_layers): + # Encoder: start of layer + layer: BertLayer = model.roberta.encoder.layer[i] + roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] + + ### self attention + self_attn: BertSelfAttention = layer.attention.self + assert( + roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size)) + ) + # we use three distinct linear layers so we split the source layer here. + self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :] + self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size] + self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :] + self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size] + self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :] + self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:] + + ### self-attention output + self_output: BertSelfOutput = layer.attention.output + assert( + self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape + ) + self_output.dense.weight = roberta_layer.self_attn.out_proj.weight + self_output.dense.bias = roberta_layer.self_attn.out_proj.bias + self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight + self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias + + ### intermediate + intermediate: BertIntermediate = layer.intermediate + assert( + intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape + ) + intermediate.dense.weight = roberta_layer.fc1.weight + intermediate.dense.bias = roberta_layer.fc1.bias + + ### output + bert_output: BertOutput = layer.output + assert( + bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape + ) + bert_output.dense.weight = roberta_layer.fc2.weight + bert_output.dense.bias = roberta_layer.fc2.bias + bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight + bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias + #### end of layer + + if classification_head: + model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight + model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias + model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight + model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias + else: + # LM Head + model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight + model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias + model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight + model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias + 
model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight + model.lm_head.bias = roberta.model.decoder.lm_head.bias + + # Let's check that we get the same results. + input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 + + our_output = model(input_ids)[0] + if classification_head: + their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids)) + else: + their_output = roberta.model(input_ids)[0] + print(our_output.shape, their_output.shape) + max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() + print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 + success = torch.allclose(our_output, their_output, atol=1e-3) + print( + "Do both models output the same tensors?", + "🔥" if success else "💩" + ) + if not success: + raise Exception("Something went wRoNg") + + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--roberta_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path the official PyTorch dump.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + parser.add_argument("--classification_head", + action = "store_true", + help = "Whether to convert a final classification head.") + args = parser.parse_args() + convert_roberta_checkpoint_to_pytorch( + args.roberta_checkpoint_path, + args.pytorch_dump_folder_path, + args.classification_head + ) + diff --git a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000..d382d35 --- /dev/null +++ b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
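The RoBERTa converter above finishes with a numerical-equivalence check: it reports the maximum absolute difference between the two forward passes and rejects the conversion if ``torch.allclose`` fails at ``atol=1e-3``. The same pattern is useful after any weight surgery; a minimal sketch follows (the helper name is mine, not the library's)::

    import torch


    def check_outputs_match(our_output, their_output, atol=1e-3):
        """Report the largest deviation and fail loudly if the models disagree."""
        max_abs_diff = torch.max(torch.abs(our_output - their_output)).item()
        print(f"max_absolute_diff = {max_abs_diff}")
        if not torch.allclose(our_output, their_output, atol=atol):
            raise ValueError(
                f"converted model diverges from the original (max diff {max_abs_diff})")


    # e.g. with two [1, seq_len, hidden_size] tensors computed from the same input_ids:
    # check_outputs_match(model(input_ids)[0], roberta.model(input_ids)[0])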
+"""Convert BERT checkpoint.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import torch + +from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert + +import logging +logging.basicConfig(level=logging.INFO) + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): + # Initialise PyTorch model + config = BertConfig.from_json_file(bert_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = BertForPreTraining(config) + + # Load weights from tf checkpoint + load_tf_weights_in_bert(model, config, tf_checkpoint_path) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path to the TensorFlow checkpoint path.") + parser.add_argument("--bert_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.bert_config_file, + args.pytorch_dump_path) diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py new file mode 100644 index 0000000..b310b73 --- /dev/null +++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py @@ -0,0 +1,117 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Transformer XL checkpoint and datasets.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import os +import sys +from io import open + +import torch + +import pytorch_transformers.tokenization_transfo_xl as data_utils + +from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME +from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl) +from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + +import logging +logging.basicConfig(level=logging.INFO) + +# We do this to be able to load python 2 datasets pickles +# See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 +data_utils.Vocab = data_utils.TransfoXLTokenizer +data_utils.Corpus = data_utils.TransfoXLCorpus +sys.modules['data_utils'] = data_utils +sys.modules['vocabulary'] = data_utils + +def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, + transfo_xl_config_file, + pytorch_dump_folder_path, + transfo_xl_dataset_file): + if transfo_xl_dataset_file: + # Convert a pre-processed corpus (see original TensorFlow repo) + with open(transfo_xl_dataset_file, "rb") as fp: + corpus = pickle.load(fp, encoding="latin1") + # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) + pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file'] + print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) + corpus_vocab_dict = corpus.vocab.__dict__ + torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) + + corpus_dict_no_vocab = corpus.__dict__ + corpus_dict_no_vocab.pop('vocab', None) + pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME + print("Save dataset to {}".format(pytorch_dataset_dump_path)) + torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) + + if tf_checkpoint_path: + # Convert a pre-trained TensorFlow model + config_path = os.path.abspath(transfo_xl_config_file) + tf_path = os.path.abspath(tf_checkpoint_path) + + print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) + # Initialise PyTorch model + if transfo_xl_config_file == "": + config = TransfoXLConfig() + else: + config = TransfoXLConfig.from_json_file(transfo_xl_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = TransfoXLLMHeadModel(config) + + model = load_tf_weights_in_transfo_xl(model, config, tf_path) + # Save pytorch-model + pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) + pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) + print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the folder to store the PyTorch model or dataset/vocab.") + parser.add_argument("--tf_checkpoint_path", + default = "", + type = str, + help = "An optional path to a TensorFlow checkpoint path to be converted.") + parser.add_argument("--transfo_xl_config_file", + default = "", + type = str, + help = "An optional config json file corresponding to the pre-trained BERT model. 
\n" + "This specifies the model architecture.") + parser.add_argument("--transfo_xl_dataset_file", + default = "", + type = str, + help = "An optional dataset file to be converted in a vocabulary.") + args = parser.parse_args() + convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.transfo_xl_config_file, + args.pytorch_dump_folder_path, + args.transfo_xl_dataset_file) diff --git a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py new file mode 100644 index 0000000..d6a3cd8 --- /dev/null +++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py @@ -0,0 +1,75 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert OpenAI GPT checkpoint.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import json +from io import open + +import torch +import numpy + +from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME +from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES + +import logging +logging.basicConfig(level=logging.INFO) + +def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): + # Load checkpoint + chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') + + model = chkpt['model'] + + config = chkpt['params'] + config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) + + vocab = chkpt['dico_word2id'] + vocab = dict((s + '' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) + + # Save pytorch-model + pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] + + print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) + torch.save(model, pytorch_weights_dump_path) + + print("Save configuration file to {}".format(pytorch_config_dump_path)) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(json.dumps(config, indent=2) + "\n") + + print("Save vocab file to {}".format(pytorch_config_dump_path)) + with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: + f.write(json.dumps(vocab, indent=2) + "\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--xlm_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path the official PyTorch dump.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + args = parser.parse_args() + convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py new file mode 
100644 index 0000000..a36fa51 --- /dev/null +++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py @@ -0,0 +1,104 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BERT checkpoint.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import argparse +import torch + +from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, + XLNetConfig, + XLNetLMHeadModel, XLNetForQuestionAnswering, + XLNetForSequenceClassification, + load_tf_weights_in_xlnet) + +GLUE_TASKS_NUM_LABELS = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "wnli": 2, +} + +import logging +logging.basicConfig(level=logging.INFO) + +def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): + # Initialise PyTorch model + config = XLNetConfig.from_json_file(bert_config_file) + + finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" + if finetuning_task in GLUE_TASKS_NUM_LABELS: + print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) + config.finetuning_task = finetuning_task + config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] + model = XLNetForSequenceClassification(config) + elif 'squad' in finetuning_task: + config.finetuning_task = finetuning_task + model = XLNetForQuestionAnswering(config) + else: + model = XLNetLMHeadModel(config) + + # Load weights from tf checkpoint + load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) + + # Save pytorch-model + pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) + pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) + print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path to the TensorFlow checkpoint path.") + parser.add_argument("--xlnet_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained XLNet model. 
\n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the folder to store the PyTorch model or dataset/vocab.") + parser.add_argument("--finetuning_task", + default = None, + type = str, + help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned") + args = parser.parse_args() + print(args) + + convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.xlnet_config_file, + args.pytorch_dump_folder_path, + args.finetuning_task) diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py new file mode 100644 index 0000000..3fe7fa8 --- /dev/null +++ b/pytorch_transformers/file_utils.py @@ -0,0 +1,294 @@ +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" +from __future__ import (absolute_import, division, print_function, unicode_literals) + +import sys +import json +import logging +import os +import six +import shutil +import tempfile +import fnmatch +from functools import wraps +from hashlib import sha256 +from io import open + +import boto3 +from botocore.config import Config +from botocore.exceptions import ClientError +import requests +from tqdm import tqdm + +try: + from torch.hub import _get_torch_home + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv('TORCH_HOME', os.path.join( + os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) +default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers') + +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + +try: + from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))) +except (AttributeError, ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE', + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + default_cache_path)) + +PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility + +WEIGHTS_NAME = "pytorch_model.bin" +TF_WEIGHTS_NAME = 'model.ckpt' +CONFIG_NAME = "config.json" + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +if not six.PY2: + def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = ''.join(docstr) + fn.__doc__ + return fn + return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + ''.join(docstr) + return fn + return docstring_decorator +else: + # Not possible to update class docstrings on python2 + def add_start_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' 
+ etag_hash.hexdigest() + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = PYTORCH_TRANSFORMERS_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise EnvironmentError("file {} not found".format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise EnvironmentError("file {} not found".format(meta_path)) + + with open(meta_path, encoding="utf-8") as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None): + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. If it's already a local path, + make sure the file exists and then return the path. + Args: + cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). + force_download: if True, re-dowload the file even if it's already cached in the cache dir. + """ + if cache_dir is None: + cache_dir = PYTORCH_TRANSFORMERS_CACHE + if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. + raise EnvironmentError("file {} not found".format(url_or_filename)) + else: + # Something unknown + raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. 
+ """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise EnvironmentError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url, proxies=None): + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file, proxies=None): + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file, proxies=None): + req = requests.get(url, stream=True, proxies=proxies) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url, cache_dir=None, force_download=False, proxies=None): + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = PYTORCH_TRANSFORMERS_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + if sys.version_info[0] == 2 and not isinstance(cache_dir, str): + cache_dir = str(cache_dir) + + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + # Get eTag to add to filename, if it exists. + if url.startswith("s3://"): + etag = s3_etag(url, proxies=proxies) + else: + try: + response = requests.head(url, allow_redirects=True, proxies=proxies) + if response.status_code != 200: + etag = None + else: + etag = response.headers.get("ETag") + except EnvironmentError: + etag = None + + if sys.version_info[0] == 2 and etag is not None: + etag = etag.decode('utf-8') + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + # If we don't have a connection (etag is None) and can't identify the file + # try to get the last downloaded one + if not os.path.exists(cache_path) and etag is None: + matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') + matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) + if matching_files: + cache_path = os.path.join(cache_dir, matching_files[-1]) + + if not os.path.exists(cache_path) or force_download: + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. 
+ with tempfile.NamedTemporaryFile() as temp_file: + logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) + + # GET file object + if url.startswith("s3://"): + s3_get(url, temp_file, proxies=proxies) + else: + http_get(url, temp_file, proxies=proxies) + + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + # shutil.copyfileobj() starts at the current position, so go to the start + temp_file.seek(0) + + logger.info("copying %s to cache at %s", temp_file.name, cache_path) + with open(cache_path, 'wb') as cache_file: + shutil.copyfileobj(temp_file, cache_file) + + logger.info("creating metadata file for %s", cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w') as meta_file: + output_string = json.dumps(meta) + if sys.version_info[0] == 2 and isinstance(output_string, str): + output_string = unicode(output_string, 'utf-8') # The beauty of python 2 + meta_file.write(output_string) + + logger.info("removing temp file %s", temp_file.name) + + return cache_path diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py new file mode 100644 index 0000000..31c8faf --- /dev/null +++ b/pytorch_transformers/modeling_auto.py @@ -0,0 +1,497 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Model class. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging + +from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering +from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel +from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel +from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel +from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering +from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering +from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification +from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification + +from .modeling_utils import PreTrainedModel, SequenceSummary + +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + + +class AutoModel(object): + r""" + :class:`~pytorch_transformers.AutoModel` is a generic model class + that will be instantiated as one of the base model classes of the library + when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. 
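A note on the caching helpers defined in ``file_utils.py`` above: cached files are named by the SHA-256 of their URL, optionally suffixed with the hash of the server's ETag, and ``get_from_cache()`` writes a ``.json`` sidecar recording the provenance. A small sketch, using a URL that appears in this patch and a hypothetical ETag value::

    from hashlib import sha256

    from pytorch_transformers.file_utils import url_to_filename

    url = ("https://s3.amazonaws.com/models.huggingface.co/bert/"
           "xlnet-base-cased-config.json")
    etag = '"0123456789abcdef"'  # hypothetical ETag returned by a HEAD request

    name = url_to_filename(url, etag)
    assert name == (sha256(url.encode("utf-8")).hexdigest()
                    + "." + sha256(etag.encode("utf-8")).hexdigest())

    # get_from_cache() stores the payload at <cache_dir>/<name> and a sidecar
    # <name>.json holding {"url": ..., "etag": ...}, which filename_to_url() reads back.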
+ + The base model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModel (DistilBERT model) + - contains `roberta`: RobertaModel (RoBERTa model) + - contains `bert`: BertModel (Bert model) + - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) + - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) + - contains `xlnet`: XLNetModel (XLNet model) + - contains `xlm`: XLMModel (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModel is designed to be instantiated " + "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the base model classes of the library + from a pre-trained model configuration. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModel (DistilBERT model) + - contains `roberta`: RobertaModel (RoBERTa model) + - contains `bert`: BertModel (Bert model) + - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) + - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) + - contains `xlnet`: XLNetModel (XLNet model) + - contains `xlm`: XLMModel (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. 
+ This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModel.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + +class AutoModelWithLMHead(object): + r""" + :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model) + - contains `roberta`: RobertaForMaskedLM (RoBERTa model) + - contains `bert`: BertForMaskedLM (Bert model) + - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) + - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model) + - contains `xlnet`: XLNetLMHeadModel (XLNet model) + - contains `xlm`: XLMWithLMHeadModel (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " + "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. 
+ + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model) + - contains `roberta`: RobertaForMaskedLM (RoBERTa model) + - contains `bert`: BertForMaskedLM (Bert model) + - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) + - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model) + - contains `xlnet`: XLNetLMHeadModel (XLNet model) + - contains `xlm`: XLMWithLMHeadModel (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 
+ The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModelWithLMHead.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelWithLMHead.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. 
Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + +class AutoModelForSequenceClassification(object): + r""" + :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class + that will be instantiated as one of the sequence classification model classes of the library + when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model) + - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) + - contains `bert`: BertForSequenceClassification (Bert model) + - contains `xlnet`: XLNetForSequenceClassification (XLNet model) + - contains `xlm`: XLMForSequenceClassification (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " + "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the sequence classification model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model) + - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) + - contains `bert`: BertForSequenceClassification (Bert model) + - contains `xlnet`: XLNetForSequenceClassification (XLNet model) + - contains `xlm`: XLMForSequenceClassification (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. 
+ + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contain one of " + "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)) + + +class AutoModelForQuestionAnswering(object): + r""" + :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class + that will be instantiated as one of the question answering model classes of the library + when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model) + - contains `bert`: BertForQuestionAnswering (Bert model) + - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) + - contains `xlm`: XLMForQuestionAnswering (XLM model) + + This class cannot be instantiated using `__init__()` (throws an error). + """ + def __init__(self): + raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated " + "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the question answering model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string.
+ + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model) + - contains `bert`: BertForQuestionAnswering (Bert model) + - contains `xlnet`: XLNetForQuestionAnswering (XLNet model) + - contains `xlm`: XLMForQuestionAnswering (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). 
Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py new file mode 100644 index 0000000..1d89183 --- /dev/null +++ b/pytorch_transformers/modeling_bert.py @@ -0,0 +1,1148 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. 
""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_bert import BertConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", +} + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model. + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m", "global_step"] for n in name): + logger.info("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif l[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, l[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") + BertLayerNorm = torch.nn.LayerNorm + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask, head_mask=None): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) + heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads + for head in heads: + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, input_tensor, attention_mask, head_mask=None): + self_outputs = self.self(input_tensor, attention_mask, head_mask) + attention_output = self.output(self_outputs[0], input_tensor) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def 
forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask, head_mask=None): + attention_outputs = self.attention(hidden_states, attention_mask, head_mask) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask, head_mask=None): + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i]) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
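+         # This Linear is created with `bias=False` because its weight is tied to the input word embeddings via `_tie_or_clone_weights()` (see `tie_weights()` in BertForPreTraining below); the standalone `self.bias` is added explicitly in `forward()`.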
+ self.decoder = nn.Linear(config.hidden_size, + config.vocab_size, + bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = BertConfig + pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = "bert" + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +BERT_START_DOCSTRING = r""" The BERT model was proposed in + `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ + by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer + pre-trained using a combination of masked language modeling objective and next sentence prediction + on a large corpus comprising the Toronto Book Corpus and Wikipedia. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`: + https://arxiv.org/abs/1810.04805 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. 
+""" + +BERT_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertModel(BertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(BertModel, self).__init__(config) + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.embeddings.word_embeddings + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.embeddings.word_embeddings = new_embeddings + return self.embeddings.word_embeddings + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
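+         # Worked example (illustrative): for attention_mask = [1, 1, 0], (1.0 - mask) * -10000.0 gives [0.0, 0.0, -10000.0], so padded positions receive a large negative bias and effectively zero weight after the softmax.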
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) + encoder_outputs = self.encoder(embedding_output, + extended_attention_mask, + head_mask=head_mask) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training: + a `masked language modeling` head and a `next sentence prediction (classification)` head. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForPreTraining(BertPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForPreTraining.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + prediction_scores, seq_relationship_scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForPreTraining, self).__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.cls.predictions.decoder, + self.bert.embeddings.word_embeddings) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + next_sentence_label=None, position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + outputs = (total_loss,) + outputs + + return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. 
+ **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMaskedLM.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + loss, prediction_scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForMaskedLM, self).__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.cls.predictions.decoder, + self.bert.embeddings.word_embeddings) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + outputs = (masked_lm_loss,) + outputs + + return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForNextSentencePrediction(BertPreTrainedModel): + r""" + **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Next sequence prediction (classification) loss. 
+ **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + seq_relationship_scores = outputs[0] + + """ + def __init__(self, config): + super(BertForNextSentencePrediction, self).__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + self.init_weights() + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, + position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + pooled_output = outputs[1] + + seq_relationship_score = self.cls(pooled_output) + + outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + outputs = (next_sentence_loss,) + outputs + + return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForSequenceClassification(BertPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForSequenceClassification.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(BertForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) + + self.init_weights() + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, + position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING) +class BertForMultipleChoice(BertPreTrainedModel): + r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + The second dimension of the input (`num_choices`) indicates the number of choices to score. + To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. 
+ The second dimension of the input (`num_choices`) indicates the number of choices to score. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Mask to avoid performing attention on padding token indices. + The second dimension of the input (`num_choices`) indicates the number of choices to score. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. + **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above). + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
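    How the extra choice dimension is handled is easiest to see in the `forward` implementation further below; the following is a minimal sketch with hypothetical toy tensors (the shapes and the stand-in score tensor are assumptions made for illustration, not code from this model)::

        import torch

        input_ids = torch.randint(0, 30522, (1, 2, 5))            # (batch_size, num_choices, seq_len)
        num_choices = input_ids.shape[1]
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))   # (2, 5): choices folded into the batch
        logits = torch.randn(flat_input_ids.size(0), 1)           # stand-in for BERT + Linear(hidden_size, 1)
        reshaped_logits = logits.view(-1, num_choices)            # (1, 2): one score per choice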
+ + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMultipleChoice.from_pretrained('bert-base-uncased') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices + labels = torch.tensor(1).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, classification_scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForMultipleChoice, self).__init__(config) + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, + position_ids=None, head_mask=None): + num_choices = input_ids.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, head_mask=head_mask) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + outputs = (loss,) + outputs + + return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForTokenClassification(BertPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. + **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
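    When an ``attention_mask`` is passed, the loss in ``forward`` below is only computed over non-padded positions; a self-contained sketch of that masking with toy tensors (the sizes and values are assumptions for illustration)::

        import torch
        from torch.nn import CrossEntropyLoss

        num_labels = 3
        logits = torch.randn(1, 5, num_labels)            # (batch_size, seq_len, num_labels)
        labels = torch.tensor([[0, 2, 1, 0, 0]])
        attention_mask = torch.tensor([[1, 1, 1, 0, 0]])  # last two tokens are padding

        active = attention_mask.view(-1) == 1             # keep only real tokens
        loss = CrossEntropyLoss()(logits.view(-1, num_labels)[active],
                                  labels.view(-1)[active])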
+ + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForTokenClassification.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, + position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), scores, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForQuestionAnswering(BertPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForQuestionAnswering.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForQuestionAnswering, self).__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, + end_positions=None, position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/pytorch_transformers/modeling_distilbert.py b/pytorch_transformers/modeling_distilbert.py new file mode 100644 index 0000000..2a0f27f --- /dev/null +++ b/pytorch_transformers/modeling_distilbert.py @@ -0,0 +1,765 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DistilBERT model + adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) + and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) +""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import copy +import sys +from io import open + +import itertools +import numpy as np + +import torch +import torch.nn as nn + +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_distilbert import DistilBertConfig +from .file_utils import add_start_docstrings + +import logging +logger = logging.getLogger(__name__) + + +DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin" +} + + +### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### +def gelu(x): + return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) + +def create_sinusoidal_embeddings(n_pos, dim, out): + position_enc = np.array([ + [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] + for pos in range(n_pos) + ]) + out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + out.requires_grad = False + +class Embeddings(nn.Module): + def __init__(self, + config): + super(Embeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) + if config.sinusoidal_pos_embds: + create_sinusoidal_embeddings(n_pos=config.max_position_embeddings, + dim=config.dim, + out=self.position_embeddings.weight) + + self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, input_ids): + """ + Parameters + ---------- + input_ids: torch.tensor(bs, max_seq_length) + The token ids to embed. 
+ + Outputs + ------- + embeddings: torch.tensor(bs, max_seq_length, dim) + The embedded tokens (plus position embeddings, no token_type embeddings) + """ + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) + + word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) + position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) + + embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) + embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) + embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) + return embeddings + +class MultiHeadSelfAttention(nn.Module): + def __init__(self, config): + super(MultiHeadSelfAttention, self).__init__() + + self.n_heads = config.n_heads + self.dim = config.dim + self.dropout = nn.Dropout(p=config.attention_dropout) + self.output_attentions = config.output_attentions + + assert self.dim % self.n_heads == 0 + + self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + + self.pruned_heads = set() + + def prune_heads(self, heads): + attention_head_size = self.dim // self.n_heads + if len(heads) == 0: + return + mask = torch.ones(self.n_heads, attention_head_size) + heads = set(heads) - self.pruned_heads + for head in heads: + head -= sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear layers + self.q_lin = prune_linear_layer(self.q_lin, index) + self.k_lin = prune_linear_layer(self.k_lin, index) + self.v_lin = prune_linear_layer(self.v_lin, index) + self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.dim = attention_head_size * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, query, key, value, mask, head_mask = None): + """ + Parameters + ---------- + query: torch.tensor(bs, seq_length, dim) + key: torch.tensor(bs, seq_length, dim) + value: torch.tensor(bs, seq_length, dim) + mask: torch.tensor(bs, seq_length) + + Outputs + ------- + weights: torch.tensor(bs, n_heads, seq_length, seq_length) + Attention weights + context: torch.tensor(bs, seq_length, dim) + Contextualized layer. 
Optional: only if `output_attentions=True` + """ + bs, q_length, dim = query.size() + k_length = key.size(1) + # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) + # assert key.size() == value.size() + + dim_per_head = self.dim // self.n_heads + + assert 2 <= mask.dim() <= 3 + causal = (mask.dim() == 3) + mask_reshp = (bs, 1, 1, k_length) + + def shape(x): + """ separate heads """ + return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) + + def unshape(x): + """ group heads """ + return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + scores = torch.matmul(q, k.transpose(2,3)) # (bs, n_heads, q_length, k_length) + mask = (mask==0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) + scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) + + weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) + weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) + + if self.output_attentions: + return (context, weights) + else: + return (context,) + +class FFN(nn.Module): + def __init__(self, config): + super(FFN, self).__init__() + self.dropout = nn.Dropout(p=config.dropout) + self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) + self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) + assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation) + self.activation = gelu if config.activation == 'gelu' else nn.ReLU() + + def forward(self, input): + x = self.lin1(input) + x = self.activation(x) + x = self.lin2(x) + x = self.dropout(x) + return x + +class TransformerBlock(nn.Module): + def __init__(self, config): + super(TransformerBlock, self).__init__() + + self.n_heads = config.n_heads + self.dim = config.dim + self.hidden_dim = config.hidden_dim + self.dropout = nn.Dropout(p=config.dropout) + self.activation = config.activation + self.output_attentions = config.output_attentions + + assert config.dim % config.n_heads == 0 + + self.attention = MultiHeadSelfAttention(config) + self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) + + self.ffn = FFN(config) + self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) + + def forward(self, x, attn_mask=None, head_mask=None): + """ + Parameters + ---------- + x: torch.tensor(bs, seq_length, dim) + attn_mask: torch.tensor(bs, seq_length) + + Outputs + ------- + sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) + The attention weights + ffn_output: torch.tensor(bs, seq_length, dim) + The output of the transformer block contextualization. 
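        The block body below follows the usual post-layer-norm residual pattern; a minimal stand-alone sketch (the stand-in tensors and sizes are assumptions for illustration, not the module's own code)::

            import torch
            import torch.nn as nn

            dim = 8
            x = torch.randn(2, 5, dim)                   # (bs, seq_length, dim)
            attn_out = torch.randn_like(x)               # stand-in for the self-attention output
            sa_output = nn.LayerNorm(dim)(attn_out + x)  # residual connection, then LayerNorm
            ffn_out = torch.randn_like(sa_output)        # stand-in for FFN(sa_output)
            ffn_output = nn.LayerNorm(dim)(ffn_out + sa_output)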
+ """ + # Self-Attention + sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask) + if self.output_attentions: + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples + assert type(sa_output) == tuple + sa_output = sa_output[0] + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + + # Feed Forward Network + ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) + ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) + + output = (ffn_output,) + if self.output_attentions: + output = (sa_weights,) + output + return output + + +class Transformer(nn.Module): + def __init__(self, config): + super(Transformer, self).__init__() + self.n_layers = config.n_layers + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + layer = TransformerBlock(config) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) + + def forward(self, x, attn_mask=None, head_mask=None): + """ + Parameters + ---------- + x: torch.tensor(bs, seq_length, dim) + Input sequence embedded. + attn_mask: torch.tensor(bs, seq_length) + Attention mask on the sequence. + + Outputs + ------- + hidden_state: torch.tensor(bs, seq_length, dim) + Sequence of hiddens states in the last (top) layer + all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] + Tuple of length n_layers with the hidden states from each layer. + Optional: only if output_hidden_states=True + all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] + Tuple of length n_layers with the attention weights from each layer + Optional: only if output_attentions=True + """ + all_hidden_states = () + all_attentions = () + + hidden_state = x + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + layer_outputs = layer_module(x=hidden_state, + attn_mask=attn_mask, + head_mask=head_mask[i]) + hidden_state = layer_outputs[-1] + + if self.output_attentions: + assert len(layer_outputs) == 2 + attentions = layer_outputs[0] + all_attentions = all_attentions + (attentions,) + else: + assert len(layer_outputs) == 1 + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + outputs = (hidden_state,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + + +### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### +class DistilBertPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + config_class = DistilBertConfig + pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = None + base_model_prefix = "distilbert" + + def __init__(self, *inputs, **kwargs): + super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """ Initialize the weights. 
+ """ + if isinstance(module, nn.Embedding): + if module.weight.requires_grad: + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +DISTILBERT_START_DOCSTRING = r""" + DistilBERT is a small, fast, cheap and light Transformer model + trained by distilling Bert base. It has 40% less parameters than + `bert-base-uncased`, runs 60% faster while preserving over 95% of + Bert's performances as measured on the GLUE language understanding benchmark. + + Here are the differences between the interface of Bert and DistilBert: + + - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`) + - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option. + + For more information on DistilBERT, please refer to our + `detailed blog post`_ + + .. _`detailed blog post`: + https://medium.com/huggingface/distilbert-8cf3380435b5 + + Parameters: + config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +DISTILBERT_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids** ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + The input sequences should start with `[CLS]` and end with `[SEP]` tokens. + + For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT. + **attention_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertModel(DistilBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertModel.from_pretrained('distilbert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(DistilBertModel, self).__init__(config) + + self.embeddings = Embeddings(config) # Embeddings + self.transformer = Transformer(config) # Encoder + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.embeddings.word_embeddings + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.embeddings.word_embeddings = new_embeddings + return self.embeddings.word_embeddings + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.transformer.layer[layer].attention.prune_heads(heads) + + def forward(self, + input_ids, attention_mask=None, head_mask=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) # (bs, seq_length) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings(input_ids) # (bs, seq_length, dim) + tfmr_output = self.transformer(x=embedding_output, + attn_mask=attention_mask, + head_mask=head_mask) + hidden_state = tfmr_output[0] + output = (hidden_state, ) + tfmr_output[1:] + + return output # last-layer hidden-state, (all hidden_states), (all attentions) + + +@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. 
""", + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertForMaskedLM(DistilBertPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + loss, prediction_scores = outputs[:2] + + """ + def __init__(self, config): + super(DistilBertForMaskedLM, self).__init__(config) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + self.distilbert = DistilBertModel(config) + self.vocab_transform = nn.Linear(config.dim, config.dim) + self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) + self.vocab_projector = nn.Linear(config.dim, config.vocab_size) + + self.init_weights() + self.tie_weights() + + self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
+ """ + self._tie_or_clone_weights(self.vocab_projector, + self.distilbert.embeddings.word_embeddings) + + def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None): + dlbrt_output = self.distilbert(input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask) + hidden_states = dlbrt_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) + + outputs = (prediction_logits, ) + dlbrt_output[1:] + if masked_lm_labels is not None: + mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), + masked_lm_labels.view(-1)) + outputs = (mlm_loss,) + outputs + + return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) + + +@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertForSequenceClassification(DistilBertPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
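    Unlike ``BertForSequenceClassification``, which classifies the pooler output, the head in ``forward`` below pools by taking the hidden state of the first ([CLS]) position; a minimal sketch (the sizes and dropout probability are illustrative assumptions)::

        import torch
        import torch.nn as nn

        dim, num_labels = 768, 2                       # illustrative sizes
        pre_classifier = nn.Linear(dim, dim)
        classifier = nn.Linear(dim, num_labels)

        hidden_state = torch.randn(1, 6, dim)          # stand-in for distilbert_output[0]
        pooled = hidden_state[:, 0]                    # (1, dim): the first ([CLS]) position
        pooled = nn.ReLU()(pre_classifier(pooled))
        logits = classifier(nn.Dropout(0.2)(pooled))   # (1, num_labels)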
+ + Examples:: + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(DistilBertForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.distilbert = DistilBertModel(config) + self.pre_classifier = nn.Linear(config.dim, config.dim) + self.classifier = nn.Linear(config.dim, config.num_labels) + self.dropout = nn.Dropout(config.seq_classif_dropout) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None): + distilbert_output = self.distilbert(input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = nn.ReLU()(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, dim) + + outputs = (logits,) + distilbert_output[1:] + if labels is not None: + if self.num_labels == 1: + loss_fct = nn.MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + + +@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class SearchDistilBert(DistilBertPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+
+    Examples::
+
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = SearchDistilBert.from_pretrained('distilbert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        scores = model(input_ids)  # tensor of shape (batch_size,): one relevance score per input
+
+    """
+    def __init__(self, config):
+        super(SearchDistilBert, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.distilbert = DistilBertModel(config)
+        self.pre_classifier = nn.Linear(config.dim, config.dim)
+        self.classifier = nn.Linear(config.dim, 1)
+        self.dropout = nn.Dropout(config.seq_classif_dropout)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
+        distilbert_output = self.distilbert(input_ids=input_ids,
+                                            attention_mask=attention_mask,
+                                            head_mask=head_mask)
+        hidden_state = distilbert_output[0]                  # (bs, seq_len, dim)
+        pooled_output = hidden_state[:, 0]                   # (bs, dim)
+        pooled_output = self.pre_classifier(pooled_output)   # (bs, dim)
+        pooled_output = nn.ReLU()(pooled_output)             # (bs, dim)
+        pooled_output = self.dropout(pooled_output)          # (bs, dim)
+        logits = self.classifier(pooled_output)              # (bs, 1)
+        return logits.view(-1)
+        # outputs = (logits,) + distilbert_output[1:]
+        # if labels is not None:
+        #     if self.num_labels == 1:
+        #         loss_fct = nn.MSELoss()
+        #         loss = loss_fct(logits.view(-1), labels.view(-1))
+        #     else:
+        #         loss_fct = nn.CrossEntropyLoss()
+        #         loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+        #     outputs = (loss,) + outputs
+        #
+        # return outputs  # (loss), logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
+class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
+    r"""
+    **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for position (index) of the start of the labelled span for computing the token classification loss.
+        Positions are clamped to the length of the sequence (`sequence_length`).
+        Positions outside of the sequence are not taken into account for computing the loss.
+    **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for position (index) of the end of the labelled span for computing the token classification loss.
+        Positions are clamped to the length of the sequence (`sequence_length`).
+        Positions outside of the sequence are not taken into account for computing the loss.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:3]
+
+    """
+    def __init__(self, config):
+        super(DistilBertForQuestionAnswering, self).__init__(config)
+
+        self.distilbert = DistilBertModel(config)
+        self.qa_outputs = nn.Linear(config.dim, config.num_labels)
+        assert config.num_labels == 2
+        self.dropout = nn.Dropout(config.qa_dropout)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None):
+        distilbert_output = self.distilbert(input_ids=input_ids,
+                                            attention_mask=attention_mask,
+                                            head_mask=head_mask)
+        hidden_states = distilbert_output[0]         # (bs, max_query_len, dim)
+
+        hidden_states = self.dropout(hidden_states)  # (bs, max_query_len, dim)
+        logits = self.qa_outputs(hidden_states)      # (bs, max_query_len, 2)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)      # (bs, max_query_len)
+        end_logits = end_logits.squeeze(-1)          # (bs, max_query_len)
+
+        outputs = (start_logits, end_logits,) + distilbert_output[1:]
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, splitting adds a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Sometimes the start/end positions are outside our model inputs; we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+
+            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            outputs = (total_loss,) + outputs
+
+        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
new file mode 100644
index 0000000..1752891
--- /dev/null
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -0,0 +1,633 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch OpenAI GPT-2 model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import json +import logging +import math +import os +import sys +from io import open + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_gpt2 import GPT2Config +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"} + +def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(gpt2_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array.squeeze()) + + for name, array in zip(names, arrays): + name = name[6:] # skip "model/" + name = name.split('/') + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+\d+', m_name): + l = re.split(r'(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'w' or l[0] == 'g': + pointer = getattr(pointer, 'weight') + elif l[0] == 'b': + pointer = getattr(pointer, 'bias') + elif l[0] == 'wpe' or l[0] == 'wte': + pointer = getattr(pointer, l[0]) + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + +class Attention(nn.Module): + def __init__(self, nx, n_ctx, config, scale=False): + super(Attention, self).__init__() + self.output_attentions = config.output_attentions + + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implem] + assert n_state % config.n_head == 0 + self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.n_head = 
config.n_head + self.split_size = n_state + self.scale = scale + + self.c_attn = Conv1D(n_state * 3, nx) + self.c_proj = Conv1D(n_state, nx) + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.n_head, self.split_size // self.n_head) + heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads + for head in heads: + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) + self.n_head = self.n_head - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _attn(self, q, k, v, head_mask=None): + w = torch.matmul(q, k) + if self.scale: + w = w / math.sqrt(v.size(-1)) + nd, ns = w.size(-2), w.size(-1) + b = self.bias[:, :, ns-nd:ns, :ns] + w = w * b - 1e4 * (1 - b) + + w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) + + # Mask heads if we want to + if head_mask is not None: + w = w * head_mask + + outputs = [torch.matmul(w, v)] + if self.output_attentions: + outputs.append(w) + return outputs + + def merge_heads(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) + return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states + + def split_heads(self, x, k=False): + new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) + x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states + if k: + return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length) + else: + return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def forward(self, x, layer_past=None, head_mask=None): + x = self.c_attn(x) + query, key, value = x.split(self.split_size, dim=2) + query = self.split_heads(query) + key = self.split_heads(key, k=True) + value = self.split_heads(value) + if layer_past is not None: + past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below + key = torch.cat((past_key, key), dim=-1) + value = torch.cat((past_value, value), dim=-2) + present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking + + attn_outputs = self._attn(query, key, value, head_mask) + a = attn_outputs[0] + + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a) + + outputs = [a, present] + attn_outputs[1:] + return outputs # a, present, (attentions) + + +class MLP(nn.Module): + def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) + super(MLP, self).__init__() + nx = config.n_embd + self.c_fc = Conv1D(n_state, nx) + self.c_proj = Conv1D(nx, n_state) + self.act = gelu + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, x): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + return self.dropout(h2) + + +class Block(nn.Module): + def __init__(self, n_ctx, config, scale=False): + super(Block, self).__init__() + nx = config.n_embd 
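+        # NOTE: GPT-2 blocks are "pre-norm": self.ln_1 / self.ln_2 are applied to the inputs of the
+        # attention and MLP sub-blocks in forward(), whereas the original GPT block
+        # (see modeling_openai.py later in this diff) normalizes after each residual addition.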
+ self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) + self.attn = Attention(nx, n_ctx, config, scale) + self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) + self.mlp = MLP(4 * nx, config) + + def forward(self, x, layer_past=None, head_mask=None): + output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask) + a = output_attn[0] # output_attn: a, present, (attentions) + + x = x + a + m = self.mlp(self.ln_2(x)) + x = x + m + + outputs = [x] + output_attn[1:] + return outputs # x, present, (attentions) + + +class GPT2PreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = GPT2Config + pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_gpt2 + base_model_prefix = "transformer" + + def __init__(self, *inputs, **kwargs): + super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in + `Language Models are Unsupervised Multitask Learners`_ + by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. + It's a causal (unidirectional) transformer pre-trained using language modeling on a very large + corpus of ~40 GB of text data. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Language Models are Unsupervised Multitask Learners`: + https://openai.com/blog/better-language-models/ + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +GPT2_INPUTS_DOCSTRING = r""" Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. 
+ **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **past**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + (see `past` output below). Can be used to speed up sequential decoding. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +class GPT2Model(GPT2PreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **past**: + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2Model.from_pretrained('gpt2') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(GPT2Model, self).__init__(config) + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + + self.wte = nn.Embedding(config.vocab_size, config.n_embd) + self.wpe = nn.Embedding(config.n_positions, config.n_embd) + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) + self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + self.wte = self._get_resized_embeddings(self.wte, new_num_tokens) + return self.wte + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None): + if past is None: + past_length = 0 + past = [None] * len(self.h) + else: + past_length = past[0][0].size(-2) + if position_ids is None: + position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.n_layer + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_ids.size(-1)) + position_ids = position_ids.view(-1, position_ids.size(-1)) + + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) + token_type_embeds = self.wte(token_type_ids) + else: + token_type_embeds = 0 + hidden_states = inputs_embeds + position_embeds + token_type_embeds + hidden_states = self.drop(hidden_states) + + output_shape = input_shape + (hidden_states.size(-1),) + + presents = () + all_attentions = [] + all_hidden_states = () + for i, (block, layer_past) in enumerate(zip(self.h, past)): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) + + outputs = block(hidden_states, layer_past, head_mask[i]) + hidden_states, present = outputs[:2] + presents = presents + (present,) + + if self.output_attentions: + all_attentions.append(outputs[2]) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(*output_shape) + # Add last hidden state + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states, presents) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + # let the number of heads free (-1) so we can extract attention even after head pruning + attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:] + all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions) + outputs = outputs + (all_attentions,) + return outputs # last hidden state, presents, (all hidden_states), (attentions) + + +@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +class GPT2LMHeadModel(GPT2PreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. 
you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **past**: + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import torch + from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel + + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2LMHeadModel.from_pretrained('gpt2') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=input_ids) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(GPT2LMHeadModel, self).__init__(config) + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.lm_head, + self.transformer.wte) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None): + transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + past=past, head_mask=head_mask) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + outputs = (lm_logits,) + transformer_outputs[1:] + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) + + +@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification +head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. 
+The language modeling head has its weights tied to the input embeddings, +the classification head takes as input the input of a specified classification token index in the input sequence). +""", GPT2_START_DOCSTRING) +class GPT2DoubleHeadsModel(GPT2PreTrainedModel): + r""" Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + The second dimension of the input (`num_choices`) indicates the number of choices to score. + Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: + Index of the classification token in each input sequence. + Selected in the range ``[0, input_ids.size(-1) - 1[``. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **past**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + (see `past` output below). Can be used to speed up sequential decoding. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. + **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Multiple choice classification loss. + **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+ **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` + Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax). + **past**: + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import torch + from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel + + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2DoubleHeadsModel.from_pretrained('gpt2') + + # Add a [CLS] to the vocabulary (we should train it also!) + tokenizer.add_special_tokens({'cls_token': '[CLS]'}) + model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size + print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary + + choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] + encoded_choices = [tokenizer.encode(s) for s in choices] + cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] + + input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 + mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 + + outputs = model(input_ids, mc_token_ids=mc_token_ids) + lm_prediction_scores, mc_prediction_scores = outputs[:2] + + """ + def __init__(self, config): + super(GPT2DoubleHeadsModel, self).__init__(config) + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + self.multiple_choice_head = SequenceSummary(config) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
+ """ + self._tie_or_clone_weights(self.lm_head, + self.transformer.wte) + + def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None, + position_ids=None, past=None, head_mask=None): + transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + past=past, head_mask=head_mask) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) + + outputs = (lm_logits, mc_logits) + transformer_outputs[1:] + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), + mc_labels.view(-1)) + outputs = (loss,) + outputs + if lm_labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions) diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py new file mode 100644 index 0000000..05268d2 --- /dev/null +++ b/pytorch_transformers/modeling_openai.py @@ -0,0 +1,606 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch OpenAI GPT model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import json +import logging +import math +import os +import sys +from io import open + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_openai import OpenAIGPTConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"} + + +def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): + """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here) + """ + import re + import numpy as np + + if '.ckpt' in openai_checkpoint_folder_path: + openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path) + + logger.info("Loading weights from {}".format(openai_checkpoint_folder_path)) + + names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8')) + shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8')) + offsets = np.cumsum([np.prod(shape) for shape in shapes]) + init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)] + init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] + init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] + + # This was used when we had a single embedding matrix for positions and tokens + # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0) + # del init_params[1] + init_params = [arr.squeeze() for arr in init_params] + + try: + assert model.tokens_embed.weight.shape == init_params[1].shape + assert model.positions_embed.weight.shape == init_params[0].shape + except AssertionError as e: + e.args += (model.tokens_embed.weight.shape, init_params[1].shape) + e.args += (model.positions_embed.weight.shape, init_params[0].shape) + raise + + model.tokens_embed.weight.data = torch.from_numpy(init_params[1]) + model.positions_embed.weight.data = torch.from_numpy(init_params[0]) + names.pop(0) + # Pop position and token embedding arrays + init_params.pop(0) + init_params.pop(0) + + for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): + name = name[6:] # skip "model/" + assert name[-2:] == ":0" + name = name[:-2] + name = name.split('/') + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+\d+', m_name): + l = re.split(r'(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'g': + pointer = getattr(pointer, 'weight') + elif l[0] == 'b': + pointer = getattr(pointer, 'bias') + elif l[0] == 'w': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 
3)))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu} + + +class Attention(nn.Module): + def __init__(self, nx, n_ctx, config, scale=False): + super(Attention, self).__init__() + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implem] + assert n_state % config.n_head == 0 + self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.n_head = config.n_head + self.split_size = n_state + self.scale = scale + + self.output_attentions = config.output_attentions + + self.c_attn = Conv1D(n_state * 3, nx) + self.c_proj = Conv1D(n_state, nx) + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.n_head, self.split_size // self.n_head) + heads = set(heads) - self.pruned_heads + for head in heads: + head -= sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + # Update hyper params + self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) + self.n_head = self.n_head - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _attn(self, q, k, v, head_mask=None): + w = torch.matmul(q, k) + if self.scale: + w = w / math.sqrt(v.size(-1)) + # w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights + # XD: self.b may be larger than w, so we need to crop it + b = self.bias[:, :, : w.size(-2), : w.size(-1)] + w = w * b + -1e9 * (1 - b) + + w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) + + # Mask heads if we want to + if head_mask is not None: + w = w * head_mask + + outputs = [torch.matmul(w, v)] + if self.output_attentions: + outputs.append(w) + return outputs + + def merge_heads(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) + return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states + + def split_heads(self, x, k=False): + new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) + x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states + if k: + return x.permute(0, 2, 3, 1) + else: + return x.permute(0, 2, 1, 3) + + def forward(self, x, head_mask=None): + x = self.c_attn(x) + query, key, value = x.split(self.split_size, dim=2) + query = self.split_heads(query) + key = self.split_heads(key, k=True) + value = self.split_heads(value) + + attn_outputs = self._attn(query, key, value, head_mask) + a = attn_outputs[0] + + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a) + + outputs = [a] + attn_outputs[1:] + return outputs # a, (attentions) + + +class MLP(nn.Module): + def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) + super(MLP, self).__init__() + nx = config.n_embd + self.c_fc = Conv1D(n_state, nx) + self.c_proj = Conv1D(nx, n_state) + self.act = ACT_FNS[config.afn] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, x): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + return 
self.dropout(h2) + + +class Block(nn.Module): + def __init__(self, n_ctx, config, scale=False): + super(Block, self).__init__() + nx = config.n_embd + self.attn = Attention(nx, n_ctx, config, scale) + self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) + self.mlp = MLP(4 * nx, config) + self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) + + def forward(self, x, head_mask=None): + attn_outputs = self.attn(x, head_mask=head_mask) + a = attn_outputs[0] + + n = self.ln_1(x + a) + m = self.mlp(n) + h = self.ln_2(n + m) + + outputs = [h] + attn_outputs[1:] + return outputs + + +class OpenAIGPTPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = OpenAIGPTConfig + pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_openai_gpt + base_model_prefix = "transformer" + + def _init_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in + `Improving Language Understanding by Generative Pre-Training`_ + by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. + It's a causal (unidirectional) transformer pre-trained using language modeling on a large + corpus will long range dependencies, the Toronto Book Corpus. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Improving Language Understanding by Generative Pre-Training`: + https://openai.com/blog/language-unsupervised/ + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. 
+ **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices) + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", + OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +class OpenAIGPTModel(OpenAIGPTPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') + model = OpenAIGPTModel.from_pretrained('openai-gpt') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(OpenAIGPTModel, self).__init__(config) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd) + self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens) + return self.tokens_embed + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None): + if position_ids is None: + # This was used when we had a single embedding matrice from position and token embeddings + # start = self.config.vocab_size + self.config.n_special + # end = start + input_ids.size(-1) + # position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device) + position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.n_layer + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_ids.size(-1)) + position_ids = position_ids.view(-1, position_ids.size(-1)) + + inputs_embeds = self.tokens_embed(input_ids) + position_embeds = self.positions_embed(position_ids) + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) + token_type_embeds = self.tokens_embed(token_type_ids) + else: + token_type_embeds = 0 + hidden_states = inputs_embeds + position_embeds + token_type_embeds + hidden_states = self.drop(hidden_states) + + output_shape = input_shape + (hidden_states.size(-1),) + + all_attentions = () + all_hidden_states = () + for i, block in enumerate(self.h): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) + + outputs = block(hidden_states, head_mask[i]) + hidden_states = outputs[0] + if self.output_attentions: + all_attentions = all_attentions + (outputs[1],) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) + + outputs = (hidden_states.view(*output_shape),) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last hidden state, (all hidden states), (all attentions) + + +@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. 
you can set ``labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') + model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=input_ids) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(OpenAIGPTLMHeadModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.lm_head, + self.transformer.tokens_embed) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None): + transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + head_mask=head_mask) + hidden_states = transformer_outputs[0] + lm_logits = self.lm_head(hidden_states) + + outputs = (lm_logits,) + transformer_outputs[1:] + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), lm_logits, (all hidden states), (all attentions) + + +@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification +head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. +The language modeling head has its weights tied to the input embeddings, +the classification head takes as input the input of a specified classification token index in the input sequence). 
+""", OPENAI_GPT_START_DOCSTRING) +class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): + r""" Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + The second dimension of the input (`num_choices`) indicates the number of choices to score. + Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: + Index of the classification token in each input sequence. + Selected in the range ``[0, input_ids.size(-1) - 1[``. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. + **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Multiple choice classification loss. + **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` + Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') + model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') + tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) + choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] + input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices + mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, mc_token_ids) + lm_prediction_scores, mc_prediction_scores = outputs[:2] + + """ + def __init__(self, config): + super(OpenAIGPTDoubleHeadsModel, self).__init__(config) + + self.transformer = OpenAIGPTModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + self.multiple_choice_head = SequenceSummary(config) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.lm_head, + self.transformer.tokens_embed) + + def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None, + position_ids=None, head_mask=None): + transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + head_mask=head_mask) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) + + outputs = (lm_logits, mc_logits) + transformer_outputs[1:] + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), + mc_labels.view(-1)) + outputs = (loss,) + outputs + if lm_labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions) diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py new file mode 100644 index 0000000..cf1f862 --- /dev/null +++ b/pytorch_transformers/modeling_roberta.py @@ -0,0 +1,340 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch RoBERTa model. """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss, MSELoss + +from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu +from .configuration_roberta import RobertaConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", +} + +class RobertaEmbeddings(BertEmbeddings): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + def __init__(self, config): + super(RobertaEmbeddings, self).__init__(config) + self.padding_idx = 1 + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + # Position numbers begin at padding_idx+1. Padding symbols are ignored. + # cf. fairseq's `utils.make_positions` + position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids) + + +ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in + `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ + by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, + Veselin Stoyanov. It is based on Google's BERT model released in 2018. + + It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining + objective and training with much larger mini-batches and learning rates. + + This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained + models. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`: + https://arxiv.org/abs/1907.11692 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. 
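+
+            For example (an illustrative sketch, using :class:`RobertaModel`; any archive name above can
+            stand in for ``'roberta-base'``)::
+
+                config = RobertaConfig.from_pretrained('roberta-base')
+                model = RobertaModel(config)                           # configuration only, weights randomly initialised
+                model = RobertaModel.from_pretrained('roberta-base')   # configuration and pretrained weights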
+""" + +ROBERTA_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, RoBERTa input sequence should be formatted with and tokens as follows: + + (a) For sequence pairs: + + ``tokens: Is this Jacksonville ? No it is not . `` + + (b) For single sequences: + + ``tokens: the dog is hairy . `` + + Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with + the ``add_special_tokens`` parameter set to ``True``. + + RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1[``. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class RobertaModel(BertModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = RobertaModel.from_pretrained('roberta-base') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + config_class = RobertaConfig + pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "roberta" + + def __init__(self, config): + super(RobertaModel, self).__init__(config) + + self.embeddings = RobertaEmbeddings(config) + self.init_weights() + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None): + if input_ids[:, 0].sum().item() != 0: + logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. " + "This model requires special tokens in order to work. " + "Please specify add_special_tokens=True in your encoding.") + return super(RobertaModel, self).forward(input_ids, token_type_ids, attention_mask, position_ids, head_mask) + + +@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class RobertaForMaskedLM(BertPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = RobertaForMaskedLM.from_pretrained('roberta-base') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + loss, prediction_scores = outputs[:2] + + """ + config_class = RobertaConfig + pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "roberta" + + def __init__(self, config): + super(RobertaForMaskedLM, self).__init__(config) + + self.roberta = RobertaModel(config) + self.lm_head = RobertaLMHead(config) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, position_ids=None, + head_mask=None): + outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + outputs = (masked_lm_loss,) + outputs + + return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + + +class RobertaLMHead(nn.Module): + """Roberta Head for masked language modeling.""" + + def __init__(self, config): + super(RobertaLMHead, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + self.bias + + return x + + +@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer + on top of the pooled output) e.g. for GLUE tasks. """, + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class RobertaForSequenceClassification(BertPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). 
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaForSequenceClassification.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.roberta = RobertaModel(config)
+        self.classifier = RobertaClassificationHead(config)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
+                position_ids=None, head_mask=None):
+        outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                               attention_mask=attention_mask, head_mask=head_mask)
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]
+        if labels is not None:
+            if self.num_labels == 1:
+                # We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
+
+
+class RobertaClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super(RobertaClassificationHead, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
new file mode 100644
index 0000000..9b8a98a
--- /dev/null
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -0,0 +1,1232 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Transformer XL model. + Adapted from https://github.com/kimiyoung/transformer-xl. + In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import json +import math +import logging +import collections +import sys +from io import open + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_transfo_xl import TransfoXLConfig +from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", +} + +def build_tf_to_pytorch_map(model, config): + """ A map of modules from TF to PyTorch. + This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. + """ + tf_to_pt_map = {} + + if hasattr(model, 'transformer'): + # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax + tf_to_pt_map.update({ + "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, + "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias}) + for i, (out_l, proj_l, tie_proj) in enumerate(zip( + model.crit.out_layers, + model.crit.out_projs, + config.tie_projs)): + layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i + if config.tie_weight: + tf_to_pt_map.update({ + layer_str + 'b': out_l.bias}) + else: + raise NotImplementedError + # I don't think this is implemented in the TF code + tf_to_pt_map.update({ + layer_str + 'lookup_table': out_l.weight, + layer_str + 'b': out_l.bias}) + if not tie_proj: + tf_to_pt_map.update({ + layer_str + 'proj': proj_l + }) + # Now load the rest of the transformer + model = model.transformer + + # Embeddings + for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)): + layer_str = "transformer/adaptive_embed/cutoff_%d/" % i + tf_to_pt_map.update({ + layer_str + 'lookup_table': embed_l.weight, + layer_str + 'proj_W': proj_l + }) + + # Transformer blocks + for i, b in enumerate(model.layers): + layer_str = "transformer/layer_%d/" % i + tf_to_pt_map.update({ + layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, + layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, + layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, + layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, + layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, 
+ layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, + layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, + }) + + # Relative positioning biases + if config.untie_r: + r_r_list = [] + r_w_list = [] + for b in model.layers: + r_r_list.append(b.dec_attn.r_r_bias) + r_w_list.append(b.dec_attn.r_w_bias) + else: + r_r_list = [model.r_r_bias] + r_w_list = [model.r_w_bias] + tf_to_pt_map.update({ + 'transformer/r_r_bias': r_r_list, + 'transformer/r_w_bias': r_w_list}) + return tf_to_pt_map + +def load_tf_weights_in_transfo_xl(model, config, tf_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + # Build TF to PyTorch weights loading map + tf_to_pt_map = build_tf_to_pytorch_map(model, config) + + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + tf_weights = {} + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + tf_weights[name] = array + + for name, pointer in tf_to_pt_map.items(): + assert name in tf_weights + array = tf_weights[name] + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if 'kernel' in name or 'proj' in name: + array = np.transpose(array) + if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1: + # Here we will split the TF weigths + assert len(pointer) == array.shape[0] + for i, p_i in enumerate(pointer): + arr_i = array[i, ...] 
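+                # Note (illustrative): `pointer` here is a list of per-layer relative-position bias
+                # parameters (e.g. [b.dec_attn.r_r_bias for b in model.layers] when config.untie_r is set),
+                # while the TF checkpoint stores them stacked in a single array, so row i of `array`
+                # is copied into the i-th layer's parameter below.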
+ try: + assert p_i.shape == arr_i.shape + except AssertionError as e: + e.args += (p_i.shape, arr_i.shape) + raise + logger.info("Initialize PyTorch weight {} for layer {}".format(name, i)) + p_i.data = torch.from_numpy(arr_i) + else: + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + tf_weights.pop(name, None) + tf_weights.pop(name + '/Adam', None) + tf_weights.pop(name + '/Adam_1', None) + + logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + return model + + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:,None,:].expand(-1, bsz, -1) + else: + return pos_emb[:,None,:] + + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.CoreNet(inp) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + + + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False): + super(MultiHeadAttn, self).__init__() + + self.output_attentions = output_attentions + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.q_net = nn.Linear(d_model, n_head * d_head, bias=False) + self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is None or r_w_bias is None: # Biases are not shared + self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + else: + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + + def forward(self, h, attn_mask=None, mems=None, head_mask=None): + ##### multihead attention + # [hlen x bsz x n_head x d_head] + + if mems is not None: + c = torch.cat([mems, h], 0) + else: + c = h + + if self.pre_lnorm: + ##### layer normalization + c = self.layer_norm(c) + + head_q = self.q_net(h) + head_k, head_v = torch.chunk(self.kv_net(c), 2, -1) + + head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head) + head_k = 
head_k.view(c.size(0), c.size(1), self.n_head, self.d_head) + head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) + + # [qlen x klen x bsz x n_head] + attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) + attn_score.mul_(self.scale) + if attn_mask is not None and torch.sum(attn_mask).item(): + attn_mask = (attn_mask == 1) # Switch to bool + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * head_mask + + # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + outputs = [h + attn_out] + else: + ##### residual connection + layer normalization + outputs = [self.layer_norm(h + attn_out)] + + if self.output_attentions: + outputs.append(attn_prob) + + return outputs + +class RelMultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, + r_r_bias=None, r_w_bias=None, output_attentions=False): + super(RelMultiHeadAttn, self).__init__() + + self.output_attentions = output_attentions + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is None or r_w_bias is None: # Biases are not shared + self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + else: + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + + def _parallelogram_mask(self, h, w, left=False): + mask = torch.ones((h, w)).byte() + m = min(h, w) + mask[:m,:m] = torch.triu(mask[:m,:m]) + mask[-m:,-m:] = torch.tril(mask[-m:,-m:]) + + if left: + return mask + else: + return mask.flip(0) + + def _shift(self, x, qlen, klen, mask, left=False): + if qlen > 1: + zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)), + device=x.device, dtype=x.dtype) + else: + zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype) + + if left: + mask = mask.flip(1) + x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1) + else: + x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1) + + x = x_padded.masked_select(mask[:,:,None,None]) \ + .view(qlen, klen, x.size(2), x.size(3)) + + return x + + def _rel_shift(self, x, zero_triu=False): + zero_pad_shape = (x.size(0), 1) + x.size()[2:] + zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:] + x_padded = x_padded.view(*x_padded_shape) + + x = x_padded[1:].view_as(x) + + if zero_triu: + 
ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None] + + return x + + def forward(self, w, r, attn_mask=None, mems=None): + raise NotImplementedError + +class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) + + def forward(self, w, r, attn_mask=None, mems=None, head_mask=None): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + + #### compute attention score + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + self.r_r_bias + BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and torch.sum(attn_mask).item(): + attn_mask = (attn_mask == 1) # Switch to bool + if attn_mask.dim() == 2: + attn_score = attn_score.float().masked_fill( + attn_mask[None,:,:,None], -1e30).type_as(attn_score) + elif attn_mask.dim() == 3: + attn_score = attn_score.float().masked_fill( + attn_mask[:,:,:,None], -1e30).type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * head_mask + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + outputs = [w + attn_out] + else: + ##### residual connection + layer normalization + outputs = [self.layer_norm(w + attn_out)] + + if self.output_attentions: + outputs.append(attn_prob) + + return outputs + +class RelLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None, head_mask=None): + # r_emb: [klen, n_head, d_head], used for term B + # r_w_bias: [n_head, d_head], used for term C + # r_bias: [klen, n_head], used for term D + + qlen, bsz = w.size(0), 
w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) + + if klen > r_emb.size(0): + r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1) + r_emb = torch.cat([r_emb_pad, r_emb], 0) + r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1) + r_bias = torch.cat([r_bias_pad, r_bias], 0) + else: + r_emb = r_emb[-klen:] + r_bias = r_bias[-klen:] + + #### compute attention score + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head + D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head + BD = self._rel_shift(B_ + D_) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and torch.sum(attn_mask).item(): + attn_mask = (attn_mask == 1) # Switch to bool + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + if head_mask is not None: + attn_prob = attn_prob * head_mask + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + outputs = [w + attn_out] + else: + ##### residual connection + layer normalization + outputs = [self.layer_norm(w + attn_out)] + + if self.output_attentions: + outputs.append(attn_prob) + + return outputs + + + +class DecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): + super(DecoderLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, dec_attn_mask=None, mems=None, head_mask=None): + + attn_outputs = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, + mems=mems, head_mask=head_mask) + ff_output = self.pos_ff(attn_outputs[0]) + + outputs = [ff_output] + attn_outputs[1:] + + return outputs + +class RelLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, + **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r_emb, r_w_bias, 
r_bias, dec_attn_mask=None, mems=None, head_mask=None): + + attn_outputs = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias, + attn_mask=dec_attn_mask, + mems=mems, head_mask=head_mask) + ff_output = self.pos_ff(attn_outputs[0]) + + outputs = [ff_output] + attn_outputs[1:] + + return outputs + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelPartialLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None): + + attn_outputs = self.dec_attn(dec_inp, r, + attn_mask=dec_attn_mask, + mems=mems, head_mask=head_mask) + ff_output = self.pos_ff(attn_outputs[0]) + + outputs = [ff_output] + attn_outputs[1:] + + return outputs + + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + sample_softmax=False): + super(AdaptiveEmbedding, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + self.emb_layers.append( + nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) + ) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) + + def forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = F.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], + dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = F.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed_shape = inp.size() + (self.d_proj,) + embed = emb_flat.view(embed_shape) + + embed.mul_(self.emb_scale) + + return embed + + +class TransfoXLPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = TransfoXLConfig + pretrained_model_archive_map = TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_transfo_xl + base_model_prefix = "transformer" + + def _init_weight(self, weight): + if self.config.init == 'uniform': + nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) + elif self.config.init == 'normal': + nn.init.normal_(weight, 0.0, self.config.init_std) + + def _init_bias(self, bias): + nn.init.constant_(bias, 0.0) + + def _init_weights(self, m): + """ Initialize the weights. 
+ """ + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + self._init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + self._init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + self._init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + self._init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + self._init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, self.config.init_std) + if hasattr(m, 'bias') and m.bias is not None: + self._init_bias(m.bias) + else: + if hasattr(m, 'r_emb'): + self._init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + self._init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + self._init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + self._init_bias(m.r_bias) + + def set_num_special_tokens(self, num_special_tokens): + pass + + +TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in + `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_ + by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. + It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse + previously computed hidden-states to attend to longer context (memory). + This model also uses adaptive softmax inputs and outputs (tied). + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`: + https://arxiv.org/abs/1901.02860 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +TRANSFO_XL_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + Transformer-XL is a model with relative position embeddings so you can either pad the inputs on + the right or on the left. + Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **mems**: (`optional`) + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + (see `mems` output below). 
Can be used to speed up sequential decoding and attend to longer context.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""

+@add_start_docstrings("The bare Transformer-XL Model transformer outputting raw hidden-states without any specific head on top.",
+                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
+class TransfoXLModel(TransfoXLPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+ + Examples:: + + tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') + model = TransfoXLModel.from_pretrained('transfo-xl-wt103') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states, mems = outputs[:2] + + """ + def __init__(self, config): + super(TransfoXLModel, self).__init__(config) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + self.n_token = config.n_token + + self.d_embed = config.d_embed + self.d_model = config.d_model + self.n_head = config.n_head + self.d_head = config.d_head + + self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, + div_val=config.div_val) + + self.drop = nn.Dropout(config.dropout) + + self.n_layer = config.n_layer + + self.tgt_len = config.tgt_len + self.mem_len = config.mem_len + self.ext_len = config.ext_len + self.max_klen = config.tgt_len + config.ext_len + config.mem_len + + self.attn_type = config.attn_type + + if not config.untie_r: + self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + + self.layers = nn.ModuleList() + if config.attn_type == 0: # the default attention + for i in range(config.n_layer): + self.layers.append( + RelPartialLearnableDecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias, + output_attentions=self.output_attentions) + ) + elif config.attn_type == 1: # learnable embeddings + for i in range(config.n_layer): + self.layers.append( + RelLearnableDecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias, + output_attentions=self.output_attentions) + ) + elif config.attn_type in [2, 3]: # absolute embeddings + for i in range(config.n_layer): + self.layers.append( + DecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias, + output_attentions=self.output_attentions) + ) + + self.same_length = config.same_length + self.clamp_len = config.clamp_len + + if self.attn_type == 0: # default attention + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 1: # learnable + self.r_emb = nn.Parameter(torch.FloatTensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.r_bias = nn.Parameter(torch.FloatTensor( + self.n_layer, self.max_klen, self.n_head)) + elif self.attn_type == 2: # absolute standard + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 3: # absolute deeper SA + self.r_emb = nn.Parameter(torch.FloatTensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + return self.word_emb + + def backward_compatible(self): + self.sample_softmax = -1 + + 
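+    # Illustrative only (`model` and `segments` are assumed names, not part of this module):
+    # the `mems` returned by one forward pass can be fed to the next call so that each new
+    # segment attends to up to `mem_len` cached hidden states from the previous segments, e.g.
+    #
+    #     mems = None
+    #     for segment in segments:                      # each segment: LongTensor of shape [bsz, seq_len]
+    #         last_hidden, mems = model(segment, mems=mems)[:2]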
def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def _prune_heads(self, heads): + logger.info("Head pruning is not implemented for Transformer-XL model") + pass + + def init_mems(self, data): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer): + empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model, + dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. + with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + def _forward(self, dec_inp, mems=None, head_mask=None): + qlen, bsz = dec_inp.size() + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) + # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) + head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.n_layer + + word_emb = self.word_emb(dec_inp) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = (torch.triu(all_ones, 1+mlen) + + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 + else: + dec_attn_mask = torch.triu( + word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] + + hids = [] + attentions = [] + if self.attn_type == 0: # default + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + layer_outputs = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, + mems=mems_i, head_mask=head_mask[i]) + core_out = layer_outputs[0] + if self.output_attentions: + attentions.append(layer_outputs[1]) + elif self.attn_type == 1: # learnable + core_out = self.drop(word_emb) + for i, layer in enumerate(self.layers): + hids.append(core_out) + if self.clamp_len > 0: + r_emb = 
self.r_emb[i][-self.clamp_len :] + r_bias = self.r_bias[i][-self.clamp_len :] + else: + r_emb, r_bias = self.r_emb[i], self.r_bias[i] + + mems_i = None if mems is None else mems[i] + layer_outputs = layer(core_out, r_emb, self.r_w_bias[i], + r_bias, dec_attn_mask=dec_attn_mask, + mems=mems_i, head_mask=head_mask[i]) + core_out = layer_outputs[0] + if self.output_attentions: + attentions.append(layer_outputs[1]) + elif self.attn_type == 2: # absolute + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb + pos_emb[-qlen:]) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + if mems_i is not None and i == 0: + mems_i += pos_emb[:mlen] + layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i, head_mask=head_mask[i]) + core_out = layer_outputs[0] + if self.output_attentions: + attentions.append(layer_outputs[1]) + elif self.attn_type == 3: + core_out = self.drop(word_emb) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + if mems_i is not None and mlen > 0: + cur_emb = self.r_emb[i][:-qlen] + cur_size = cur_emb.size(0) + if cur_size < mlen: + cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1) + cur_emb = torch.cat([cur_emb_pad, cur_emb], 0) + else: + cur_emb = cur_emb[-mlen:] + mems_i += cur_emb.view(mlen, 1, -1) + core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1) + + layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i, head_mask=head_mask[i]) + core_out = layer_outputs[0] + if self.output_attentions: + attentions.append(layer_outputs[1]) + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + # We transpose back here to shape [bsz, len, hidden_dim] + outputs = [core_out.transpose(0, 1).contiguous(), new_mems] + if self.output_hidden_states: + # Add last layer and transpose to library standard shape [bsz, len, hidden_dim] + hids.append(core_out) + hids = list(t.transpose(0, 1).contiguous() for t in hids) + outputs.append(hids) + if self.output_attentions: + # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] + attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions) + outputs.append(attentions) + return outputs # last hidden state, new_mems, (all hidden states), (all attentions) + + def forward(self, input_ids, mems=None, head_mask=None): + # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library + # so we transpose here from shape [bsz, len] to shape [len, bsz] + input_ids = input_ids.transpose(0, 1).contiguous() + + if mems is None: + mems = self.init_mems(input_ids) + outputs = self._forward(input_ids, mems=mems, head_mask=head_mask) + + return outputs # last hidden state, new_mems, (all hidden states), (all attentions) + + +@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top + (adaptive softmax with weights tied to the adaptive input embeddings)""", + TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) +class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): + r""" + **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. 
you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + We don't output them when the loss is computed to speedup adaptive softmax decoding. + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') + model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + prediction_scores, mems = outputs[:2] + + """ + def __init__(self, config): + super(TransfoXLLMHeadModel, self).__init__(config) + self.transformer = TransfoXLModel(config) + self.sample_softmax = config.sample_softmax + # use sampled softmax + if config.sample_softmax > 0: + self.out_layer = nn.Linear(config.d_model, config.n_token) + self.sampler = LogUniformSampler(config.n_token, config.sample_softmax) + # use adaptive softmax (including standard softmax) + else: + self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, + config.cutoffs, div_val=config.div_val) + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ + Run this to be sure output and input (adaptive) softmax weights are tied + """ + # sampled softmax + if self.sample_softmax > 0: + if self.config.tie_weight: + self.out_layer.weight = self.transformer.word_emb.weight + # adaptive softmax (including standard softmax) + else: + if self.config.tie_weight: + for i in range(len(self.crit.out_layers)): + self._tie_or_clone_weights(self.crit.out_layers[i], + self.transformer.word_emb.emb_layers[i]) + if self.config.tie_projs: + for i, tie_proj in enumerate(self.config.tie_projs): + if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: + if self.config.torchscript: + self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone()) + else: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] + elif tie_proj and self.config.div_val != 1: + 
if self.config.torchscript: + self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone()) + else: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] + + def reset_length(self, tgt_len, ext_len, mem_len): + self.transformer.reset_length(tgt_len, ext_len, mem_len) + + def init_mems(self, data): + return self.transformer.init_mems(data) + + def forward(self, input_ids, labels=None, mems=None, head_mask=None): + bsz = input_ids.size(0) + tgt_len = input_ids.size(1) + + transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask) + + last_hidden = transformer_outputs[0] + pred_hid = last_hidden[:, -tgt_len:] + outputs = transformer_outputs[1:] + if self.sample_softmax > 0 and self.training: + assert self.config.tie_weight + logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, labels, pred_hid, self.sampler) + softmax_output = -F.log_softmax(logit, -1)[:, :, 0] + outputs = [softmax_output] + outputs + if labels is not None: + # TODO: This is not implemented + raise NotImplementedError + else: + softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), labels) + if labels is None: + softmax_output = softmax_output.view(bsz, tgt_len, -1) + outputs = [softmax_output] + outputs + else: + softmax_output = softmax_output.view(bsz, tgt_len) + outputs = [softmax_output, None] + outputs + + return outputs # (loss), logits or None if labels is not None (speed up adaptive softmax), new_mems, (all hidden states), (all attentions) diff --git a/pytorch_transformers/modeling_transfo_xl_utilities.py b/pytorch_transformers/modeling_transfo_xl_utilities.py new file mode 100644 index 0000000..0773d0d --- /dev/null +++ b/pytorch_transformers/modeling_transfo_xl_utilities.py @@ -0,0 +1,332 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Utilities for PyTorch Transformer XL model. + Directly adapted from https://github.com/kimiyoung/transformer-xl. 
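+
+    A minimal usage sketch for the adaptive softmax defined below, using toy
+    sizes (vocabulary, cutoffs, dimensions and batch size are illustrative
+    assumptions, not values taken from any pretrained configuration)::
+
+        crit = ProjectedAdaptiveLogSoftmax(n_token=20000, d_embed=128, d_proj=128,
+                                           cutoffs=[2000, 10000], div_val=1)
+        hidden = torch.rand(16, 128)              # [len*bsz x d_proj]
+        labels = torch.randint(0, 20000, (16,))   # [len*bsz]
+        nll = crit(hidden, labels)                # negative log likelihood, shape [16]
+        log_probs = crit(hidden)                  # log probabilities, shape [16, 20000]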
+""" + +from collections import defaultdict + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) +# CUDA_MINOR = int(torch.version.cuda.split('.')[1]) + +class ProjectedAdaptiveLogSoftmax(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + keep_order=False): + super(ProjectedAdaptiveLogSoftmax, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [n_token] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + if self.n_clusters > 0: + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.out_layers = nn.ModuleList() + self.out_projs = nn.ParameterList() + + if div_val == 1: + for i in range(len(self.cutoffs)): + if d_proj != d_embed: + self.out_projs.append( + nn.Parameter(torch.FloatTensor(d_proj, d_embed)) + ) + else: + self.out_projs.append(None) + + self.out_layers.append(nn.Linear(d_embed, n_token)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + + self.out_projs.append( + nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)) + ) + + self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) + + self.keep_order = keep_order + + def _compute_logit(self, hidden, weight, bias, proj): + if proj is None: + logit = F.linear(hidden, weight, bias=bias) + else: + # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: + proj_hid = F.linear(hidden, proj.t().contiguous()) + logit = F.linear(proj_hid, weight, bias=bias) + # else: + # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) + # if bias is not None: + # logit = logit + bias + + return logit + + def forward(self, hidden, labels=None, keep_order=False): + ''' + Params: + hidden :: [len*bsz x d_proj] + labels :: [len*bsz] + Return: + if labels is None: + out :: [len*bsz] Negative log likelihood + else: + out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary + We could replace this implementation by the native PyTorch one + if their's had an option to set bias on all clusters in the native one. 
+ here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 + ''' + + if labels is not None: + labels = labels.view(-1) + if hidden.size(0) != labels.size(0): + raise RuntimeError('Input and labels should have the same size ' + 'in the batch dimension.') + + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, + self.out_layers[0].bias, self.out_projs[0]) + if labels is not None: + out = -F.log_softmax(logit, dim=-1) \ + .gather(1, labels.unsqueeze(1)).squeeze(1) + else: + out = F.log_softmax(logit, dim=-1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat( + [weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat( + [bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + head_logprob = F.log_softmax(head_logit, dim=1) + + if labels is None: + out = hidden.new_empty((head_logit.size(0), self.n_token)) + else: + out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] + + if labels is not None: + mask_i = (labels >= l_idx) & (labels < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = labels.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + hidden_i = hidden.index_select(0, indices_i) + else: + hidden_i = hidden + + if i == 0: + if labels is not None: + logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) + else: + out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster + if labels is not None: + logprob_i = head_logprob_i[:, cluster_prob_idx] \ + + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1) + else: + logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i + out[:, l_idx:r_idx] = logprob_i + + if labels is not None: + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + out.index_copy_(0, indices_i, -logprob_i) + else: + out[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + offset += logprob_i.size(0) + + return out + + + def log_prob(self, hidden): + r""" Computes log probabilities for all :math:`n\_classes` + From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py + Args: + hidden (Tensor): a minibatch of examples + Returns: + log-probabilities of for each class :math:`c` + in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a + parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor. 
+ Shape: + - Input: :math:`(N, in\_features)` + - Output: :math:`(N, n\_classes)` + """ + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, + self.out_layers[0].bias, self.out_projs[0]) + return F.log_softmax(logit, dim=-1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat( + [weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat( + [bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + + out = hidden.new_empty((head_logit.size(0), self.n_token)) + head_logprob = F.log_softmax(head_logit, dim=1) + + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1] + + if i == 0: + out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob[:, -i] + tail_logprob_i + out[:, start_idx, stop_idx] = logprob_i + + return out + + +class LogUniformSampler(object): + def __init__(self, range_max, n_sample): + """ + Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py + `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` + + expected count can be approximated by 1 - (1 - p)^n + and we use a numerically stable version -expm1(num_tries * log1p(-p)) + + Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run + """ + with torch.no_grad(): + self.range_max = range_max + log_indices = torch.arange(1., range_max+2., 1.).log_() + self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + + self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() + + self.n_sample = n_sample + + def sample(self, labels): + """ + labels: [b1, b2] + Return + true_log_probs: [b1, b2] + samp_log_probs: [n_sample] + neg_samples: [n_sample] + """ + + # neg_samples = torch.empty(0).long() + n_sample = self.n_sample + n_tries = 2 * n_sample + + with torch.no_grad(): + neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique() + device = labels.device + neg_samples = neg_samples.to(device) + true_log_probs = self.log_q[labels].to(device) + samp_log_probs = self.log_q[neg_samples].to(device) + return true_log_probs, samp_log_probs, neg_samples + +def sample_logits(embedding, bias, labels, inputs, sampler): + """ + embedding: an nn.Embedding layer + bias: [n_vocab] + labels: [b1, b2] + inputs: [b1, b2, n_emb] + sampler: you may use a LogUniformSampler + Return + logits: [b1, b2, 1 + n_sample] + """ + true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) + n_sample = neg_samples.size(0) + b1, b2 = labels.size(0), labels.size(1) + all_ids = torch.cat([labels.view(-1), neg_samples]) + all_w = embedding(all_ids) + true_w = all_w[: -n_sample].view(b1, b2, 
-1) + sample_w = all_w[- n_sample:].view(n_sample, -1) + + all_b = bias[all_ids] + true_b = all_b[: -n_sample].view(b1, b2) + sample_b = all_b[- n_sample:] + + hit = (labels[:, :, None] == neg_samples).detach() + + true_logits = torch.einsum('ijk,ijk->ij', + [true_w, inputs]) + true_b - true_log_probs + sample_logits = torch.einsum('lk,ijk->ijl', + [sample_w, inputs]) + sample_b - samp_log_probs + sample_logits.masked_fill_(hit, -1e30) + logits = torch.cat([true_logits[:, :, None], sample_logits], -1) + + return logits diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py new file mode 100644 index 0000000..2fb4671 --- /dev/null +++ b/pytorch_transformers/modeling_utils.py @@ -0,0 +1,772 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import copy +import json +import logging +import os +from io import open + +import six +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from torch.nn import functional as F + +from .configuration_utils import PretrainedConfig +from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME + +logger = logging.getLogger(__name__) + + +try: + from torch.nn import Identity +except ImportError: + # Older PyTorch compatibility + class Identity(nn.Module): + r"""A placeholder identity operator that is argument-insensitive. + """ + def __init__(self, *args, **kwargs): + super(Identity, self).__init__() + + def forward(self, input): + return input + +class PreTrainedModel(nn.Module): + r""" Base class for all models. + + :class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models + as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. + + Class attributes (overridden by derived classes): + - ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture. + - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. + - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: + + - ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`, + - ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`, + - ``path``: a path (string) to the TensorFlow checkpoint. 
+ + - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. + """ + config_class = None + pretrained_model_archive_map = {} + load_tf_weights = lambda model, config, path: None + base_model_prefix = "" + + def __init__(self, config, *inputs, **kwargs): + super(PreTrainedModel, self).__init__() + if not isinstance(config, PretrainedConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + # Save config in model + self.config = config + + def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): + """ Build a resized Embedding Module from a provided token Embedding Module. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + + Args: + new_num_tokens: (`optional`) int + New number of tokens in the embedding matrix. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + If not provided or None: return the provided token Embedding Module. + Return: ``torch.nn.Embeddings`` + Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None + """ + if new_num_tokens is None: + return old_embeddings + + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + if old_num_tokens == new_num_tokens: + return old_embeddings + + # Build new embeddings + new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) + new_embeddings.to(old_embeddings.weight.device) + + # initialize all new embeddings (in particular added tokens) + self._init_weights(new_embeddings) + + # Copy word embeddings from the previous weights + num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] + + return new_embeddings + + def _tie_or_clone_weights(self, first_module, second_module): + """ Tie or clone module weights depending of weither we are using TorchScript or not + """ + if self.config.torchscript: + first_module.weight = nn.Parameter(second_module.weight.clone()) + else: + first_module.weight = second_module.weight + + if hasattr(first_module, 'bias') and first_module.bias is not None: + first_module.bias.data = torch.nn.functional.pad( + first_module.bias.data, + (0, first_module.weight.shape[0] - first_module.bias.shape[0]), + 'constant', + 0 + ) + + def resize_token_embeddings(self, new_num_tokens=None): + """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. + Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. + + Arguments: + + new_num_tokens: (`optional`) int: + New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. + If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model. 
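+
+            For instance (sizes illustrative; assumes the configuration exposes ``vocab_size``),
+            growing the vocabulary by ten entries could look like::
+
+                embeddings = model.resize_token_embeddings(model.config.vocab_size + 10)
+                # embeddings.weight gains ten randomly initialized rows;
+                # config.vocab_size is updated and tie_weights() is called if the model defines it.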
+ + Return: ``torch.nn.Embeddings`` + Pointer to the input tokens Embeddings Module of the model + """ + base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed + model_embeds = base_model._resize_token_embeddings(new_num_tokens) + if new_num_tokens is None: + return model_embeds + + # Update base model and current model config + self.config.vocab_size = new_num_tokens + base_model.vocab_size = new_num_tokens + + # Tie weights again if needed + if hasattr(self, 'tie_weights'): + self.tie_weights() + + return model_embeds + + def init_weights(self): + """ Initialize and prunes weights if needed. """ + # Initialize weights + self.apply(self._init_weights) + + # Prune heads if needed + if self.config.pruned_heads: + self.prune_heads(self.config.pruned_heads) + + def prune_heads(self, heads_to_prune): + """ Prunes heads of the base model. + + Arguments: + + heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). + E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. + """ + base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed + + # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads + for layer, heads in heads_to_prune.items(): + union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) + self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON + + base_model._prune_heads(heads_to_prune) + + def save_pretrained(self, save_directory): + """ Save a model and its configuration file to a directory, so that it + can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method. + """ + assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + + # Only save the model it-self if we are using distributed training + model_to_save = self.module if hasattr(self, 'module') else self + + # Save configuration file + model_to_save.config.save_pretrained(save_directory) + + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(save_directory, WEIGHTS_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r"""Instantiate a pretrained pytorch model from a pre-trained model configuration. + + The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with ``model.train()`` + + The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. + It is up to you to train those weights with a downstream fine-tuning task. + + The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. + + Parameters: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. 
+ - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. 
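+            # Illustrative use of the ``output_loading_info`` flag documented above:
+            model, loading_info = BertModel.from_pretrained('bert-base-uncased', output_loading_info=True)
+            # loading_info is a dict with 'missing_keys', 'unexpected_keys' and 'error_msgs'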
+ model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') + model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + config = kwargs.pop('config', None) + state_dict = kwargs.pop('state_dict', None) + cache_dir = kwargs.pop('cache_dir', None) + from_tf = kwargs.pop('from_tf', False) + force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) + output_loading_info = kwargs.pop('output_loading_info', False) + + # Load config + if config is None: + config, model_kwargs = cls.config_class.from_pretrained( + pretrained_model_name_or_path, *model_args, + cache_dir=cache_dir, return_unused_kwargs=True, + force_download=force_download, + **kwargs + ) + else: + model_kwargs = kwargs + + # Load model + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + if from_tf: + # Directly load from a TensorFlow checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + else: + if from_tf: + # Directly load from a TensorFlow checkpoint + archive_file = pretrained_model_name_or_path + ".index" + else: + archive_file = pretrained_model_name_or_path + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + except EnvironmentError as e: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(cls.pretrained_model_archive_map.keys()), + archive_file)) + raise e + if resolved_archive_file == archive_file: + logger.info("loading weights file {}".format(archive_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + + # Instantiate model. 
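+        # The remainder of this method, roughly: build the model from the config with
+        # randomly initialized weights; load the checkpoint (torch.load for PyTorch weights,
+        # or delegate to cls.load_tf_weights for a TensorFlow index file); rename legacy
+        # 'gamma'/'beta' keys to 'weight'/'bias'; copy tensors recursively with
+        # _load_from_state_dict, taking base_model_prefix into account; log missing and
+        # unexpected keys; re-tie embeddings if tie_weights() exists; and set eval() mode.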
+ model = cls(config, *model_args, **model_kwargs) + + if state_dict is None and not from_tf: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + if from_tf: + # Directly load from a TensorFlow checkpoint + return cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' + + # Convert old format to new format if needed from a PyTorch state_dict + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + # Load from a PyTorch state_dict + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + # Make sure we are able to load base models as well as derived models (with heads) + start_prefix = '' + model_to_load = model + if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): + start_prefix = cls.base_model_prefix + '.' + if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): + model_to_load = getattr(model, cls.base_model_prefix) + + load(model_to_load, prefix=start_prefix) + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + model.__class__.__name__, "\n\t".join(error_msgs))) + + if hasattr(model, 'tie_weights'): + model.tie_weights() # make sure word embedding weights are still tied + + # Set model in evaluation mode to desactivate DropOut modules by default + model.eval() + + if output_loading_info: + loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs} + return model, loading_info + + return model + + +class Conv1D(nn.Module): + def __init__(self, nf, nx): + """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) + Basically works like a Linear layer but the weights are transposed + """ + super(Conv1D, self).__init__() + self.nf = nf + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.weight = nn.Parameter(w) + self.bias = nn.Parameter(torch.zeros(nf)) + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(*size_out) + return x + + +class PoolerStartLogits(nn.Module): + """ Compute SQuAD start_logits from sequence hidden states. 
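+
+        Illustrative shapes (``config`` only needs ``hidden_size`` here)::
+
+            start_logits = PoolerStartLogits(config)(hidden_states, p_mask=p_mask)
+            # hidden_states: (batch_size, seq_len, hidden_size) -> start_logits: (batch_size, seq_len);
+            # positions with p_mask == 1.0 are pushed to a large negative value.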
""" + def __init__(self, config): + super(PoolerStartLogits, self).__init__() + self.dense = nn.Linear(config.hidden_size, 1) + + def forward(self, hidden_states, p_mask=None): + """ Args: + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` + invalid position mask such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. + """ + x = self.dense(hidden_states).squeeze(-1) + + if p_mask is not None: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +class PoolerEndLogits(nn.Module): + """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. + """ + def __init__(self, config): + super(PoolerEndLogits, self).__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dense_1 = nn.Linear(config.hidden_size, 1) + + def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None): + """ Args: + One of ``start_states``, ``start_positions`` should be not None. + If both are set, ``start_positions`` overrides ``start_states``. + + **start_states**: ``torch.LongTensor`` of shape identical to hidden_states + hidden states of the first tokens for the labeled span. + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span: + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` + Mask of invalid position such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. + """ + assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" + if start_positions is not None: + slen, hsz = hidden_states.shape[-2:] + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) + start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) + + x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) + x = self.activation(x) + x = self.LayerNorm(x) + x = self.dense_1(x).squeeze(-1) + + if p_mask is not None: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +class PoolerAnswerClass(nn.Module): + """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ + def __init__(self, config): + super(PoolerAnswerClass, self).__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) + + def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None): + """ + Args: + One of ``start_states``, ``start_positions`` should be not None. + If both are set, ``start_positions`` overrides ``start_states``. + + **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. + hidden states of the first tokens for the labeled span. + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span. + **cls_index**: torch.LongTensor of shape ``(batch_size,)`` + position of the CLS token. If None, take the last token. 
+ + note(Original repo): + no dependency on end_feature so that we can obtain one single `cls_logits` + for each sample + """ + hsz = hidden_states.shape[-1] + assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" + if start_positions is not None: + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) + + if cls_index is not None: + cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) + else: + cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) + + x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) + x = self.activation(x) + x = self.dense_1(x).squeeze(-1) + + return x + + +class SQuADHead(nn.Module): + r""" A SQuAD head inspired by XLNet. + + Parameters: + config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model. + + Inputs: + **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` + hidden states of sequence tokens + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span. + **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the last token for the labeled span. + **cls_index**: torch.LongTensor of shape ``(batch_size,)`` + position of the CLS token. If None, take the last token. + **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)`` + Whether the question has a possible answer in the paragraph or not. + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` + Mask of invalid position such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. + **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` + Log probabilities for the top config.start_n_top start token possibilities (beam-search). + **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` + Indices for the top config.start_n_top start token possibilities (beam-search). + **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` + Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). + **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` + Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). 
+ **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.FloatTensor`` of shape ``(batch_size,)`` + Log probabilities for the ``is_impossible`` label of the answers. + """ + def __init__(self, config): + super(SQuADHead, self).__init__() + self.start_n_top = config.start_n_top + self.end_n_top = config.end_n_top + + self.start_logits = PoolerStartLogits(config) + self.end_logits = PoolerEndLogits(config) + self.answer_class = PoolerAnswerClass(config) + + def forward(self, hidden_states, start_positions=None, end_positions=None, + cls_index=None, is_impossible=None, p_mask=None): + outputs = () + + start_logits = self.start_logits(hidden_states, p_mask=p_mask) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, let's remove the dimension added by batch splitting + for x in (start_positions, end_positions, cls_index, is_impossible): + if x is not None and x.dim() > 1: + x.squeeze_(-1) + + # during training, compute the end logits based on the ground truth of the start position + end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) + + loss_fct = CrossEntropyLoss() + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if cls_index is not None and is_impossible is not None: + # Predict answerability from the representation of CLS and START + cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) + loss_fct_cls = nn.BCEWithLogitsLoss() + cls_loss = loss_fct_cls(cls_logits, is_impossible) + + # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss + total_loss += cls_loss * 0.5 + + outputs = (total_loss,) + outputs + + else: + # during inference, compute the end logits based on beam search + bsz, slen, hsz = hidden_states.size() + start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + + start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) + p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None + end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) + end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + + end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) + end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) + + start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) + cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) + + outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs + + # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits + # or (if labels are provided) 
(total_loss,) + return outputs + + +class SequenceSummary(nn.Module): + r""" Compute a single vector summary of a sequence hidden states according to various possibilities: + Args of the config class: + summary_type: + - 'last' => [default] take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention + summary_use_proj: Add a projection after the vector extraction + summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. + summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default + summary_first_dropout: Add a dropout before the projection and activation + summary_last_dropout: Add a dropout after the projection and activation + """ + def __init__(self, config): + super(SequenceSummary, self).__init__() + + self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last' + if self.summary_type == 'attn': + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = Identity() + if hasattr(config, 'summary_use_proj') and config.summary_use_proj: + if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + self.activation = Identity() + if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh': + self.activation = nn.Tanh() + + self.first_dropout = Identity() + if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = Identity() + if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + def forward(self, hidden_states, cls_index=None): + """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer. + cls_index: [optional] position of the classification token if summary_type == 'cls_index', + shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. 
+ if summary_type == 'cls_index' and cls_index is None: + we take the last token of the sequence as classification token + """ + if self.summary_type == 'last': + output = hidden_states[:, -1] + elif self.summary_type == 'first': + output = hidden_states[:, 0] + elif self.summary_type == 'mean': + output = hidden_states.mean(dim=1) + elif self.summary_type == 'cls_index': + if cls_index is None: + cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == 'attn': + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +def prune_linear_layer(layer, index, dim=0): + """ Prune a linear layer (a model parameters) to keep only entries in index. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + +def prune_conv1d_layer(layer, index, dim=1): + """ Prune a Conv1D layer (a model parameters) to keep only entries in index. + A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if dim == 0: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + +def prune_layer(layer, index, dim=None): + """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. 
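+
+        A small sketch of the expected behaviour on a Linear layer (sizes illustrative)::
+
+            layer = nn.Linear(16, 4)
+            kept = torch.tensor([0, 2])        # output units to keep
+            pruned = prune_layer(layer, kept)  # dispatches to prune_linear_layer with dim=0
+            # pruned is an nn.Linear(16, 2); its weight rows and bias entries are
+            # detached copies of the kept ones, with requires_grad=True.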
+ """ + if isinstance(layer, nn.Linear): + return prune_linear_layer(layer, index, dim=0 if dim is None else dim) + elif isinstance(layer, Conv1D): + return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) + else: + raise ValueError("Can't prune layer of class {}".format(layer.__class__)) diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py new file mode 100644 index 0000000..80c32cb --- /dev/null +++ b/pytorch_transformers/modeling_xlm.py @@ -0,0 +1,781 @@ +# coding=utf-8 +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch XLM model. +""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging +import math + +import itertools +import numpy as np + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn import CrossEntropyLoss, MSELoss + +from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, SQuADHead +from .configuration_xlm import XLMConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin", +} + + +def create_sinusoidal_embeddings(n_pos, dim, out): + position_enc = np.array([ + [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] + for pos in range(n_pos) + ]) + out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + out.requires_grad = False + + +def gelu(x): + """ + GELU activation + https://arxiv.org/abs/1606.08415 + https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14 + https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py + """ + # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x 
+ 0.044715 * torch.pow(x, 3)))) + return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def get_masks(slen, lengths, causal, padding_mask=None): + """ + Generate hidden states mask, and optionally an attention mask. + """ + bs = lengths.size(0) + if padding_mask is not None: + mask = padding_mask + else: + assert lengths.max().item() <= slen + alen = torch.arange(slen, dtype=torch.long, device=lengths.device) + mask = alen < lengths[:, None] + + # attention mask is the same as mask, or triangular inferior attention (causal) + if causal: + attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None] + else: + attn_mask = mask + + # sanity check + assert mask.size() == (bs, slen) + assert causal is False or attn_mask.size() == (bs, slen, slen) + + return mask, attn_mask + + +class MultiHeadAttention(nn.Module): + + NEW_ID = itertools.count() + + def __init__(self, n_heads, dim, config): + super(MultiHeadAttention, self).__init__() + self.layer_id = next(MultiHeadAttention.NEW_ID) + self.output_attentions = config.output_attentions + self.dim = dim + self.n_heads = n_heads + self.dropout = config.attention_dropout + assert self.dim % self.n_heads == 0 + + self.q_lin = nn.Linear(dim, dim) + self.k_lin = nn.Linear(dim, dim) + self.v_lin = nn.Linear(dim, dim) + self.out_lin = nn.Linear(dim, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + attention_head_size = self.dim // self.n_heads + if len(heads) == 0: + return + mask = torch.ones(self.n_heads, attention_head_size) + heads = set(heads) - self.pruned_heads + for head in heads: + head -= sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear layers + self.q_lin = prune_linear_layer(self.q_lin, index) + self.k_lin = prune_linear_layer(self.k_lin, index) + self.v_lin = prune_linear_layer(self.v_lin, index) + self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.dim = attention_head_size * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, input, mask, kv=None, cache=None, head_mask=None): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). 
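+
+        Illustrative shapes for the self-attention case (toy sizes; ``config`` is assumed
+        to define ``attention_dropout`` and ``output_attentions``)::
+
+            attn = MultiHeadAttention(n_heads=8, dim=512, config=config)
+            outputs = attn(input, mask)   # input: (bs, qlen, 512), mask: (bs, qlen)
+            context = outputs[0]          # (bs, qlen, 512)
+            # with config.output_attentions=True, outputs[1] holds the attention
+            # weights of shape (bs, n_heads, qlen, qlen)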
+ """ + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = input.size() + if kv is None: + klen = qlen if cache is None else cache['slen'] + qlen + else: + klen = kv.size(1) + # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) + n_heads = self.n_heads + dim_per_head = self.dim // n_heads + mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen) + + def shape(x): + """ projection """ + return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) + + def unshape(x): + """ compute context """ + return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + if kv is None: + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + elif cache is None or self.layer_id not in cache: + k = v = kv + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + + if cache is not None: + if self.layer_id in cache: + if kv is None: + k_, v_ = cache[self.layer_id] + k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) + v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) + else: + k, v = cache[self.layer_id] + cache[self.layer_id] = (k, v) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) + mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) + scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + + weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) + + outputs = (self.out_lin(context),) + if self.output_attentions: + outputs = outputs + (weights,) + return outputs + + +class TransformerFFN(nn.Module): + + def __init__(self, in_dim, dim_hidden, out_dim, config): + super(TransformerFFN, self).__init__() + self.dropout = config.dropout + self.lin1 = nn.Linear(in_dim, dim_hidden) + self.lin2 = nn.Linear(dim_hidden, out_dim) + self.act = gelu if config.gelu_activation else F.relu + + def forward(self, input): + x = self.lin1(input) + x = self.act(x) + x = self.lin2(x) + x = F.dropout(x, p=self.dropout, training=self.training) + return x + + +class XLMPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = XLMConfig + pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = None + base_model_prefix = "transformer" + + def __init__(self, *inputs, **kwargs): + super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """ Initialize the weights. 
""" + if isinstance(module, nn.Embedding): + if self.config is not None and self.config.embed_init_std is not None: + nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std) + if isinstance(module, nn.Linear): + if self.config is not None and self.config.init_std is not None: + nn.init.normal_(module.weight, mean=0, std=self.config.init_std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, 0.) + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +XLM_START_DOCSTRING = r""" The XLM model was proposed in + `Cross-lingual Language Model Pretraining`_ + by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives: + + - a causal language modeling (CLM) objective (next token prediction), + - a masked language modeling (MLM) objective (Bert-like), or + - a Translation Language Modeling (TLM) object (extension of Bert's MLM to multiple language inputs) + + Original code can be found `here`_. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Cross-lingual Language Model Pretraining`: + https://arxiv.org/abs/1901.07291 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + .. _`here`: + https://github.com/facebookresearch/XLM + + Parameters: + config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +XLM_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + + XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens to be used to indicate the language of each token in the input. + Indices are languages ids which can be obtained from the language names by using two conversion mappings + provided in the configuration of the model (only provided for multilingual models). 
+ More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and + the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Length of each sentence that can be used to avoid performing attention on padding token indices. + You can also use `attention_mask` for the same result (see above), kept here for compatbility. + Indices selected in ``[0, ..., input_ids.size(-1)]``: + **cache**: + dictionary with ``torch.FloatTensor`` that contains pre-computed + hidden-states (key and values in the attention blocks) as computed by the model + (see `cache` output below). Can be used to speed up sequential decoding. + The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class XLMModel(XLMPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMModel.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output', + 'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads', + 'hidden_dim', 'dropout', 'attention_dropout', 'asm', + 'asm_cutoffs', 'asm_div_value'] + + def __init__(self, config): #, dico, is_encoder, with_output): + super(XLMModel, self).__init__(config) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + # encoder / decoder, output layer + self.is_encoder = config.is_encoder + self.is_decoder = not config.is_encoder + if self.is_decoder: + raise NotImplementedError("Currently XLM can only be used as an encoder") + # self.with_output = with_output + self.causal = config.causal + + # dictionary / languages + self.n_langs = config.n_langs + self.use_lang_emb = config.use_lang_emb + self.n_words = config.n_words + self.eos_index = config.eos_index + self.pad_index = config.pad_index + # self.dico = dico + # self.id2lang = config.id2lang + # self.lang2id = config.lang2id + # assert len(self.dico) == self.n_words + # assert len(self.id2lang) == len(self.lang2id) == self.n_langs + + # model parameters + self.dim = config.emb_dim # 512 by default + self.hidden_dim = self.dim * 4 # 2048 by default + self.n_heads = config.n_heads # 8 by default + self.n_layers = config.n_layers + self.dropout = config.dropout + self.attention_dropout = config.attention_dropout + assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads' + + # embeddings + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) + if config.sinusoidal_embeddings: + create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) + if config.n_langs > 1 and config.use_lang_emb: + self.lang_embeddings = nn.Embedding(self.n_langs, self.dim) + self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index) + self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps) + + # transformer layers + self.attentions = nn.ModuleList() + self.layer_norm1 = nn.ModuleList() + self.ffns = nn.ModuleList() + self.layer_norm2 = nn.ModuleList() + # if self.is_decoder: + # self.layer_norm15 = nn.ModuleList() + # self.encoder_attn = nn.ModuleList() + + for _ in range(self.n_layers): + self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config)) + self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # if self.is_decoder: + # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) + self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config)) + self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} + for layer, heads in pruned_heads: + if self.attentions[int(layer)].n_heads == config.n_heads: + self.prune_heads({int(layer): list(map(int, heads))}) + + self.init_weights() + + def 
_resize_token_embeddings(self, new_num_tokens): + self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens) + return self.embeddings + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.attentions[layer].prune_heads(heads) + + def forward(self, input_ids, lengths=None, position_ids=None, langs=None, + token_type_ids=None, attention_mask=None, cache=None, head_mask=None): # src_enc=None, src_len=None, + if lengths is None: + lengths = (input_ids != self.pad_index).sum(dim=1).long() + # mask = input_ids != self.pad_index + + # check inputs + bs, slen = input_ids.size() + assert lengths.size(0) == bs + assert lengths.max().item() <= slen + # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 + # assert (src_enc is None) == (src_len is None) + # if src_enc is not None: + # assert self.is_decoder + # assert src_enc.size(0) == bs + + # generate masks + mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) + # if self.is_decoder and src_enc is not None: + # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] + + # position_ids + if position_ids is None: + position_ids = input_ids.new((slen,)).long() + position_ids = torch.arange(slen, out=position_ids).unsqueeze(0) + else: + assert position_ids.size() == (bs, slen) # (slen, bs) + # position_ids = position_ids.transpose(0, 1) + + # langs + if langs is not None: + assert langs.size() == (bs, slen) # (slen, bs) + # langs = langs.transpose(0, 1) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.n_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.n_layers + + # do not recompute cached elements + if cache is not None: + _slen = slen - cache['slen'] + input_ids = input_ids[:, -_slen:] + position_ids = position_ids[:, -_slen:] + if langs is not None: + langs = langs[:, -_slen:] + mask = mask[:, -_slen:] + attn_mask = attn_mask[:, -_slen:] + + # embeddings + tensor = self.embeddings(input_ids) + tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor) + if langs is not None and self.use_lang_emb: + tensor = tensor + self.lang_embeddings(langs) + if token_type_ids is not None: + tensor = tensor + self.embeddings(token_type_ids) + tensor = self.layer_norm_emb(tensor) + tensor = F.dropout(tensor, p=self.dropout, training=self.training) + tensor *= mask.unsqueeze(-1).to(tensor.dtype) + + # transformer layers + hidden_states = () + attentions = () + for i in range(self.n_layers): + if self.output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # self attention + attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i]) + attn = 
attn_outputs[0] + if self.output_attentions: + attentions = attentions + (attn_outputs[1],) + attn = F.dropout(attn, p=self.dropout, training=self.training) + tensor = tensor + attn + tensor = self.layer_norm1[i](tensor) + + # encoder attention (for decoder only) + # if self.is_decoder and src_enc is not None: + # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) + # attn = F.dropout(attn, p=self.dropout, training=self.training) + # tensor = tensor + attn + # tensor = self.layer_norm15[i](tensor) + + # FFN + tensor = tensor + self.ffns[i](tensor) + tensor = self.layer_norm2[i](tensor) + tensor *= mask.unsqueeze(-1).to(tensor.dtype) + + # Add last hidden state + if self.output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # update cache length + if cache is not None: + cache['slen'] += tensor.size(1) + + # move back sequence length to dimension 0 + # tensor = tensor.transpose(0, 1) + + outputs = (tensor,) + if self.output_hidden_states: + outputs = outputs + (hidden_states,) + if self.output_attentions: + outputs = outputs + (attentions,) + return outputs # outputs, (hidden_states), (attentions) + + +class XLMPredLayer(nn.Module): + """ + Prediction layer (cross_entropy or adaptive_softmax). + """ + def __init__(self, config): + super(XLMPredLayer, self).__init__() + self.asm = config.asm + self.n_words = config.n_words + self.pad_index = config.pad_index + dim = config.emb_dim + + if config.asm is False: + self.proj = nn.Linear(dim, config.n_words, bias=True) + else: + self.proj = nn.AdaptiveLogSoftmaxWithLoss( + in_features=dim, + n_classes=config.n_words, + cutoffs=config.asm_cutoffs, + div_value=config.asm_div_value, + head_bias=True, # default is False + ) + + def forward(self, x, y=None): + """ Compute the loss, and optionally the scores. + """ + outputs = () + if self.asm is False: + scores = self.proj(x).view(-1, self.n_words) + outputs = (scores,) + outputs + if y is not None: + loss = F.cross_entropy(scores, y, reduction='elementwise_mean') + outputs = (loss,) + outputs + else: + scores = self.proj.log_prob(x) + outputs = (scores,) + outputs + if y is not None: + _, loss = self.proj(x, y) + outputs = (loss,) + outputs + + return outputs + + +@add_start_docstrings("""The XLM Model transformer with a language modeling head on top + (linear layer with weights tied to the input embeddings). """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class XLMWithLMHeadModel(XLMPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(XLMWithLMHeadModel, self).__init__(config) + self.transformer = XLMModel(config) + self.pred_layer = XLMPredLayer(config) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the embeddings + """ + self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings) + + def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, + attention_mask=None, cache=None, labels=None, head_mask=None): + transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, + token_type_ids=token_type_ids, langs=langs, + attention_mask=attention_mask, cache=cache, head_mask=head_mask) + + output = transformer_outputs[0] + outputs = self.pred_layer(output, labels) + outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + + return outputs + + +@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class XLMForSequenceClassification(XLMPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(XLMForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.transformer = XLMModel(config) + self.sequence_summary = SequenceSummary(config) + + self.init_weights() + + def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, + attention_mask=None, cache=None, labels=None, head_mask=None): + transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, + token_type_ids=token_type_ids, langs=langs, + attention_mask=attention_mask, cache=cache, head_mask=head_mask) + + output = transformer_outputs[0] + logits = self.sequence_summary(output) + + outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs + + +@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class XLMForQuestionAnswering(XLMPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels whether a question has an answer or no answer (SQuAD 2.0) + **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the classification token to use as input for computing plausibility of the answer. + **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) 
+ + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config): + super(XLMForQuestionAnswering, self).__init__(config) + + self.transformer = XLMModel(config) + self.qa_outputs = SQuADHead(config) + + self.init_weights() + + def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, + attention_mask=None, cache=None, start_positions=None, end_positions=None, + cls_index=None, is_impossible=None, p_mask=None, head_mask=None): + transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, + token_type_ids=token_type_ids, langs=langs, + attention_mask=attention_mask, cache=cache, head_mask=head_mask) + + output = transformer_outputs[0] + + outputs = self.qa_outputs(output, start_positions=start_positions, end_positions=end_positions, + cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask) + + outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + + return outputs diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py new file mode 100644 index 0000000..81e9f2e --- /dev/null +++ b/pytorch_transformers/modeling_xlnet.py @@ -0,0 +1,1137 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch XLNet model. +""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn import CrossEntropyLoss, MSELoss + +from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits +from .configuration_xlnet import XLNetConfig +from .file_utils import add_start_docstrings + + +logger = logging.getLogger(__name__) + +XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin", + 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin", +} + + +def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): + """ A map of modules from TF to PyTorch. + I use a map to keep the PyTorch model as + identical to the original PyTorch model as possible. + """ + + tf_to_pt_map = {} + + if hasattr(model, 'transformer'): + if hasattr(model, 'lm_loss'): + # We will load also the output bias + tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias + if hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights: + # We will load also the sequence summary + tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight + tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias + if hasattr(model, 'logits_proj') and config.finetuning_task is not None \ + and 'model/regression_{}/logit/kernel'.format(config.finetuning_task) in tf_weights: + tf_to_pt_map['model/regression_{}/logit/kernel'.format(config.finetuning_task)] = model.logits_proj.weight + tf_to_pt_map['model/regression_{}/logit/bias'.format(config.finetuning_task)] = model.logits_proj.bias + + # Now load the rest of the transformer + model = model.transformer + + # Embeddings and output + tf_to_pt_map.update({'model/transformer/word_embedding/lookup_table': model.word_embedding.weight, + 'model/transformer/mask_emb/mask_emb': model.mask_emb}) + + # Transformer blocks + for i, b in enumerate(model.layer): + layer_str = "model/transformer/layer_%d/" % i + tf_to_pt_map.update({ + layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.rel_attn.o, + layer_str + "rel_attn/q/kernel": b.rel_attn.q, + layer_str + "rel_attn/k/kernel": b.rel_attn.k, + layer_str + "rel_attn/r/kernel": b.rel_attn.r, + layer_str + "rel_attn/v/kernel": b.rel_attn.v, + layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight, + layer_str + "ff/layer_1/bias": b.ff.layer_1.bias, + layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight, + layer_str + "ff/layer_2/bias": b.ff.layer_2.bias, + }) + + # Relative positioning biases + if config.untie_r: + r_r_list = [] + r_w_list = [] + r_s_list = [] + seg_embed_list = [] + for b in model.layer: + r_r_list.append(b.rel_attn.r_r_bias) + r_w_list.append(b.rel_attn.r_w_bias) + r_s_list.append(b.rel_attn.r_s_bias) + seg_embed_list.append(b.rel_attn.seg_embed) + else: + r_r_list = 
[model.r_r_bias] + r_w_list = [model.r_w_bias] + r_s_list = [model.r_s_bias] + seg_embed_list = [model.seg_embed] + tf_to_pt_map.update({ + 'model/transformer/r_r_bias': r_r_list, + 'model/transformer/r_w_bias': r_w_list, + 'model/transformer/r_s_bias': r_s_list, + 'model/transformer/seg_embed': seg_embed_list}) + return tf_to_pt_map + +def load_tf_weights_in_xlnet(model, config, tf_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + tf_weights = {} + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + tf_weights[name] = array + + # Build TF to PyTorch weights loading map + tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights) + + for name, pointer in tf_to_pt_map.items(): + logger.info("Importing {}".format(name)) + if name not in tf_weights: + logger.info("{} not in tf pre-trained weights, skipping".format(name)) + continue + array = tf_weights[name] + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name): + logger.info("Transposing") + array = np.transpose(array) + if isinstance(pointer, list): + # Here we will split the TF weigths + assert len(pointer) == array.shape[0] + for i, p_i in enumerate(pointer): + arr_i = array[i, ...] + try: + assert p_i.shape == arr_i.shape + except AssertionError as e: + e.args += (p_i.shape, arr_i.shape) + raise + logger.info("Initialize PyTorch weight {} for layer {}".format(name, i)) + p_i.data = torch.from_numpy(arr_i) + else: + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + tf_weights.pop(name, None) + tf_weights.pop(name + '/Adam', None) + tf_weights.pop(name + '/Adam_1', None) + + logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + return model + + +def gelu(x): + """ Implementation of the gelu activation function. 
+ XLNet is using OpenAI GPT's gelu (not exactly the same as BERT) + Also see https://arxiv.org/abs/1606.08415 + """ + cdf = 0.5 * (1.0 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + return x * cdf + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm +except (ImportError, AttributeError) as e: + logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") + from torch.nn import LayerNorm as XLNetLayerNorm + +class XLNetRelativeAttention(nn.Module): + def __init__(self, config): + super(XLNetRelativeAttention, self).__init__() + self.output_attentions = config.output_attentions + + if config.d_model % config.n_head != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.d_model, config.n_head)) + + self.n_head = config.n_head + self.d_head = config.d_head + self.d_model = config.d_model + self.scale = 1 / (config.d_head ** 0.5) + + self.q = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + self.k = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + self.v = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + self.o = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + self.r = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + + self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_s_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.seg_embed = nn.Parameter(torch.FloatTensor(2, self.n_head, self.d_head)) + + self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.dropout) + + def prune_heads(self, heads): + raise NotImplementedError + + @staticmethod + def rel_shift(x, klen=-1): + """perform relative shift to form the relative attention score.""" + x_size = x.shape + + x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3]) + x = x[1:, ...] 
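+ # Transformer-XL "relative shift": viewing the scores with the first two
+ # axes swapped and dropping the first row re-aligns the relative-position
+ # scores so that, after reshaping back and truncating to `klen` below, each
+ # query row is matched with the correct key offsets.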
+ x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3]) + # x = x[:, 0:klen, :, :] + x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long)) + + return x + + def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn_mask=None, head_mask=None): + """Core relative positional attention operations.""" + + # content based attention score + ac = torch.einsum('ibnd,jbnd->ijbn', [q_head + self.r_w_bias, k_head_h]) + + # position based attention score + bd = torch.einsum('ibnd,jbnd->ijbn', [q_head + self.r_r_bias, k_head_r]) + bd = self.rel_shift(bd, klen=ac.shape[1]) + + # segment based attention score + if seg_mat is None: + ef = 0 + else: + ef = torch.einsum('ibnd,snd->ibns', [q_head + self.r_s_bias, self.seg_embed]) + ef = torch.einsum('ijbs,ibns->ijbn', [seg_mat, ef]) + + # merge attention scores and perform masking + attn_score = (ac + bd + ef) * self.scale + if attn_mask is not None: + # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask + if attn_mask.dtype == torch.float16: + attn_score = attn_score - 65500 * attn_mask + else: + attn_score = attn_score - 1e30 * attn_mask + + # attention probability + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropout(attn_prob) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * head_mask + + # attention output + attn_vec = torch.einsum('ijbn,jbnd->ibnd', [attn_prob, v_head_h]) + + if self.output_attentions: + return attn_vec, attn_prob + + return attn_vec + + def post_attention(self, h, attn_vec, residual=True): + """Post-attention processing.""" + # post-attention projection (back to `d_model`) + attn_out = torch.einsum('ibnd,hnd->ibh', [attn_vec, self.o]) + + attn_out = self.dropout(attn_out) + if residual: + attn_out = attn_out + h + output = self.layer_norm(attn_out) + + return output + + def forward(self, h, g, + attn_mask_h, attn_mask_g, + r, seg_mat, + mems=None, target_mapping=None, head_mask=None): + if g is not None: + ###### Two-stream attention with relative positional encoding. 
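+ # Two-stream attention: the content stream (`h`) lets every position attend
+ # to its own token, while the query stream (`g`, fed from the mask embedding
+ # in XLNetModel) is masked so it cannot see the token it has to predict.
+ # Both streams share the key/value projections computed below over mems + h.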
+ # content based attention score + if mems is not None and mems.dim() > 1: + cat = torch.cat([mems, h], dim=0) + else: + cat = h + + # content-based key head + k_head_h = torch.einsum('ibh,hnd->ibnd', [cat, self.k]) + + # content-based value head + v_head_h = torch.einsum('ibh,hnd->ibnd', [cat, self.v]) + + # position-based key head + k_head_r = torch.einsum('ibh,hnd->ibnd', [r, self.r]) + + ##### h-stream + # content-stream query head + q_head_h = torch.einsum('ibh,hnd->ibnd', [h, self.q]) + + # core attention ops + attn_vec_h = self.rel_attn_core( + q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask) + + if self.output_attentions: + attn_vec_h, attn_prob_h = attn_vec_h + + # post processing + output_h = self.post_attention(h, attn_vec_h) + + ##### g-stream + # query-stream query head + q_head_g = torch.einsum('ibh,hnd->ibnd', [g, self.q]) + + # core attention ops + if target_mapping is not None: + q_head_g = torch.einsum('mbnd,mlb->lbnd', [q_head_g, target_mapping]) + attn_vec_g = self.rel_attn_core( + q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask) + + if self.output_attentions: + attn_vec_g, attn_prob_g = attn_vec_g + + attn_vec_g = torch.einsum('lbnd,mlb->mbnd', [attn_vec_g, target_mapping]) + else: + attn_vec_g = self.rel_attn_core( + q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask) + + if self.output_attentions: + attn_vec_g, attn_prob_g = attn_vec_g + + # post processing + output_g = self.post_attention(g, attn_vec_g) + + if self.output_attentions: + attn_prob = attn_prob_h, attn_prob_g + + else: + ###### Multi-head attention with relative positional encoding + if mems is not None and mems.dim() > 1: + cat = torch.cat([mems, h], dim=0) + else: + cat = h + + # content heads + q_head_h = torch.einsum('ibh,hnd->ibnd', [h, self.q]) + k_head_h = torch.einsum('ibh,hnd->ibnd', [cat, self.k]) + v_head_h = torch.einsum('ibh,hnd->ibnd', [cat, self.v]) + + # positional heads + k_head_r = torch.einsum('ibh,hnd->ibnd', [r, self.r]) + + # core attention ops + attn_vec = self.rel_attn_core( + q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask) + + if self.output_attentions: + attn_vec, attn_prob = attn_vec + + # post processing + output_h = self.post_attention(h, attn_vec) + output_g = None + + outputs = (output_h, output_g) + if self.output_attentions: + outputs = outputs + (attn_prob,) + return outputs + +class XLNetFeedForward(nn.Module): + def __init__(self, config): + super(XLNetFeedForward, self).__init__() + self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps) + self.layer_1 = nn.Linear(config.d_model, config.d_inner) + self.layer_2 = nn.Linear(config.d_inner, config.d_model) + self.dropout = nn.Dropout(config.dropout) + if isinstance(config.ff_activation, str) or \ + (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)): + self.activation_function = ACT2FN[config.ff_activation] + else: + self.activation_function = config.ff_activation + + def forward(self, inp): + output = inp + output = self.layer_1(output) + output = self.activation_function(output) + output = self.dropout(output) + output = self.layer_2(output) + output = self.dropout(output) + output = self.layer_norm(output + inp) + return output + +class XLNetLayer(nn.Module): + def __init__(self, config): + super(XLNetLayer, self).__init__() + self.rel_attn = XLNetRelativeAttention(config) 
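+ # One XLNet block = relative-attention sub-layer + position-wise feed-forward
+ # sub-layer; dropout, the residual connection and LayerNorm are applied
+ # inside these sub-modules.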
+ self.ff = XLNetFeedForward(config)
+ self.dropout = nn.Dropout(config.dropout)
+
+ def forward(self, output_h, output_g,
+ attn_mask_h, attn_mask_g,
+ r, seg_mat, mems=None, target_mapping=None, head_mask=None):
+ outputs = self.rel_attn(output_h, output_g, attn_mask_h, attn_mask_g,
+ r, seg_mat, mems=mems, target_mapping=target_mapping,
+ head_mask=head_mask)
+ output_h, output_g = outputs[:2]
+
+ if output_g is not None:
+ output_g = self.ff(output_g)
+ output_h = self.ff(output_h)
+
+ outputs = (output_h, output_g) + outputs[2:] # Add attentions again if they are there
+ return outputs
+
+
+class XLNetPreTrainedModel(PreTrainedModel):
+ """ An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+ config_class = XLNetConfig
+ pretrained_model_archive_map = XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+ load_tf_weights = load_tf_weights_in_xlnet
+ base_model_prefix = "transformer"
+
+ def _init_weights(self, module):
+ """ Initialize the weights.
+ """
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, XLNetLayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, XLNetRelativeAttention):
+ for param in [module.q, module.k, module.v, module.o, module.r,
+ module.r_r_bias, module.r_s_bias, module.r_w_bias,
+ module.seg_embed]:
+ param.data.normal_(mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, XLNetModel):
+ module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+XLNET_START_DOCSTRING = r""" The XLNet model was proposed in
+ `XLNet: Generalized Autoregressive Pretraining for Language Understanding`_
+ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+ XLNet is an extension of the Transformer-XL model pre-trained using an autoregressive method
+ to learn bidirectional contexts by maximizing the expected likelihood over all permutations
+ of the input sequence factorization order.
+
+ The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
+
+ Due to the difficulty of training a fully auto-regressive model over various factorization orders,
+ XLNet is pretrained using only a sub-set of the output tokens as targets, which are selected
+ with the `target_mapping` input.
+
+ To use XLNet for sequential decoding (i.e. not in a fully bi-directional setting), use the `perm_mask` and
+ `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
+
+ This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+ refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+ .. _`XLNet: Generalized Autoregressive Pretraining for Language Understanding`:
+ http://arxiv.org/abs/1906.08237
+
+ .. _`torch.nn.Module`:
+ https://pytorch.org/docs/stable/nn.html#module
+
+ Parameters:
+ config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
+ Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+XLNET_INPUTS_DOCSTRING = r"""
+ Inputs:
+ **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ Indices of input sequence tokens in the vocabulary.
+ XLNet is a model with relative position embeddings so you can either pad the inputs on
+ the right or on the left.
+ Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
+ See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+ :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+ **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+ The embeddings from these tokens will be summed with the respective token embeddings.
+ Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+ **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+ Mask to avoid performing attention on padding token indices.
+ Mask values selected in ``[0, 1]``:
+ ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+ **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+ Mask to avoid performing attention on padding token indices.
+ Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
+ Kept for compatibility with the original code base.
+ You can only use one of `input_mask` and `attention_mask`.
+ Mask values selected in ``[0, 1]``:
+ ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
+ **mems**: (`optional`)
+ list of ``torch.FloatTensor`` (one for each layer):
+ that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
+ (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
+ To activate mems you need to set config.mem_len to a positive value which will be the max number of tokens in
+ the memory output by the model. E.g. `model = XLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)` will
+ instantiate a model which can use up to 1024 tokens of memory (in addition to the input itself).
+ **perm_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, sequence_length)``:
+ Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
+ If ``perm_mask[k, i, j] = 0``, i attends to j in batch k;
+ if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
+ If None, each token attends to all the others (full bidirectional attention).
+ Only used during pretraining (to define factorization order) or for sequential decoding (generation).
+ **target_mapping**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_predict, sequence_length)``:
+ Mask to indicate the output tokens to use.
+ If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
+ Only used during pretraining for partial prediction or for sequential decoding (generation).
+ **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+ Mask to nullify selected heads of the self-attention modules.
+ Mask values selected in ``[0, 1]``:
+ ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+""" + +@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.", + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class XLNetModel(XLNetPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') + model = XLNetModel.from_pretrained('xlnet-large-cased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(XLNetModel, self).__init__(config) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + self.mem_len = config.mem_len + self.reuse_len = config.reuse_len + self.d_model = config.d_model + self.same_length = config.same_length + self.attn_type = config.attn_type + self.bi_data = config.bi_data + self.clamp_len = config.clamp_len + self.n_layer = config.n_layer + + self.word_embedding = nn.Embedding(config.n_token, config.d_model) + self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model)) + self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)]) + self.dropout = nn.Dropout(config.dropout) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens) + return self.word_embedding + + def _prune_heads(self, heads_to_prune): + raise NotImplementedError + + def create_mask(self, qlen, mlen): + """ + Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked. 
+ + Args: + qlen: TODO Lysandre didn't fill + mlen: TODO Lysandre didn't fill + + :: + + same_length=False: same_length=True: + < qlen > < qlen > + ^ [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 1 1 1 1] + [0 0 0 0 0 0 1 1 1] [1 0 0 0 0 0 1 1 1] + qlen [0 0 0 0 0 0 0 1 1] [1 1 0 0 0 0 0 1 1] + [0 0 0 0 0 0 0 0 1] [1 1 1 0 0 0 0 0 1] + v [0 0 0 0 0 0 0 0 0] [1 1 1 1 0 0 0 0 0] + + """ + attn_mask = torch.ones([qlen, qlen]) + mask_up = torch.triu(attn_mask, diagonal=1) + attn_mask_pad = torch.zeros([qlen, mlen]) + ret = torch.cat([attn_mask_pad, mask_up], dim=1) + if self.same_length: + mask_lo = torch.tril(attn_mask, diagonal=-1) + ret = torch.cat([ret[:, :qlen] + mask_lo, ret[:, qlen:]], dim=1) + + ret = ret.to(next(self.parameters())) + return ret + + def cache_mem(self, curr_out, prev_mem): + """cache hidden states into memory.""" + if self.mem_len is None or self.mem_len == 0: + return None + else: + if self.reuse_len is not None and self.reuse_len > 0: + curr_out = curr_out[:self.reuse_len] + + if prev_mem is None: + new_mem = curr_out[-self.mem_len:] + else: + new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len:] + + return new_mem.detach() + + @staticmethod + def positional_embedding(pos_seq, inv_freq, bsz=None): + sinusoid_inp = torch.einsum('i,d->id', [pos_seq, inv_freq]) + pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1) + pos_emb = pos_emb[:, None, :] + + if bsz is not None: + pos_emb = pos_emb.expand(-1, bsz, -1) + + return pos_emb + + def relative_positional_encoding(self, qlen, klen, bsz=None): + """create relative positional encoding.""" + freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float) + inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model)) + + if self.attn_type == 'bi': + # beg, end = klen - 1, -qlen + beg, end = klen, -qlen + elif self.attn_type == 'uni': + # beg, end = klen - 1, -1 + beg, end = klen, -1 + else: + raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type)) + + if self.bi_data: + fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float) + bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.float) + + if self.clamp_len > 0: + fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) + bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) + + if bsz is not None: + fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2) + bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2) + else: + fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) + bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) + + pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1) + else: + fwd_pos_seq = torch.arange(beg, end, -1.0) + if self.clamp_len > 0: + fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) + pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz) + + pos_emb = pos_emb.to(next(self.parameters())) + return pos_emb + + def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, + mems=None, perm_mask=None, target_mapping=None, head_mask=None): + # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end + # but we want a unified interface in the library with the batch size on the first dimension + # so we move here the first dimension (batch) to the end + input_ids = input_ids.transpose(0, 1).contiguous() + token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None + input_mask = input_mask.transpose(0, 1).contiguous() if 
input_mask is not None else None + attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None + perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None + target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None + + qlen, bsz = input_ids.shape[0], input_ids.shape[1] + mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0 + klen = mlen + qlen + + dtype_float = next(self.parameters()).dtype + device = next(self.parameters()).device + + ##### Attention mask + # causal attention mask + if self.attn_type == 'uni': + attn_mask = self.create_mask(qlen, mlen) + attn_mask = attn_mask[:, :, None, None] + elif self.attn_type == 'bi': + attn_mask = None + else: + raise ValueError('Unsupported attention type: {}'.format(self.attn_type)) + + # data mask: input mask & perm mask + assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " + "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one." + if input_mask is None and attention_mask is not None: + input_mask = 1.0 - attention_mask + if input_mask is not None and perm_mask is not None: + data_mask = input_mask[None] + perm_mask + elif input_mask is not None and perm_mask is None: + data_mask = input_mask[None] + elif input_mask is None and perm_mask is not None: + data_mask = perm_mask + else: + data_mask = None + + if data_mask is not None: + # all mems can be attended to + mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask) + data_mask = torch.cat([mems_mask, data_mask], dim=1) + if attn_mask is None: + attn_mask = data_mask[:, :, :, None] + else: + attn_mask += data_mask[:, :, :, None] + + if attn_mask is not None: + attn_mask = (attn_mask > 0).to(dtype_float) + + if attn_mask is not None: + non_tgt_mask = -torch.eye(qlen).to(attn_mask) + non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1) + non_tgt_mask = ((attn_mask + non_tgt_mask[:, :, None, None]) > 0).to(attn_mask) + else: + non_tgt_mask = None + + ##### Word embeddings and prepare h & g hidden states + word_emb_k = self.word_embedding(input_ids) + output_h = self.dropout(word_emb_k) + if target_mapping is not None: + word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1) + # else: # We removed the inp_q input which was same as target mapping + # inp_q_ext = inp_q[:, :, None] + # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k + output_g = self.dropout(word_emb_q) + else: + output_g = None + + ##### Segment embedding + if token_type_ids is not None: + # Convert `token_type_ids` to one-hot `seg_mat` + mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device) + cat_ids = torch.cat([mem_pad, token_type_ids], dim=0) + + # `1` indicates not in the same segment [qlen x klen x bsz] + seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long() + seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float) + else: + seg_mat = None + + ##### Positional encoding + pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz) + pos_emb = self.dropout(pos_emb) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) + # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x 
n_head] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) + head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.n_layer + + new_mems = () + if mems is None: + mems = [None] * len(self.layer) + + attentions = [] + hidden_states = [] + for i, layer_module in enumerate(self.layer): + # cache new mems + new_mems = new_mems + (self.cache_mem(output_h, mems[i]),) + if self.output_hidden_states: + hidden_states.append((output_h, output_g) if output_g is not None else output_h) + + outputs = layer_module(output_h, output_g, attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask, + r=pos_emb, seg_mat=seg_mat, mems=mems[i], target_mapping=target_mapping, + head_mask=head_mask[i]) + output_h, output_g = outputs[:2] + if self.output_attentions: + attentions.append(outputs[2]) + + # Add last hidden state + if self.output_hidden_states: + hidden_states.append((output_h, output_g) if output_g is not None else output_h) + + output = self.dropout(output_g if output_g is not None else output_h) + + # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) + outputs = (output.permute(1, 0, 2).contiguous(), new_mems) + if self.output_hidden_states: + if output_g is not None: + hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs) + else: + hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states) + outputs = outputs + (hidden_states,) + if self.output_attentions: + attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions) + outputs = outputs + (attentions,) + + return outputs # outputs, new_mems, (hidden_states), (attentions) + + +@add_start_docstrings("""XLNet Model with a language modeling head on top + (linear layer with weights tied to the input embeddings). """, + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class XLNetLMHeadModel(XLNetPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') + model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased') + # We show how to setup inputs to predict a next token using a bi-directional context. + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ")).unsqueeze(0) # We will predict the masked token + perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float) + perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token + target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token + target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) + outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping) + next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] + + """ + def __init__(self, config): + super(XLNetLMHeadModel, self).__init__(config) + self.attn_type = config.attn_type + self.same_length = config.same_length + + self.transformer = XLNetModel(config) + self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the embeddings + """ + self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding) + + def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, + mems=None, perm_mask=None, target_mapping=None, + labels=None, head_mask=None): + transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids, + input_mask=input_mask, attention_mask=attention_mask, + mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, + head_mask=head_mask) + + logits = self.lm_loss(transformer_outputs[0]) + + outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + + if labels is not None: + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(logits.view(-1, logits.size(-1)), + labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # return (loss), logits, mems, (hidden states), (attentions) + + +@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class XLNetForSequenceClassification(XLNetPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. 
+ If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') + model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(XLNetForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.transformer = XLNetModel(config) + self.sequence_summary = SequenceSummary(config) + self.logits_proj = nn.Linear(config.d_model, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, + mems=None, perm_mask=None, target_mapping=None, + labels=None, head_mask=None): + transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids, + input_mask=input_mask, attention_mask=attention_mask, + mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, + head_mask=head_mask) + output = transformer_outputs[0] + + output = self.sequence_summary(output) + logits = self.logits_proj(output) + + outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # return (loss), logits, mems, (hidden states), (attentions) + + +@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). 
""", + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class XLNetForQuestionAnswering(XLNetPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels whether a question has an answer or no answer (SQuAD 2.0) + **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the classification token to use as input for computing plausibility of the answer. + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). + 1.0 means token should be masked. 0.0 mean token is not masked. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. + **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` + Log probabilities for the top config.start_n_top start token possibilities (beam-search). + **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` + Indices for the top config.start_n_top start token possibilities (beam-search). + **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` + Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). + **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` + Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). + **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.FloatTensor`` of shape ``(batch_size,)`` + Log probabilities for the ``is_impossible`` label of the answers. + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. 
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetForQuestionAnswering.from_pretrained('xlnet-large-cased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss = outputs[0]  # when both start and end positions are provided, the first output is the total loss
+
+    """
+    def __init__(self, config):
+        super(XLNetForQuestionAnswering, self).__init__(config)
+        self.start_n_top = config.start_n_top
+        self.end_n_top = config.end_n_top
+
+        self.transformer = XLNetModel(config)
+        self.start_logits = PoolerStartLogits(config)
+        self.end_logits = PoolerEndLogits(config)
+        self.answer_class = PoolerAnswerClass(config)
+
+        self.init_weights()
+
+    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None,
+                start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
+                head_mask=None):
+        transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
+                                               input_mask=input_mask, attention_mask=attention_mask,
+                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
+                                               head_mask=head_mask)
+        hidden_states = transformer_outputs[0]
+        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
+
+        outputs = transformer_outputs[1:]  # Keep mems, hidden states and attentions if they are present
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, let's remove the dimension added by batch splitting
+            for x in (start_positions, end_positions, cls_index, is_impossible):
+                if x is not None and x.dim() > 1:
+                    x.squeeze_(-1)
+
+            # during training, compute the end logits based on the ground truth of the start position
+            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
+
+            loss_fct = CrossEntropyLoss()
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+            if cls_index is not None and is_impossible is not None:
+                # Predict answerability from the representation of CLS and START
+                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
+                loss_fct_cls = nn.BCEWithLogitsLoss()
+                cls_loss = loss_fct_cls(cls_logits, is_impossible)
+
+                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
+                total_loss += cls_loss * 0.5
+
+            outputs = (total_loss,) + outputs
+
+        else:
+            # during inference, compute the end logits based on beam search
+            bsz, slen, hsz = hidden_states.size()
+            start_log_probs = F.softmax(start_logits, dim=-1)  # shape (bsz, slen)
+
+            start_top_log_probs, start_top_index = 
torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) + p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None + end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) + end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + + end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) + end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) + + start_states = torch.einsum("blh,bl->bh", [hidden_states, start_log_probs]) # get the representation of START as weighted sum of hidden states + cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) # Shape (batch size,): one single `cls_logits` for each sample + + outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs + + # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits + # or (if labels are provided) (total_loss,) + return outputs diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py new file mode 100644 index 0000000..39dc7a5 --- /dev/null +++ b/pytorch_transformers/optimization.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import logging +import math + +import torch +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LambdaLR + +logger = logging.getLogger(__name__) + +class ConstantLRSchedule(LambdaLR): + """ Constant learning rate schedule. + """ + def __init__(self, optimizer, last_epoch=-1): + super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch) + + +class WarmupConstantSchedule(LambdaLR): + """ Linear warmup and then constant. + Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps. + Keeps learning rate schedule equal to 1. after warmup_steps. + """ + def __init__(self, optimizer, warmup_steps, last_epoch=-1): + self.warmup_steps = warmup_steps + super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) + + def lr_lambda(self, step): + if step < self.warmup_steps: + return float(step) / float(max(1.0, self.warmup_steps)) + return 1. 
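The schedule classes in this file all subclass torch.optim.lr_scheduler.LambdaLR, so each lr_lambda returns a multiplicative factor in [0, 1] that scales the optimizer's base learning rate every time scheduler.step() is called. A minimal usage sketch (not part of this patch) combining AdamW, defined further down in this file, with WarmupLinearSchedule, defined next; the model, the data, and the hyperparameter values are placeholders chosen only for illustration.

import torch
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

# Placeholder model and data; any nn.Module and loss would work the same way.
model = torch.nn.Linear(10, 2)
inputs = torch.randn(32, 10)
targets = torch.randint(0, 2, (32,))
loss_fn = torch.nn.CrossEntropyLoss()

num_training_steps = 1000
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
# Warm up linearly over the first 100 steps, then decay linearly to 0.
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=num_training_steps)

for step in range(num_training_steps):
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    optimizer.step()
    scheduler.step()  # advance the schedule once per optimization step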
+ + +class WarmupLinearSchedule(LambdaLR): + """ Linear warmup and then linear decay. + Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. + Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps. + """ + def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): + self.warmup_steps = warmup_steps + self.t_total = t_total + super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) + + def lr_lambda(self, step): + if step < self.warmup_steps: + return float(step) / float(max(1, self.warmup_steps)) + return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps))) + + +class WarmupCosineSchedule(LambdaLR): + """ Linear warmup and then cosine decay. + Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. + Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. + """ + def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1): + self.warmup_steps = warmup_steps + self.t_total = t_total + self.cycles = cycles + super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) + + def lr_lambda(self, step): + if step < self.warmup_steps: + return float(step) / float(max(1.0, self.warmup_steps)) + # progress after warmup + progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) + return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + + +class WarmupCosineWithHardRestartsSchedule(LambdaLR): + """ Linear warmup and then cosine cycles with hard restarts. + Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). + """ + def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1): + self.warmup_steps = warmup_steps + self.t_total = t_total + self.cycles = cycles + super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) + + def lr_lambda(self, step): + if step < self.warmup_steps: + return float(step) / float(max(1, self.warmup_steps)) + # progress after warmup + progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) + if progress >= 1.0: + return 0.0 + return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0)))) + + + +class AdamW(Optimizer): + """ Implements Adam algorithm with weight decay fix. + + Parameters: + lr (float): learning rate. Default 1e-3. + betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) + eps (float): Adams epsilon. Default: 1e-6 + weight_decay (float): Weight decay. Default: 0.0 + correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. 
+ """ + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): + if lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + correct_bias=correct_bias) + super(AdamW, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + exp_avg.mul_(beta1).add_(1.0 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) + denom = exp_avg_sq.sqrt().add_(group['eps']) + + step_size = group['lr'] + if group['correct_bias']: # No bias correction for Bert + bias_correction1 = 1.0 - beta1 ** state['step'] + bias_correction2 = 1.0 - beta2 ** state['step'] + step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 + + p.data.addcdiv_(-step_size, exp_avg, denom) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + # Add weight decay at the end (fixed version) + if group['weight_decay'] > 0.0: + p.data.add_(-group['lr'] * group['weight_decay'], p.data) + + return loss diff --git a/pytorch_transformers/tests/__init__.py b/pytorch_transformers/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pytorch_transformers/tests/configuration_common_test.py b/pytorch_transformers/tests/configuration_common_test.py new file mode 100644 index 0000000..8ee7511 --- /dev/null +++ b/pytorch_transformers/tests/configuration_common_test.py @@ -0,0 +1,63 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import os +import shutil +import json +import random +import uuid + +import unittest +import logging + + +class ConfigTester(object): + def __init__(self, parent, config_class=None, **kwargs): + self.parent = parent + self.config_class = config_class + self.inputs_dict = kwargs + + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, 'vocab_size')) + self.parent.assertTrue(hasattr(config, 'hidden_size')) + self.parent.assertTrue(hasattr(config, 'num_attention_heads')) + self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) + + def create_and_test_config_to_json_string(self): + config = self.config_class(**self.inputs_dict) + obj = json.loads(config.to_json_string()) + for key, value in self.inputs_dict.items(): + self.parent.assertEqual(obj[key], value) + + def create_and_test_config_to_json_file(self): + config_first = self.config_class(**self.inputs_dict) + json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") + config_first.to_json_file(json_file_path) + config_second = self.config_class.from_json_file(json_file_path) + os.remove(json_file_path) + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + + def run_common_tests(self): + self.create_and_test_config_common_properties() + self.create_and_test_config_to_json_string() + self.create_and_test_config_to_json_file() + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/pytorch_transformers/tests/conftest.py b/pytorch_transformers/tests/conftest.py new file mode 100644 index 0000000..841ebc8 --- /dev/null +++ b/pytorch_transformers/tests/conftest.py @@ -0,0 +1,19 @@ +# content of conftest.py + +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--runslow", action="store_true", default=False, help="run slow tests" + ) + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--runslow"): + # --runslow given in cli: do not skip slow tests + return + skip_slow = pytest.mark.skip(reason="need --runslow option to run") + for item in items: + if "slow" in item.keywords: + item.add_marker(skip_slow) diff --git a/pytorch_transformers/tests/fixtures/input.txt b/pytorch_transformers/tests/fixtures/input.txt new file mode 100644 index 0000000..d1e3f41 --- /dev/null +++ b/pytorch_transformers/tests/fixtures/input.txt @@ -0,0 +1 @@ +Who was Jim Henson ? ||| Jim Henson was a puppeteer diff --git a/pytorch_transformers/tests/fixtures/sample_text.txt b/pytorch_transformers/tests/fixtures/sample_text.txt new file mode 100644 index 0000000..a428120 --- /dev/null +++ b/pytorch_transformers/tests/fixtures/sample_text.txt @@ -0,0 +1,33 @@ +This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত +Text should be one-sentence-per-line, with empty lines between documents. +This sample text is public domain and was randomly selected from Project Guttenberg. 
+ +The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. +Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. +Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. +"Cass" Beard had risen early that morning, but not with a view to discovery. +A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. +The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. +This was nearly opposite. +Mr. Cassius crossed the highway, and stopped suddenly. +Something glittered in the nearest red pool before him. +Gold, surely! +But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. +Looking at it more attentively, he saw that it bore the inscription, "May to Cass." +Like most of his fellow gold-seekers, Cass was superstitious. + +The fountain of classic wisdom, Hypatia herself. +As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. +From my youth I felt in me a soul above the matter-entangled herd. +She revealed to me the glorious fact, that I am a spark of Divinity itself. +A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. +There is a philosophic pleasure in opening one's treasures to the modest young. +Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. +Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; +but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. +Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. 
+His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; +while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. +At last they reached the quay at the opposite end of the street; +and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. +He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. diff --git a/pytorch_transformers/tests/fixtures/test_sentencepiece.model b/pytorch_transformers/tests/fixtures/test_sentencepiece.model new file mode 100644 index 0000000..376dda7 Binary files /dev/null and b/pytorch_transformers/tests/fixtures/test_sentencepiece.model differ diff --git a/pytorch_transformers/tests/modeling_auto_test.py b/pytorch_transformers/tests/modeling_auto_test.py new file mode 100644 index 0000000..dfdedbb --- /dev/null +++ b/pytorch_transformers/tests/modeling_auto_test.py @@ -0,0 +1,88 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
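The next test file exercises the Auto* factory classes: given a pretrained checkpoint name, AutoConfig and AutoModel resolve it to the matching concrete classes (a BERT checkpoint resolves to BertConfig and BertModel), and output_loading_info=True additionally returns a dict describing any weight mismatches. A short sketch of the behaviour these tests rely on; 'bert-base-uncased' is assumed here only as a well-known example checkpoint, and from_pretrained downloads its weights on first use.

import torch
from pytorch_transformers import AutoConfig, AutoModel

# Any key of BERT_PRETRAINED_MODEL_ARCHIVE_MAP would resolve to the BERT classes.
config = AutoConfig.from_pretrained('bert-base-uncased')
model, loading_info = AutoModel.from_pretrained('bert-base-uncased',
                                                output_loading_info=True)

print(type(config).__name__)                       # -> BertConfig
print(type(model).__name__)                        # -> BertModel
print({k: len(v) for k, v in loading_info.items()})  # empty lists if everything matched

# The returned model is a regular nn.Module; real input ids would come from a tokenizer.
input_ids = torch.tensor([[101, 7592, 102]])       # example token ids, batch size 1
sequence_output, pooled_output = model(input_ids)
print(sequence_output.shape, pooled_output.shape)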
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest +import logging + +from pytorch_transformers import (AutoConfig, BertConfig, + AutoModel, BertModel, + AutoModelWithLMHead, BertForMaskedLM, + AutoModelForSequenceClassification, BertForSequenceClassification, + AutoModelForQuestionAnswering, BertForQuestionAnswering) +from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + + +class AutoModelTest(unittest.TestCase): + def test_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModel.from_pretrained(model_name) + model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertModel) + for value in loading_info.values(): + self.assertEqual(len(value), 0) + + def test_lmhead_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelWithLMHead.from_pretrained(model_name) + model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForMaskedLM) + + def test_sequence_classification_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelForSequenceClassification.from_pretrained(model_name) + model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForSequenceClassification) + + def test_question_answering_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelForQuestionAnswering.from_pretrained(model_name) + model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForQuestionAnswering) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py new file mode 100644 index 0000000..39d1699 --- /dev/null +++ b/pytorch_transformers/tests/modeling_bert_test.py @@ -0,0 +1,313 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest + +from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM, + BertForNextSentencePrediction, BertForPreTraining, + BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification, BertForMultipleChoice) +from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + + +class BertModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction, + BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification) + + class BertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = BertConfig( + vocab_size_or_config_json_file=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + 
num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = BertModel(config=config) + model.eval() + sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask) + sequence_output, pooled_output = model(input_ids, token_type_ids) + sequence_output, pooled_output = model(input_ids) + + result = { + "sequence_output": sequence_output, + "pooled_output": pooled_output, + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) + + + def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = BertForMaskedLM(config=config) + model.eval() + loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels) + result = { + "loss": loss, + "prediction_scores": prediction_scores, + } + self.parent.assertListEqual( + list(result["prediction_scores"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.check_loss_output(result) + + def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = BertForNextSentencePrediction(config=config) + model.eval() + loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels) + result = { + "loss": loss, + "seq_relationship_score": seq_relationship_score, + } + self.parent.assertListEqual( + list(result["seq_relationship_score"].size()), + [self.batch_size, 2]) + self.check_loss_output(result) + + + def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = BertForPreTraining(config=config) + model.eval() + loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels) + result = { + "loss": loss, + "prediction_scores": prediction_scores, + "seq_relationship_score": seq_relationship_score, + } + self.parent.assertListEqual( + list(result["prediction_scores"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(result["seq_relationship_score"].size()), + [self.batch_size, 2]) + self.check_loss_output(result) + + + def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = BertForQuestionAnswering(config=config) + model.eval() + loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels) + result = { + "loss": loss, + "start_logits": start_logits, + "end_logits": end_logits, + } + 
self.parent.assertListEqual( + list(result["start_logits"].size()), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].size()), + [self.batch_size, self.seq_length]) + self.check_loss_output(result) + + + def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + config.num_labels = self.num_labels + model = BertForSequenceClassification(config) + model.eval() + loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels) + result = { + "loss": loss, + "logits": logits, + } + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.num_labels]) + self.check_loss_output(result) + + + def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + config.num_labels = self.num_labels + model = BertForTokenClassification(config=config) + model.eval() + loss, logits = model(input_ids, token_type_ids, input_mask, token_labels) + result = { + "loss": loss, + "logits": logits, + } + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.seq_length, self.num_labels]) + self.check_loss_output(result) + + + def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + config.num_choices = self.num_choices + model = BertForMultipleChoice(config=config) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + loss, logits = model(multiple_choice_inputs_ids, + multiple_choice_token_type_ids, + multiple_choice_input_mask, + choice_labels) + result = { + "loss": loss, + "logits": logits, + } + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.num_choices]) + self.check_loss_output(result) + + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = BertModelTest.BertModelTester(self) + self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_bert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs) + + def 
test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = BertModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py new file mode 100644 index 0000000..c50d667 --- /dev/null +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -0,0 +1,709 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
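The shared test machinery that follows relies on two configuration flags, output_attentions and output_hidden_states, which make every model append its per-layer attention maps and hidden states to the returned tuple; the common tests then check the lengths and shapes of those extras. A small self-contained sketch of what those shape checks assert, using a randomly initialized tiny BertModel; the sizes mirror the BertModelTester defaults above and are otherwise arbitrary example values.

import torch
from pytorch_transformers import BertConfig, BertModel

# Tiny randomly initialized model; sizes are arbitrary example values.
config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=32,
                    num_hidden_layers=5, num_attention_heads=4,
                    intermediate_size=37)
config.output_attentions = True
config.output_hidden_states = True

model = BertModel(config)
model.eval()

input_ids = torch.randint(0, 99, (13, 7))   # (batch_size, seq_length)
with torch.no_grad():
    outputs = model(input_ids)

sequence_output, pooled_output = outputs[:2]
attentions = outputs[-1]                    # appended last when output_attentions is on
hidden_states = outputs[-2]                 # appended before attentions when both flags are on

print(sequence_output.shape)                # torch.Size([13, 7, 32])
print(len(hidden_states))                   # num_hidden_layers + 1 (embedding output) -> 6
print(attentions[0].shape)                  # torch.Size([13, 4, 7, 7])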
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import os +import shutil +import json +import random +import uuid + +import unittest +import logging + +import torch + +from pytorch_transformers import (PretrainedConfig, PreTrainedModel, + BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + + +def _config_zero_init(config): + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if '_range' in key or '_std' in key: + setattr(configs_no_init, key, 0.0) + return configs_no_init + +class CommonTestCases: + + class CommonModelTester(unittest.TestCase): + + model_tester = None + all_model_classes = () + test_torchscript = True + test_pruning = True + test_resize_embeddings = True + test_head_masking = True + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + self.assertIn(param.data.mean().item(), [0.0, 1.0], + msg="Parameter {} of model {} seems not properly initialized".format(name, model_class)) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config) + model.eval() + outputs = model(**inputs_dict) + attentions = outputs[-1] + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, False) + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + out_len = len(outputs) + + # Check attention is always last and order is fine + config.output_attentions = True + config.output_hidden_states = True + model = model_class(config) + model.eval() + outputs = model(**inputs_dict) + self.assertEqual(out_len+1, len(outputs)) + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, True) + + attentions = outputs[-1] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + + def test_torchscript(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + self._create_and_check_torchscript(config, inputs_dict) + + def test_torchscript_output_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + config.output_attentions = True + self._create_and_check_torchscript(config, inputs_dict) + + def test_torchscript_output_hidden_state(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + config.output_hidden_states = True + self._create_and_check_torchscript(config, inputs_dict) + + def _create_and_check_torchscript(self, config, 
inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.eval() + inputs = inputs_dict['input_ids'] # Let's keep only input_ids + + try: + torch.jit.trace(model, inputs) + except RuntimeError: + self.fail("Couldn't trace module.") + + try: + traced_gpt2 = torch.jit.trace(model, inputs) + torch.jit.save(traced_gpt2, "traced_model.pt") + except RuntimeError: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load("traced_model.pt") + os.remove("traced_model.pt") + except ValueError: + self.fail("Couldn't load module.") + + model.eval() + loaded_model.eval() + + model_params = model.parameters() + loaded_model_params = loaded_model.parameters() + + models_equal = True + for p1, p2 in zip(model_params, loaded_model_params): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + + def test_headmasking(self): + if not self.test_head_masking: + return + + torch.manual_seed(42) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + config.output_attentions = True + config.output_hidden_states = True + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.eval() + + # Prepare head_mask + # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) + head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads) + head_mask[0, 0] = 0 + head_mask[-1, :-1] = 0 + head_mask.requires_grad_(requires_grad=True) + inputs = inputs_dict.copy() + inputs['head_mask'] = head_mask + + outputs = model(**inputs) + + # Test that we can get a gradient back for importance score computation + output = sum(t.sum() for t in outputs[0]) + output = output.sum() + output.backward() + multihead_outputs = head_mask.grad + + attentions = outputs[-1] + hidden_states = outputs[-2] + + # Remove Nan + + self.assertIsNotNone(multihead_outputs) + self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) + self.assertAlmostEqual( + attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual( + attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual( + attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual( + attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual( + attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + + + def test_head_pruning(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config=config) + model.eval() + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0]} + model.prune_heads(heads_to_prune) + outputs = model(**inputs_dict) + + attentions = outputs[-1] + + self.assertEqual( + attentions[0].shape[-3], 1) + self.assertEqual( + attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual( + attentions[-1].shape[-3], self.model_tester.num_attention_heads - 
1) + + def test_head_pruning_save_load_from_pretrained(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config=config) + model.eval() + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0]} + model.prune_heads(heads_to_prune) + directory = "pruned_model" + if not os.path.exists(directory): + os.makedirs(directory) + model.save_pretrained(directory) + model = model_class.from_pretrained(directory) + + outputs = model(**inputs_dict) + attentions = outputs[-1] + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + shutil.rmtree(directory) + + def test_head_pruning_save_load_from_config_init(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + config.output_attentions = True + config.output_hidden_states = False + + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0]} + config.pruned_heads = heads_to_prune + + model = model_class(config=config) + model.eval() + + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + def test_head_pruning_integration(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + config.output_attentions = True + config.output_hidden_states = False + + heads_to_prune = {0: [0], 1: [1, 2]} + config.pruned_heads = heads_to_prune + + model = model_class(config=config) + model.eval() + + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + directory = "pruned_model" + + if not os.path.exists(directory): + os.makedirs(directory) + model.save_pretrained(directory) + model = model_class.from_pretrained(directory) + shutil.rmtree(directory) + + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + heads_to_prune = {0: [0], 2: [1, 2]} + model.prune_heads(heads_to_prune) + + outputs = model(**inputs_dict) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], 
self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) + + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config.output_hidden_states = True + config.output_attentions = False + model = model_class(config) + model.eval() + outputs = model(**inputs_dict) + hidden_states = outputs[-1] + self.assertEqual(model.config.output_attentions, False) + self.assertEqual(model.config.output_hidden_states, True) + self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size]) + + def test_resize_tokens_embeddings(self): + original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + + model_vocab_size = config.vocab_size + # Retrieve the embeddings and clone them + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. 
+ models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_tie_model_weights(self): + if not self.test_torchscript: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_same_values(layer_1, layer_2): + equal = True + for p1, p2 in zip(layer_1.weight, layer_2.weight): + if p1.data.ne(p2.data).sum() > 0: + equal = False + return equal + + for model_class in self.all_model_classes: + if not hasattr(model_class, 'tie_weights'): + continue + + config.torchscript = True + model_not_tied = model_class(config) + params_not_tied = list(model_not_tied.parameters()) + + config_tied = copy.deepcopy(config) + config_tied.torchscript = False + model_tied = model_class(config_tied) + params_tied = list(model_tied.parameters()) + + # Check that the embedding layer and decoding layer are the same in size and in value + self.assertGreater(len(params_not_tied), len(params_tied)) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # # Check that after modification, they remain the same. + # embeddings.weight.data.div_(2) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # # Check that after modification, they remain the same. + # decoding.weight.data.div_(4) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # Check that after resize they remain tied. 
+ model_tied.resize_token_embeddings(config.vocab_size + 10) + params_tied_2 = list(model_tied.parameters()) + self.assertGreater(len(params_not_tied), len(params_tied)) + self.assertEqual(len(params_tied_2), len(params_tied)) + + # decoding.weight.data.mul_(20) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) + # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) + + + class GPTModelTester(CommonModelTester): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_position_ids=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + n_positions=33, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + n_choices=3, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + scope=None, + config_class=None, + base_model_class=None, + lm_head_model_class=None, + double_head_model_class=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_position_ids = use_position_ids + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.n_positions = n_positions + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.n_choices = n_choices + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + self.config_class = config_class + self.base_model_class = base_model_class + self.lm_head_model_class = lm_head_model_class + self.double_head_model_class = double_head_model_class + self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class) + + def prepare_config_and_inputs(self): + total_num_tokens = self.vocab_size + input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens) + + position_ids = None + if self.use_position_ids: + position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions) + + token_type_ids = None + if self.use_token_type_ids: + total_voc = self.vocab_size + token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc) + + mc_labels = None + lm_labels = None + mc_token_ids = None + if self.use_labels: + mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels) + mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length) + + config = self.config_class( + vocab_size_or_config_json_file=self.vocab_size, + n_positions=self.n_positions, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + initializer_range=self.initializer_range) + + return (config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids) + + def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = self.base_model_class(config) + model.eval() + + outputs = model(input_ids, position_ids, token_type_ids) + outputs = model(input_ids, position_ids) + outputs = model(input_ids) + + hidden_state = outputs[0] + self.parent.assertListEqual( + list(hidden_state.size()), + [self.batch_size, self.n_choices, 
self.seq_length, self.hidden_size]) + + + def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = self.lm_head_model_class(config) + model.eval() + outputs = model(input_ids, position_ids, token_type_ids, lm_labels) + loss, lm_logits = outputs[:2] + + total_voc = self.vocab_size + self.parent.assertListEqual( + list(lm_logits.size()), + [self.batch_size, self.n_choices, self.seq_length, total_voc]) + self.parent.assertListEqual( + list(loss.size()), + []) + + def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + for model_class in self.all_model_classes: + model = model_class(config) + model.eval() + outputs = model(input_ids) + presents = outputs[-1] + self.parent.assertEqual(self.num_hidden_layers, len(presents)) + self.parent.assertListEqual( + list(presents[0].size()), + [2, self.batch_size * self.n_choices, self.num_attention_heads, + self.seq_length, self.hidden_size // self.num_attention_heads]) + + def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids): + model = self.double_head_model_class(config) + model.eval() + outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels, + token_type_ids=token_type_ids, position_ids=position_ids) + lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] + loss = [lm_loss, mc_loss] + + total_voc = self.vocab_size + self.parent.assertListEqual( + list(lm_logits.size()), + [self.batch_size, self.n_choices, self.seq_length, total_voc]) + self.parent.assertListEqual( + list(mc_logits.size()), + [self.batch_size, self.n_choices]) + self.parent.assertListEqual( + [list(l.size()) for l in loss], + [[], []]) + + def create_and_check_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]: + model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.parent.assertIsNotNone(model) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, position_ids, + mc_labels, lm_labels, mc_token_ids) = config_and_inputs + inputs_dict = {'input_ids': input_ids} + return config, inputs_dict + + def run_common_tests(self, test_presents=False): + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_base_model(*config_and_inputs) + + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_lm_head(*config_and_inputs) + + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_double_heads(*config_and_inputs) + + if test_presents: + config_and_inputs = self.prepare_config_and_inputs() + self.create_and_check_presents(*config_and_inputs) + + def run_slow_tests(self): + self.create_and_check_model_from_pretrained() + + +class ConfigTester(object): + def __init__(self, parent, config_class=None, **kwargs): + self.parent = parent + self.config_class = config_class + self.inputs_dict = kwargs + + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, 'vocab_size')) + self.parent.assertTrue(hasattr(config, 'hidden_size')) + self.parent.assertTrue(hasattr(config, 'num_attention_heads')) + self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) + 
+ def create_and_test_config_to_json_string(self): + config = self.config_class(**self.inputs_dict) + obj = json.loads(config.to_json_string()) + for key, value in self.inputs_dict.items(): + self.parent.assertEqual(obj[key], value) + + def create_and_test_config_to_json_file(self): + config_first = self.config_class(**self.inputs_dict) + json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") + config_first.to_json_file(json_file_path) + config_second = self.config_class.from_json_file(json_file_path) + os.remove(json_file_path) + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + + def run_common_tests(self): + self.create_and_test_config_common_properties() + self.create_and_test_config_to_json_string() + self.create_and_test_config_to_json_file() + + + + +def ids_tensor(shape, vocab_size, rng=None, name=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() + + +class ModelUtilsTest(unittest.TestCase): + def test_model_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + config = BertConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, PretrainedConfig) + + model = BertModel.from_pretrained(model_name) + model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, PreTrainedModel) + for value in loading_info.values(): + self.assertEqual(len(value), 0) + + config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, True) + self.assertEqual(model.config, config) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_distilbert_test.py b/pytorch_transformers/tests/modeling_distilbert_test.py new file mode 100644 index 0000000..c1503b4 --- /dev/null +++ b/pytorch_transformers/tests/modeling_distilbert_test.py @@ -0,0 +1,215 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest + +from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM, + DistilBertForQuestionAnswering, DistilBertForSequenceClassification) + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + + +class DistilBertModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, + DistilBertForSequenceClassification) + test_pruning = True + test_torchscript = True + test_resize_embeddings = True + test_head_masking = True + + class DistilBertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = DistilBertConfig( + vocab_size_or_config_json_file=self.vocab_size, + dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + hidden_dim=self.intermediate_size, + hidden_act=self.hidden_act, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range) + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = DistilBertModel(config=config) + model.eval() + (sequence_output,) = 
model(input_ids, input_mask) + (sequence_output,) = model(input_ids) + + result = { + "sequence_output": sequence_output, + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = DistilBertForMaskedLM(config=config) + model.eval() + loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels) + result = { + "loss": loss, + "prediction_scores": prediction_scores, + } + self.parent.assertListEqual( + list(result["prediction_scores"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.check_loss_output(result) + + def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = DistilBertForQuestionAnswering(config=config) + model.eval() + loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels) + result = { + "loss": loss, + "start_logits": start_logits, + "end_logits": end_logits, + } + self.parent.assertListEqual( + list(result["start_logits"].size()), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].size()), + [self.batch_size, self.seq_length]) + self.check_loss_output(result) + + def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + config.num_labels = self.num_labels + model = DistilBertForSequenceClassification(config) + model.eval() + loss, logits = model(input_ids, input_mask, sequence_labels) + result = { + "loss": loss, + "logits": logits, + } + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.num_labels]) + self.check_loss_output(result) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = DistilBertModelTest.DistilBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_distilbert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs) + + # @pytest.mark.slow + # def test_model_from_pretrained(self): + # cache_dir = "/tmp/pytorch_transformers_test/" + # for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + # model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir) + # 
shutil.rmtree(cache_dir) + # self.assertIsNotNone(model) + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py new file mode 100644 index 0000000..2717805 --- /dev/null +++ b/pytorch_transformers/tests/modeling_gpt2_test.py @@ -0,0 +1,214 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import pytest +import shutil + + +from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, GPT2DoubleHeadsModel) + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + + +class GPT2ModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) + + class GPT2ModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = GPT2Config( + vocab_size_or_config_json_file=self.vocab_size, + 
n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = GPT2Model(config=config) + model.eval() + + model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + model(input_ids, token_type_ids=token_type_ids) + sequence_output, presents = model(input_ids) + + result = { + "sequence_output": sequence_output, + "presents": presents, + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertEqual(len(result["presents"]), config.n_layer) + + def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = GPT2LMHeadModel(config) + model.eval() + + loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = GPT2DoubleHeadsModel(config) + model.eval() + + loss, lm_logits, mc_logits, _ = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'head_mask': head_mask + } + + return config, inputs_dict + + def setUp(self): + self.model_tester = GPT2ModelTest.GPT2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gpt2_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model(*config_and_inputs) + + def test_gpt2_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gpt2_double_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = 
"/tmp/pytorch_transformers_test/" + for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py new file mode 100644 index 0000000..dbef6c5 --- /dev/null +++ b/pytorch_transformers/tests/modeling_openai_test.py @@ -0,0 +1,212 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import pytest +import shutil + + +from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + + +class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) + + class OpenAIGPTModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + 
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = OpenAIGPTConfig( + vocab_size_or_config_json_file=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTModel(config=config) + model.eval() + + model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + model(input_ids, token_type_ids=token_type_ids) + (sequence_output,) = model(input_ids) + + result = { + "sequence_output": sequence_output + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTLMHeadModel(config) + model.eval() + + loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTDoubleHeadsModel(config) + model.eval() + + loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'head_mask': head_mask + } + + return config, inputs_dict + + def setUp(self): + self.model_tester = OpenAIGPTModelTest.OpenAIGPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_openai_gpt_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs) + + def test_openai_gpt_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_openai_gpt_double_lm_head_model(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py new file mode 100644 index 0000000..0471505 --- /dev/null +++ b/pytorch_transformers/tests/modeling_roberta_test.py @@ -0,0 +1,243 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest +import torch + +from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification) +from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + + +class RobertaModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (RobertaForMaskedLM, RobertaModel) + + class RobertaModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None 
+ if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = RobertaConfig( + vocab_size_or_config_json_file=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, + token_labels, choice_labels): + model = RobertaModel(config=config) + model.eval() + sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask) + sequence_output, pooled_output = model(input_ids, token_type_ids) + sequence_output, pooled_output = model(input_ids) + + result = { + "sequence_output": sequence_output, + "pooled_output": pooled_output, + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) + + def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, + token_labels, choice_labels): + model = RobertaForMaskedLM(config=config) + model.eval() + loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels) + result = { + "loss": loss, + "prediction_scores": prediction_scores, + } + self.parent.assertListEqual( + list(result["prediction_scores"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.check_loss_output(result) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = RobertaModelTest.RobertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_roberta_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs) + + @pytest.mark.slow + def 
test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + + +class RobertaModelIntegrationTest(unittest.TestCase): + + @pytest.mark.slow + def test_inference_masked_lm(self): + model = RobertaForMaskedLM.from_pretrained('roberta-base') + + input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual( + output.shape, + expected_shape + ) + # compare the actual values for a slice. + expected_slice = torch.Tensor( + [[[33.8843, -4.3107, 22.7779], + [ 4.6533, -2.8099, 13.6252], + [ 1.8222, -3.6898, 8.8600]]] + ) + self.assertTrue( + torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) + ) + + @pytest.mark.slow + def test_inference_no_head(self): + model = RobertaModel.from_pretrained('roberta-base') + + input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + # compare the actual values for a slice. + expected_slice = torch.Tensor( + [[[-0.0231, 0.0782, 0.0074], + [-0.1854, 0.0539, -0.0174], + [ 0.0548, 0.0799, 0.1687]]] + ) + self.assertTrue( + torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) + ) + + @pytest.mark.slow + def test_inference_classification_head(self): + model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli') + + input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 3)) + self.assertEqual( + output.shape, + expected_shape + ) + expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]]) + self.assertTrue( + torch.allclose(output, expected_tensor, atol=1e-3) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py new file mode 100644 index 0000000..f482c47 --- /dev/null +++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py @@ -0,0 +1,213 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import random +import shutil +import pytest + +import torch + +from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) +from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +class TransfoXLModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + + class TransfoXLModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + mem_len=30, + clamp_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + d_embed=32, + num_attention_heads=4, + d_head=8, + d_inner=128, + div_val=2, + num_hidden_layers=5, + scope=None, + seed=1, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.mem_len = mem_len + self.key_len = seq_length + mem_len + self.clamp_len = clamp_len + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.cutoffs = cutoffs + self.hidden_size = hidden_size + self.d_embed = d_embed + self.num_attention_heads = num_attention_heads + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.num_hidden_layers = num_hidden_layers + self.scope = scope + self.seed = seed + + def prepare_config_and_inputs(self): + input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = TransfoXLConfig( + vocab_size_or_config_json_file=self.vocab_size, + mem_len=self.mem_len, + clamp_len=self.clamp_len, + cutoffs=self.cutoffs, + d_model=self.hidden_size, + d_embed=self.d_embed, + n_head=self.num_attention_heads, + d_head=self.d_head, + d_inner=self.d_inner, + div_val=self.div_val, + n_layer=self.num_hidden_layers) + + return (config, input_ids_1, input_ids_2, lm_labels) + + def set_seed(self): + random.seed(self.seed) + torch.manual_seed(self.seed) + + def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): + model = TransfoXLModel(config) + model.eval() + + hidden_states_1, mems_1 = model(input_ids_1) + hidden_states_2, mems_2 = model(input_ids_2, mems_1) + outputs = { + "hidden_states_1": hidden_states_1, + "mems_1": mems_1, + "hidden_states_2": hidden_states_2, + "mems_2": mems_2, + } + return outputs + + def check_transfo_xl_model_output(self, result): + self.parent.assertListEqual( + list(result["hidden_states_1"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual( + list(result["hidden_states_2"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual( + list(list(mem.size()) for mem in result["mems_1"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + self.parent.assertListEqual( + list(list(mem.size()) for mem in result["mems_2"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + + def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, 
lm_labels): + model = TransfoXLLMHeadModel(config) + model.eval() + + lm_logits_1, mems_1 = model(input_ids_1) + loss_1, _, mems_1 = model(input_ids_1, labels=lm_labels) + lm_logits_2, mems_2 = model(input_ids_2, mems=mems_1) + loss_2, _, mems_2 = model(input_ids_2, labels=lm_labels, mems=mems_1) + + outputs = { + "loss_1": loss_1, + "mems_1": mems_1, + "lm_logits_1": lm_logits_1, + "loss_2": loss_2, + "mems_2": mems_2, + "lm_logits_2": lm_logits_2, + } + return outputs + + def check_transfo_xl_lm_head_output(self, result): + self.parent.assertListEqual( + list(result["loss_1"].size()), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["lm_logits_1"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.size()) for mem in result["mems_1"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + self.parent.assertListEqual( + list(result["loss_2"].size()), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["lm_logits_2"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.size()) for mem in result["mems_2"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids_1} + return config, inputs_dict + + + def setUp(self): + self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self) + self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_transfo_xl_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + output_result = self.model_tester.create_transfo_xl_model(*config_and_inputs) + self.model_tester.check_transfo_xl_model_output(output_result) + + def test_transfo_xl_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs) + self.model_tester.check_transfo_xl_lm_head_output(output_result) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py new file mode 100644 index 0000000..dcd0963 --- /dev/null +++ b/pytorch_transformers/tests/modeling_xlm_test.py @@ -0,0 +1,294 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest + +from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification) +from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + + +class XLMModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (XLMModel, XLMWithLMHeadModel, + XLMForQuestionAnswering, XLMForSequenceClassification) + # , XLMForSequenceClassification, XLMForTokenClassification), + + class XLMModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_lengths=True, + use_token_type_ids=True, + use_labels=True, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=2, + vocab_size=99, + n_special=0, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + summary_type="last", + use_proj=True, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_lengths = use_input_lengths + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.asm = asm + self.n_langs = n_langs + self.vocab_size = vocab_size + self.n_special = n_special + self.summary_type = summary_type + self.causal = causal + self.use_proj = use_proj + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.n_langs = n_langs + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.summary_type = summary_type + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float() + + input_lengths = None + if self.use_input_lengths: + input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 # small variation of seq_length + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) + + sequence_labels = None + token_labels = None + is_impossible_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + is_impossible_labels = ids_tensor([self.batch_size], 2).float() + + config = XLMConfig( + vocab_size_or_config_json_file=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + 
n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj) + + return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = XLMModel(config=config) + model.eval() + outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids) + outputs = model(input_ids, langs=token_type_ids) + outputs = model(input_ids) + sequence_output = outputs[0] + result = { + "sequence_output": sequence_output, + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + + + def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = XLMWithLMHeadModel(config) + model.eval() + + loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) + + result = { + "loss": loss, + "logits": logits, + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + + def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = XLMForQuestionAnswering(config) + model.eval() + + outputs = model(input_ids) + start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs + + outputs = model(input_ids, start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask) + + outputs = model(input_ids, start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels) + + (total_loss,) = outputs + + outputs = model(input_ids, start_positions=sequence_labels, + end_positions=sequence_labels) + + (total_loss,) = outputs + + result = { + "loss": total_loss, + "start_top_log_probs": start_top_log_probs, + "start_top_index": start_top_index, + "end_top_log_probs": end_top_log_probs, + "end_top_index": end_top_index, + "cls_logits": cls_logits, + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["start_top_log_probs"].size()), + [self.batch_size, model.config.start_n_top]) + self.parent.assertListEqual( + list(result["start_top_index"].size()), + [self.batch_size, model.config.start_n_top]) + self.parent.assertListEqual( + list(result["end_top_log_probs"].size()), + [self.batch_size, model.config.start_n_top * model.config.end_n_top]) + self.parent.assertListEqual( + list(result["end_top_index"].size()), + [self.batch_size, model.config.start_n_top * model.config.end_n_top]) + self.parent.assertListEqual( + list(result["cls_logits"].size()), + 
[self.batch_size]) + + + def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = XLMForSequenceClassification(config) + model.eval() + + (logits,) = model(input_ids) + loss, logits = model(input_ids, labels=sequence_labels) + + result = { + "loss": loss, + "logits": logits, + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.type_sequence_label_size]) + + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_lengths, + sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths} + return config, inputs_dict + + def setUp(self): + self.model_tester = XLMModelTest.XLMModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_xlm_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_model(*config_and_inputs) + + # config_and_inputs = tester.prepare_config_and_inputs() + # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs) + + # config_and_inputs = tester.prepare_config_and_inputs() + # tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs) + + # config_and_inputs = tester.prepare_config_and_inputs() + # tester.create_and_check_xlm_for_question_answering(*config_and_inputs) + + # config_and_inputs = tester.prepare_config_and_inputs() + # tester.create_and_check_xlm_for_sequence_classification(*config_and_inputs) + + # config_and_inputs = tester.prepare_config_and_inputs() + # tester.create_and_check_xlm_for_token_classification(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py new file mode 100644 index 0000000..4445bc1 --- /dev/null +++ b/pytorch_transformers/tests/modeling_xlnet_test.py @@ -0,0 +1,323 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
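# Informal shape note (hedged, shapes only) for the beam-style question-answering
# outputs asserted in create_and_check_xlm_qa above; the XLNet QA test below
# checks the same layout. The model keeps the start_n_top most likely start
# positions and, for each of them, the end_n_top most likely end positions,
# which is why the end_top_* tensors are flattened to start_n_top * end_n_top
# columns per example:
#
#   start_top_log_probs : (batch_size, start_n_top)
#   start_top_index     : (batch_size, start_n_top)
#   end_top_log_probs   : (batch_size, start_n_top * end_n_top)
#   end_top_index       : (batch_size, start_n_top * end_n_top)
#   cls_logits          : (batch_size,)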
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import unittest +import json +import random +import shutil +import pytest + +import torch + +from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering) +from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +class XLNetModelTest(CommonTestCases.CommonModelTester): + + all_model_classes=(XLNetModel, XLNetLMHeadModel, + XLNetForSequenceClassification, XLNetForQuestionAnswering) + test_pruning = False + + class XLNetModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + mem_len=10, + clamp_len=-1, + reuse_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + num_attention_heads=4, + d_inner=128, + num_hidden_layers=5, + max_position_embeddings=10, + type_sequence_label_size=2, + untie_r=True, + bi_data=False, + same_length=False, + initializer_range=0.05, + seed=1, + type_vocab_size=2, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.mem_len = mem_len + # self.key_len = seq_length + mem_len + self.clamp_len = clamp_len + self.reuse_len = reuse_len + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.cutoffs = cutoffs + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.d_inner = d_inner + self.num_hidden_layers = num_hidden_layers + self.max_position_embeddings = max_position_embeddings + self.bi_data = bi_data + self.untie_r = untie_r + self.same_length = same_length + self.initializer_range = initializer_range + self.seed = seed + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + + def prepare_config_and_inputs(self): + input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float() + + input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size) + perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float) + perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token + target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float) + target_mapping[:, 0, -1] = 1.0 # predict last token + + sequence_labels = None + lm_labels = None + is_impossible_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + is_impossible_labels = ids_tensor([self.batch_size], 2).float() + + config = XLNetConfig( + vocab_size_or_config_json_file=self.vocab_size, + d_model=self.hidden_size, + n_head=self.num_attention_heads, + d_inner=self.d_inner, + n_layer=self.num_hidden_layers, + untie_r=self.untie_r, + max_position_embeddings=self.max_position_embeddings, + mem_len=self.mem_len, + clamp_len=self.clamp_len, + same_length=self.same_length, + reuse_len=self.reuse_len, + bi_data=self.bi_data, + 
initializer_range=self.initializer_range, + num_labels=self.type_sequence_label_size) + + return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels) + + def set_seed(self): + random.seed(self.seed) + torch.manual_seed(self.seed) + + def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + model = XLNetModel(config) + model.eval() + + _, _ = model(input_ids_1, input_mask=input_mask) + _, _ = model(input_ids_1, attention_mask=input_mask) + _, _ = model(input_ids_1, token_type_ids=segment_ids) + outputs, mems_1 = model(input_ids_1) + + result = { + "mems_1": mems_1, + "outputs": outputs, + } + + self.parent.assertListEqual( + list(result["outputs"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual( + list(list(mem.size()) for mem in result["mems_1"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + model = XLNetLMHeadModel(config) + model.eval() + + loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels) + + loss_2, all_logits_2, mems_2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1) + + logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping) + + result = { + "loss_1": loss_1, + "mems_1": mems_1, + "all_logits_1": all_logits_1, + "loss_2": loss_2, + "mems_2": mems_2, + "all_logits_2": all_logits_2, + } + + self.parent.assertListEqual( + list(result["loss_1"].size()), + []) + self.parent.assertListEqual( + list(result["all_logits_1"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.size()) for mem in result["mems_1"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + self.parent.assertListEqual( + list(result["loss_2"].size()), + []) + self.parent.assertListEqual( + list(result["all_logits_2"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.size()) for mem in result["mems_2"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + model = XLNetForQuestionAnswering(config) + model.eval() + + outputs = model(input_ids_1) + start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs + + outputs = model(input_ids_1, start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask) + + outputs = model(input_ids_1, start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels) + + total_loss, mems = outputs + + outputs = model(input_ids_1, start_positions=sequence_labels, + end_positions=sequence_labels) + + total_loss, mems = outputs + + result = { + "loss": total_loss, + "start_top_log_probs": start_top_log_probs, + 
"start_top_index": start_top_index, + "end_top_log_probs": end_top_log_probs, + "end_top_index": end_top_index, + "cls_logits": cls_logits, + "mems": mems, + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["start_top_log_probs"].size()), + [self.batch_size, model.config.start_n_top]) + self.parent.assertListEqual( + list(result["start_top_index"].size()), + [self.batch_size, model.config.start_n_top]) + self.parent.assertListEqual( + list(result["end_top_log_probs"].size()), + [self.batch_size, model.config.start_n_top * model.config.end_n_top]) + self.parent.assertListEqual( + list(result["end_top_index"].size()), + [self.batch_size, model.config.start_n_top * model.config.end_n_top]) + self.parent.assertListEqual( + list(result["cls_logits"].size()), + [self.batch_size]) + self.parent.assertListEqual( + list(list(mem.size()) for mem in result["mems"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + model = XLNetForSequenceClassification(config) + model.eval() + + logits, mems_1 = model(input_ids_1) + loss, logits, mems_1 = model(input_ids_1, labels=sequence_labels) + + result = { + "loss": loss, + "mems_1": mems_1, + "logits": logits, + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.type_sequence_label_size]) + self.parent.assertListEqual( + list(list(mem.size()) for mem in result["mems_1"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, + target_mapping, segment_ids, lm_labels, + sequence_labels, is_impossible_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids_1} + return config, inputs_dict + + + def setUp(self): + self.model_tester = XLNetModelTest.XLNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_xlnet_base_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs) + + def test_xlnet_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) + + def test_xlnet_sequence_classif(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs) + + def test_xlnet_qa(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_qa(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + 
+if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/optimization_test.py b/pytorch_transformers/tests/optimization_test.py new file mode 100644 index 0000000..0146541 --- /dev/null +++ b/pytorch_transformers/tests/optimization_test.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import os + +import torch + +from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, + WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) + +from .tokenization_tests_commons import TemporaryDirectory + + +def unwrap_schedule(scheduler, num_steps=10): + lrs = [] + for _ in range(num_steps): + scheduler.step() + lrs.append(scheduler.get_lr()) + return lrs + +def unwrap_and_save_reload_schedule(scheduler, num_steps=10): + lrs = [] + for step in range(num_steps): + scheduler.step() + lrs.append(scheduler.get_lr()) + if step == num_steps // 2: + with TemporaryDirectory() as tmpdirname: + file_name = os.path.join(tmpdirname, 'schedule.bin') + torch.save(scheduler.state_dict(), file_name) + + state_dict = torch.load(file_name) + scheduler.load_state_dict(state_dict) + return lrs + +class OptimizationTest(unittest.TestCase): + + def assertListAlmostEqual(self, list1, list2, tol): + self.assertEqual(len(list1), len(list2)) + for a, b in zip(list1, list2): + self.assertAlmostEqual(a, b, delta=tol) + + def test_adam_w(self): + w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) + target = torch.tensor([0.4, 0.2, -0.5]) + criterion = torch.nn.MSELoss() + # No warmup, constant schedule, no gradient clipping + optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0) + for _ in range(100): + loss = criterion(w, target) + loss.backward() + optimizer.step() + w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. + w.grad.zero_() + self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) + + +class ScheduleInitTest(unittest.TestCase): + m = torch.nn.Linear(50, 50) + optimizer = AdamW(m.parameters(), lr=10.) + num_steps = 10 + + def assertListAlmostEqual(self, list1, list2, tol): + self.assertEqual(len(list1), len(list2)) + for a, b in zip(list1, list2): + self.assertAlmostEqual(a, b, delta=tol) + + def test_constant_scheduler(self): + scheduler = ConstantLRSchedule(self.optimizer) + lrs = unwrap_schedule(scheduler, self.num_steps) + expected_learning_rates = [10.] 
* self.num_steps + self.assertEqual(len(lrs[0]), 1) + self.assertListEqual([l[0] for l in lrs], expected_learning_rates) + + scheduler = ConstantLRSchedule(self.optimizer) + lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) + self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) + + def test_warmup_constant_scheduler(self): + scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) + lrs = unwrap_schedule(scheduler, self.num_steps) + expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0] + self.assertEqual(len(lrs[0]), 1) + self.assertListEqual([l[0] for l in lrs], expected_learning_rates) + + scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) + lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) + self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) + + def test_warmup_linear_scheduler(self): + scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) + lrs = unwrap_schedule(scheduler, self.num_steps) + expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0] + self.assertEqual(len(lrs[0]), 1) + self.assertListEqual([l[0] for l in lrs], expected_learning_rates) + + scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) + lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) + self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) + + def test_warmup_cosine_scheduler(self): + scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) + lrs = unwrap_schedule(scheduler, self.num_steps) + expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0] + self.assertEqual(len(lrs[0]), 1) + self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) + + scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) + lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) + self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) + + def test_warmup_cosine_hard_restart_scheduler(self): + scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) + lrs = unwrap_schedule(scheduler, self.num_steps) + expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0] + self.assertEqual(len(lrs[0]), 1) + self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) + + scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) + lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) + self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_auto_test.py b/pytorch_transformers/tests/tokenization_auto_test.py new file mode 100644 index 0000000..f4f8208 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_auto_test.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest +import logging + +from pytorch_transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer +from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP +from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP + + +class AutoTokenizerTest(unittest.TestCase): + def test_tokenizer_from_pretrained(self): + logging.basicConfig(level=logging.INFO) + for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + tokenizer = AutoTokenizer.from_pretrained(model_name) + self.assertIsNotNone(tokenizer) + self.assertIsInstance(tokenizer, BertTokenizer) + self.assertGreater(len(tokenizer), 0) + + for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + tokenizer = AutoTokenizer.from_pretrained(model_name) + self.assertIsNotNone(tokenizer) + self.assertIsInstance(tokenizer, GPT2Tokenizer) + self.assertGreater(len(tokenizer), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py new file mode 100644 index 0000000..1111683 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_bert_test.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest +from io import open + +from pytorch_transformers.tokenization_bert import (BasicTokenizer, + BertTokenizer, + WordpieceTokenizer, + _is_control, _is_punctuation, + _is_whitespace, VOCAB_FILES_NAMES) + +from .tokenization_tests_commons import CommonTestCases + +class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): + + tokenizer_class = BertTokenizer + + def setUp(self): + super(BertTokenizationTest, self).setUp() + + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing", ",", "low", "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) + with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizer(self, **kwargs): + return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self): + input_text = u"UNwant\u00E9d,running" + output_text = u"unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_chinese(self): + tokenizer = BasicTokenizer() + + self.assertListEqual( + tokenizer.tokenize(u"ah\u535A\u63A8zz"), + [u"ah", u"\u535A", u"\u63A8", u"zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = BasicTokenizer(do_lower_case=False) + + self.assertListEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual( + tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertListEqual( + tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_is_whitespace(self): + self.assertTrue(_is_whitespace(u" ")) + self.assertTrue(_is_whitespace(u"\t")) + self.assertTrue(_is_whitespace(u"\r")) + self.assertTrue(_is_whitespace(u"\n")) + self.assertTrue(_is_whitespace(u"\u00A0")) + + self.assertFalse(_is_whitespace(u"A")) + self.assertFalse(_is_whitespace(u"-")) + + def test_is_control(self): + self.assertTrue(_is_control(u"\u0005")) + + self.assertFalse(_is_control(u"A")) + self.assertFalse(_is_control(u" ")) + self.assertFalse(_is_control(u"\t")) + self.assertFalse(_is_control(u"\r")) + + def test_is_punctuation(self): + self.assertTrue(_is_punctuation(u"-")) + self.assertTrue(_is_punctuation(u"$")) + self.assertTrue(_is_punctuation(u"`")) + self.assertTrue(_is_punctuation(u".")) + + self.assertFalse(_is_punctuation(u"A")) + self.assertFalse(_is_punctuation(u" ")) + + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") + + text = tokenizer.encode("sequence builders") + text_2 = tokenizer.encode("multi-sequence build") + + encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) + encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + + assert encoded_sentence == [101] + text + [102] + assert encoded_pair == [101] + text + [102] + text_2 + [102] + +if __name__ == '__main__': + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_dilbert_test.py b/pytorch_transformers/tests/tokenization_dilbert_test.py new file mode 100644 index 0000000..42f8060 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_dilbert_test.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest +from io import open + +from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer) + +from .tokenization_tests_commons import CommonTestCases +from .tokenization_bert_test import BertTokenizationTest + +class DistilBertTokenizationTest(BertTokenizationTest): + + tokenizer_class = DistilBertTokenizer + + def get_tokenizer(self, **kwargs): + return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def test_sequence_builders(self): + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + + text = tokenizer.encode("sequence builders") + text_2 = tokenizer.encode("multi-sequence build") + + encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) + encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + + assert encoded_sentence == [101] + text + [102] + assert encoded_pair == [101] + text + [102] + text_2 + [102] + +if __name__ == '__main__': + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py new file mode 100644 index 0000000..8ee9cb0 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest +import json +from io import open + +from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES + +from .tokenization_tests_commons import CommonTestCases + +class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): + + tokenizer_class = GPT2Tokenizer + + def setUp(self): + super(GPT2TokenizationTest, self).setUp() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", + "\u0120", "\u0120l", "\u0120n", + "\u0120lo", "\u0120low", "er", + "\u0120lowest", "\u0120newer", "\u0120wider", ""] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + self.special_tokens_map = {"unk_token": ""} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self): + input_text = u"lower newer" + output_text = u" lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual( + tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + +if __name__ == '__main__': + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py new file mode 100644 index 0000000..6b86416 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_openai_test.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest +import json + +from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES + +from .tokenization_tests_commons import CommonTestCases + + +class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): + + tokenizer_class = OpenAIGPTTokenizer + + def setUp(self): + super(OpenAIGPTTokenizationTest, self).setUp() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", + "w", "r", "t", + "lo", "low", "er", + "low", "lowest", "newer", "wider", ""] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + with open(self.vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(self.merges_file, "w") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self): + input_text = u"lower newer" + output_text = u"lower newer" + return input_text, output_text + + + def test_full_tokenizer(self): + tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file) + + text = "lower" + bpe_tokens = ["low", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [""] + input_bpe_tokens = [14, 15, 20] + self.assertListEqual( + tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + +if __name__ == '__main__': + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py new file mode 100644 index 0000000..8add252 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -0,0 +1,98 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import json +import unittest +from io import open + +from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES +from .tokenization_tests_commons import CommonTestCases + + +class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): + tokenizer_class = RobertaTokenizer + + def setUp(self): + super(RobertaTokenizationTest, self).setUp() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", + "\u0120", "\u0120l", "\u0120n", + "\u0120lo", "\u0120low", "er", + "\u0120lowest", "\u0120newer", "\u0120wider", ""] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + self.special_tokens_map = {"unk_token": ""} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self): + input_text = u"lower newer" + output_text = u" lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual( + tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def roberta_dict_integration_testing(self): + tokenizer = self.get_tokenizer() + + self.assertListEqual( + tokenizer.encode('Hello world!'), + [0, 31414, 232, 328, 2] + ) + self.assertListEqual( + tokenizer.encode('Hello world! cécé herlolip 418'), + [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] + ) + + def test_sequence_builders(self): + tokenizer = RobertaTokenizer.from_pretrained("roberta-base") + + text = tokenizer.encode("sequence builders") + text_2 = tokenizer.encode("multi-sequence build") + + encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) + encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) + + encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) + encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + + assert encoded_sentence == encoded_text_from_decode + assert encoded_pair == encoded_pair_from_decode + + +if __name__ == '__main__': + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py new file mode 100644 index 0000000..3da0494 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -0,0 +1,188 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
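# A simplified, illustrative re-implementation (one pair merged per step) of the
# byte-level BPE behaviour behind the GPT-2 and RoBERTa fixtures above; it is a
# sketch, not the library's algorithm, but on the toy merge table it reproduces
# the bpe_tokens asserted in the two test_full_tokenizer methods.
def apply_merges(symbols, merges):
    # Rank each merge rule by its position in the merge table (earlier = higher priority).
    ranks = {tuple(m.split()): i for i, m in enumerate(merges)}
    while len(symbols) > 1:
        # Find the adjacent pair with the best (lowest) merge rank.
        candidates = [(ranks.get((a, b), float("inf")), i)
                      for i, (a, b) in enumerate(zip(symbols, symbols[1:]))]
        best_rank, best_i = min(candidates)
        if best_rank == float("inf"):
            break
        symbols = (symbols[:best_i]
                   + [symbols[best_i] + symbols[best_i + 1]]
                   + symbols[best_i + 2:])
    return symbols

toy_merges = ["\u0120 l", "\u0120l o", "\u0120lo w", "e r"]
print(apply_merges(list("\u0120lower"), toy_merges))  # ['Ġlow', 'er']
print(apply_merges(list("\u0120newer"), toy_merges))  # ['Ġ', 'n', 'e', 'w', 'er']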
+from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import sys +from io import open +import tempfile +import shutil +import unittest + +if sys.version_info[0] == 2: + import cPickle as pickle + + class TemporaryDirectory(object): + """Context manager for tempfile.mkdtemp() so it's usable with "with" statement.""" + def __enter__(self): + self.name = tempfile.mkdtemp() + return self.name + def __exit__(self, exc_type, exc_value, traceback): + shutil.rmtree(self.name) +else: + import pickle + TemporaryDirectory = tempfile.TemporaryDirectory + unicode = str + + +class CommonTestCases: + + class CommonTokenizerTester(unittest.TestCase): + + tokenizer_class = None + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + raise NotImplementedError + + def get_input_output_texts(self): + raise NotImplementedError + + def test_tokenizers_common_properties(self): + tokenizer = self.get_tokenizer() + attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token", + "pad_token", "cls_token", "mask_token"] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) + self.assertTrue(hasattr(tokenizer, attr + "_id")) + + self.assertTrue(hasattr(tokenizer, "additional_special_tokens")) + self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids')) + + attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", + "added_tokens_decoder"] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) + + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizer = self.get_tokenizer() + self.assertNotEqual(tokenizer.max_len, 42) + + # Now let's start the test + tokenizer = self.get_tokenizer(max_len=42) + + before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running") + + with TemporaryDirectory() as tmpdirname: + tokenizer.save_pretrained(tmpdirname) + tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) + + after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running") + self.assertListEqual(before_tokens, after_tokens) + + self.assertEqual(tokenizer.max_len, 42) + tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43) + self.assertEqual(tokenizer.max_len, 43) + + def test_pickle_tokenizer(self): + tokenizer = self.get_tokenizer() + self.assertIsNotNone(tokenizer) + + text = u"Munich and Berlin are nice cities" + subwords = tokenizer.tokenize(text) + + with TemporaryDirectory() as tmpdirname: + + filename = os.path.join(tmpdirname, u"tokenizer.bin") + pickle.dump(tokenizer, open(filename, "wb")) + + tokenizer_new = pickle.load(open(filename, "rb")) + + subwords_loaded = tokenizer_new.tokenize(text) + + self.assertListEqual(subwords, subwords_loaded) + + + def test_add_tokens_tokenizer(self): + tokenizer = self.get_tokenizer() + + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) + + self.assertNotEqual(vocab_size, 0) + self.assertEqual(vocab_size, all_size) + + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) + + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) + + tokens = tokenizer.encode("aaaaa bbbbbb low 
cccccccccdddddddd l") + out_string = tokenizer.decode(tokens) + + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + + new_toks_2 = {'eos_token': ">>>>|||<||<<|<<", + 'pad_token': "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) + + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + + tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l") + out_string = tokenizer.decode(tokens) + + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokens[-3]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) + + + def test_required_methods_tokenizer(self): + tokenizer = self.get_tokenizer() + input_text, output_text = self.get_input_output_texts() + + tokens = tokenizer.tokenize(input_text) + ids = tokenizer.convert_tokens_to_ids(tokens) + ids_2 = tokenizer.encode(input_text) + self.assertListEqual(ids, ids_2) + + tokens_2 = tokenizer.convert_ids_to_tokens(ids) + text_2 = tokenizer.decode(ids) + + self.assertEqual(text_2, output_text) + + self.assertNotEqual(len(tokens_2), 0) + self.assertIsInstance(text_2, (str, unicode)) + + + def test_pretrained_model_lists(self): + weights_list = list(self.tokenizer_class.max_model_input_sizes.keys()) + weights_lists_2 = [] + for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items(): + weights_lists_2.append(list(map_list.keys())) + + for weights_list_2 in weights_lists_2: + self.assertListEqual(weights_list, weights_list_2) diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py new file mode 100644 index 0000000..f881cf1 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py @@ -0,0 +1,74 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
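# A hedged sketch of the add_tokens contract exercised in
# test_add_tokens_tokenizer above: tokens added at runtime are appended after
# the fixed vocabulary, so vocab_size stays constant while len(tokenizer) grows
# and the new ids start at vocab_size. bert-base-uncased is used purely as an
# illustrative pretrained shortcut (needs network access or a local cache), and
# the two token strings are arbitrary examples.
from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
base_vocab_size = tokenizer.vocab_size

num_added = tokenizer.add_tokens(["new_tok_1", "new_tok_2"])
assert num_added == 2
assert tokenizer.vocab_size == base_vocab_size           # fixed vocab unchanged
assert len(tokenizer) == base_vocab_size + 2             # added tokens counted here
assert tokenizer.convert_tokens_to_ids("new_tok_1") >= base_vocab_size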
+from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest +from io import open + +from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES + +from.tokenization_tests_commons import CommonTestCases + +class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester): + + tokenizer_class = TransfoXLTokenizer + + def setUp(self): + super(TransfoXLTokenizationTest, self).setUp() + + vocab_tokens = [ + "", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", + "running", ",", "low", "l", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) + with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizer(self, **kwargs): + kwargs['lower_case'] = True + return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self): + input_text = u" UNwanted , running" + output_text = u" unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True) + + tokens = tokenizer.tokenize(u" UNwanted , running") + self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) + + def test_full_tokenizer_lower(self): + tokenizer = TransfoXLTokenizer(lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), + ["hello", "!", "how", "are", "you", "?"]) + + def test_full_tokenizer_no_lower(self): + tokenizer = TransfoXLTokenizer(lower_case=False) + + self.assertListEqual( + tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + +if __name__ == '__main__': + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_utils_test.py b/pytorch_transformers/tests/tokenization_utils_test.py new file mode 100644 index 0000000..26ec2d7 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_utils_test.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright 2018 HuggingFace Inc.. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import six + +from pytorch_transformers import PreTrainedTokenizer +from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer + +class TokenizerUtilsTest(unittest.TestCase): + def check_tokenizer_from_pretrained(self, tokenizer_class): + s3_models = list(tokenizer_class.max_model_input_sizes.keys()) + for model_name in s3_models[:1]: + tokenizer = tokenizer_class.from_pretrained(model_name) + self.assertIsNotNone(tokenizer) + self.assertIsInstance(tokenizer, tokenizer_class) + self.assertIsInstance(tokenizer, PreTrainedTokenizer) + + for special_tok in tokenizer.all_special_tokens: + if six.PY2: + self.assertIsInstance(special_tok, unicode) + else: + self.assertIsInstance(special_tok, str) + special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) + self.assertIsInstance(special_tok_id, int) + + def test_pretrained_tokenizers(self): + self.check_tokenizer_from_pretrained(GPT2Tokenizer) + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py new file mode 100644 index 0000000..43f1e0c --- /dev/null +++ b/pytorch_transformers/tests/tokenization_xlm_test.py @@ -0,0 +1,82 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest +import json + +from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES + +from .tokenization_tests_commons import CommonTestCases + +class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): + + tokenizer_class = XLMTokenizer + + def setUp(self): + super(XLMTokenizationTest, self).setUp() + + # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", + "w", "r", "t", + "lo", "low", "er", + "low", "lowest", "newer", "wider", ""] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["l o 123", "lo w 1456", "e r 1789", ""] + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + with open(self.vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(self.merges_file, "w") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self): + input_text = u"lower newer" + output_text = u"lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + """ Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt """ + tokenizer = XLMTokenizer(self.vocab_file, self.merges_file) + + text = "lower" + bpe_tokens = ["low", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [""] + input_bpe_tokens = [14, 15, 20] + self.assertListEqual( + tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_sequence_builders(self): + tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048") + + text = tokenizer.encode("sequence builders") + text_2 = tokenizer.encode("multi-sequence build") + + encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) + encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + + assert encoded_sentence == [1] + text + [1] + assert encoded_pair == [1] + text + [1] + text_2 + [1] + +if __name__ == '__main__': + unittest.main() diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py new file mode 100644 index 0000000..c603ce5 --- /dev/null +++ b/pytorch_transformers/tests/tokenization_xlnet_test.py @@ -0,0 +1,106 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest + +from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE) + +from .tokenization_tests_commons import CommonTestCases + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'fixtures/test_sentencepiece.model') + +class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): + + tokenizer_class = XLNetTokenizer + + def setUp(self): + super(XLNetTokenizationTest, self).setUp() + + # We have a SentencePiece fixture for testing + tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self): + input_text = u"This is a test" + output_text = u"This is a test" + return input_text, output_text + + + def test_full_tokenizer(self): + tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize(u'This is a test') + self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) + + tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") + self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', + u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', + u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', + SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, [8, 21, 84, 55, 24, 19, 7, 0, + 602, 347, 347, 347, 3, 12, 66, + 46, 72, 80, 6, 0, 4]) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', + u'or', u'n', SPIECE_UNDERLINE + u'in', + SPIECE_UNDERLINE + u'', u'', u'2', u'0', u'0', u'0', u',', + SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', + SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', + u'', u'.']) + + def test_tokenizer_lower(self): + tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True) + tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") + self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', + u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', + u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', + SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) + self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"]) + + def test_tokenizer_no_lower(self): + tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) + tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") + self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', + u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', + u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', + SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) + + def test_sequence_builders(self): + tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") + + text = tokenizer.encode("sequence builders") + text_2 = 
tokenizer.encode("multi-sequence build") + + encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) + encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + + assert encoded_sentence == text + [4, 3] + assert encoded_pair == text + [4] + text_2 + [4, 3] + + +if __name__ == '__main__': + unittest.main() diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py new file mode 100644 index 0000000..889774b --- /dev/null +++ b/pytorch_transformers/tokenization_auto.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Model class. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging + +from .tokenization_bert import BertTokenizer +from .tokenization_openai import OpenAIGPTTokenizer +from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_transfo_xl import TransfoXLTokenizer +from .tokenization_xlnet import XLNetTokenizer +from .tokenization_xlm import XLMTokenizer +from .tokenization_roberta import RobertaTokenizer +from .tokenization_distilbert import DistilBertTokenizer + +logger = logging.getLogger(__name__) + +class AutoTokenizer(object): + r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class + that will be instantiated as one of the tokenizer classes of the library + when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method take care of returning the correct tokenizer class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The tokenizer class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertTokenizer (DistilBert model) + - contains `roberta`: RobertaTokenizer (RoBERTa model) + - contains `bert`: BertTokenizer (Bert model) + - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) + - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) + - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) + - contains `xlnet`: XLNetTokenizer (XLNet model) + - contains `xlm`: XLMTokenizer (XLM model) + + This class cannot be instantiated using `__init__()` (throw an error). + """ + def __init__(self): + raise EnvironmentError("AutoTokenizer is designed to be instantiated " + "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + r""" Instantiate a one of the tokenizer classes of the library + from a pre-trained model vocabulary. 
+
+        The tokenizer class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
+            - contains `bert`: BertTokenizer (Bert model)
+            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
+            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
+            - contains `xlnet`: XLNetTokenizer (XLNet model)
+            - contains `xlm`: XLMTokenizer (XLM model)
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the vocabulary files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
+
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
+
+        Examples::
+
+            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
+            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g.
tokenizer was saved using `save_pretrained('./test/saved_model/')` + + """ + if 'distilbert' in pretrained_model_name_or_path: + return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'openai-gpt' in pretrained_model_name_or_path: + return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'gpt2' in pretrained_model_name_or_path: + return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'transfo-xl' in pretrained_model_name_or_path: + return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'xlm' in pretrained_model_name_or_path: + return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta'".format(pretrained_model_name_or_path)) diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py new file mode 100644 index 0000000..b85a4cc --- /dev/null +++ b/pytorch_transformers/tokenization_bert.py @@ -0,0 +1,457 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
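Because `AutoTokenizer.from_pretrained` dispatches on plain substring checks, the order of the branches above matters: shortcut names such as `roberta-base` and `distilbert-base-uncased` also contain the substring `bert`, so those branches must run before the generic `bert` branch. A minimal usage sketch follows; it is illustrative only and assumes the shortcut names shown resolve to downloadable vocabularies.

# Illustrative sketch: substring dispatch in AutoTokenizer.from_pretrained.
from pytorch_transformers.tokenization_auto import AutoTokenizer

print(type(AutoTokenizer.from_pretrained("distilbert-base-uncased")).__name__)  # DistilBertTokenizer
print(type(AutoTokenizer.from_pretrained("roberta-base")).__name__)             # RobertaTokenizer, not BertTokenizer
print(type(AutoTokenizer.from_pretrained("bert-base-uncased")).__name__)        # BertTokenizer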
+"""Tokenization classes.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import logging +import os +import unicodedata +from io import open + +from .tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'bert-base-uncased': 512, + 'bert-large-uncased': 512, + 'bert-base-cased': 512, + 'bert-large-cased': 512, + 'bert-base-multilingual-uncased': 512, + 'bert-base-multilingual-cased': 512, + 'bert-base-chinese': 512, + 'bert-base-german-cased': 512, + 'bert-large-uncased-whole-word-masking': 512, + 'bert-large-cased-whole-word-masking': 512, + 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, + 'bert-large-cased-whole-word-masking-finetuned-squad': 512, + 'bert-base-cased-finetuned-mrpc': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'bert-base-uncased': {'do_lower_case': True}, + 'bert-large-uncased': {'do_lower_case': True}, + 'bert-base-cased': {'do_lower_case': False}, + 'bert-large-cased': {'do_lower_case': False}, + 'bert-base-multilingual-uncased': {'do_lower_case': True}, + 'bert-base-multilingual-cased': {'do_lower_case': False}, + 'bert-base-chinese': {'do_lower_case': False}, + 'bert-base-german-cased': {'do_lower_case': False}, + 'bert-large-uncased-whole-word-masking': {'do_lower_case': True}, + 'bert-large-cased-whole-word-masking': {'do_lower_case': False}, + 'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True}, + 'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False}, + 'bert-base-cased-finetuned-mrpc': {'do_lower_case': False}, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a 
dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip('\n') + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(PreTrainedTokenizer): + r""" + Constructs a BertTokenizer. + :class:`~pytorch_transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the + minimum of this value (if specified) and the underlying BERT model's sequence length. + never_split: List of tokens which will never be split during tokenization. Only has an effect when + do_wordpiece_only=False + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, + unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", + mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs): + """Constructs a BertTokenizer. + + Args: + **vocab_file**: Path to a one-wordpiece-per-line vocabulary file + **do_lower_case**: (`optional`) boolean (default True) + Whether to lower case the input + Only has an effect when do_basic_tokenize=True + **do_basic_tokenize**: (`optional`) boolean (default True) + Whether to do basic tokenization before wordpiece. + **never_split**: (`optional`) list of string + List of tokens which will never be split during tokenization. + Only has an effect when do_basic_tokenize=True + **tokenize_chinese_chars**: (`optional`) boolean (default True) + Whether to tokenize Chinese characters. + This should likely be deactivated for Japanese: + see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 + """ + super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, + pad_token=pad_token, cls_token=cls_token, + mask_token=mask_token, **kwargs) + self.max_len_single_sentence = self.max_len - 2 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + @property + def vocab_size(self): + return len(self.vocab) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """ Converts a token (str/unicode) in an id using the vocab. """ + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (string/unicode) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + out_string = ' '.join(tokens).replace(' ##', '').strip() + return out_string + + def add_special_tokens_single_sentence(self, token_ids): + """ + Adds special tokens to the a sequence for sequence classification tasks. + A BERT sequence has the following format: [CLS] X [SEP] + """ + return [self.cls_token_id] + token_ids + [self.sep_token_id] + + def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): + """ + Adds special tokens to a sequence pair for sequence classification tasks. + A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP] + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a directory or file.""" + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) + else: + vocab_file = vocab_path + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + return (vocab_file,) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True): + """ Constructs a BasicTokenizer. + + Args: + **do_lower_case**: Whether to lower case the input. + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + **tokenize_chinese_chars**: (`optional`) boolean (default True) + Whether to tokenize Chinese characters. 
+ This should likely be deactivated for Japanese: + see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 + """ + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = never_split + self.tokenize_chinese_chars = tokenize_chinese_chars + + def tokenize(self, text, never_split=None): + """ Basic Tokenization of a piece of text. + Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. + + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + """ + never_split = self.never_split + (never_split if never_split is not None else []) + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
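+            # The ranges below are, in order: CJK Unified Ideographs, Extension A,
+            # Extensions B through E, CJK Compatibility Ideographs, and the
+            # CJK Compatibility Ideographs Supplement.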
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
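+    # The ASCII ranges below are 33-47, 58-64, 91-96 and 123-126, i.e. every
+    # printable non-alphanumeric ASCII character other than the space.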
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/pytorch_transformers/tokenization_distilbert.py b/pytorch_transformers/tokenization_distilbert.py new file mode 100644 index 0000000..5a6d02f --- /dev/null +++ b/pytorch_transformers/tokenization_distilbert.py @@ -0,0 +1,62 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for DistilBERT.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import logging +import os +import unicodedata +from io import open + +from .tokenization_bert import BertTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'distilbert-base-uncased': 512, + 'distilbert-base-uncased-distilled-squad': 512, +} + + +class DistilBertTokenizer(BertTokenizer): + r""" + Constructs a DistilBertTokenizer. + :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the + minimum of this value (if specified) and the underlying BERT model's sequence length. + never_split: List of tokens which will never be split during tokenization. Only has an effect when + do_wordpiece_only=False + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py new file mode 100644 index 0000000..4ebe1ad --- /dev/null +++ b/pytorch_transformers/tokenization_gpt2.py @@ -0,0 +1,224 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. + def lru_cache(): + return lambda func: func + +from .tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'vocab.json', + 'merges_file': 'merges.txt', +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", + 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", + }, + 'merges_file': + { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", + 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'gpt2': 1024, + 'gpt2-medium': 1024, + 'gpt2-large': 1024, +} + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. + We specifically avoids mapping to whitespace/control characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +class GPT2Tokenizer(PreTrainedTokenizer): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => will add a space is there isn't. 
+ As a consequence, this tokenizer `encode` and `decode` method will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello" + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>", + bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): + super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) + self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + + self.encoder = json.load(open(vocab_file, encoding="utf-8")) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + @property + def vocab_size(self): + return len(self.encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """ Tokenize a string. """ + text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with. + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def _convert_token_to_id(self, token): + """ Converts a token (str/unicode) in an id using the vocab. 
""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (string/unicode) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + text = ''.join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + + def save_vocabulary(self, save_directory): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + return vocab_file, merge_file \ No newline at end of file diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py new file mode 100644 index 0000000..0efbdb3 --- /dev/null +++ b/pytorch_transformers/tokenization_openai.py @@ -0,0 +1,208 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import json +import logging +import os +import re +from io import open + +from .tokenization_utils import PreTrainedTokenizer +from .tokenization_bert import BasicTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'vocab.json', + 'merges_file': 'merges.txt', +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", + }, + 'merges_file': + { + 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'openai-gpt': 512, +} + +def get_pairs(word): + """ + Return set of symbol pairs in a word. 
+ word is represented as tuple of symbols (symbols being variable-length strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +def text_standardize(text): + """ + fixes some issues the spacy tokenizer had on books corpus + also does some whitespace standardization + """ + text = text.replace('—', '-') + text = text.replace('–', '-') + text = text.replace('―', '-') + text = text.replace('…', '...') + text = text.replace('´', "'") + text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) + text = re.sub(r'\s*\n\s*', ' \n ', text) + text = re.sub(r'[^\S\n]+', ' ', text) + return text.strip() + +class OpenAIGPTTokenizer(PreTrainedTokenizer): + """ + BPE tokenizer. Peculiarities: + - lower case all inputs + - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): + super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs) + + self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + + try: + import ftfy + from spacy.lang.en import English + _nlp = English() + self.nlp = _nlp.Defaults.create_tokenizer(_nlp) + self.fix_text = ftfy.fix_text + except ImportError: + logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") + self.nlp = BasicTokenizer(do_lower_case=True) + self.fix_text = None + + self.encoder = json.load(open(vocab_file, encoding="utf-8")) + self.decoder = {v:k for k,v in self.encoder.items()} + merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + merges = [tuple(merge.split()) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + @property + def vocab_size(self): + return len(self.encoder) + + def bpe(self, token): + word = tuple(token[:-1]) + (token[-1] + '',) + if token in self.cache: + return self.cache[token] + pairs = get_pairs(word) + + if not pairs: + return token+'' + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + if word == '\n ': + word = '\n' + self.cache[token] = word + return word + + def _tokenize(self, text): + """ Tokenize a string. 
""" + split_tokens = [] + if self.fix_text is None: + # Using BERT's BasicTokenizer + text = self.nlp.tokenize(text) + for token in text: + split_tokens.extend([t for t in self.bpe(token).split(' ')]) + else: + # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) + text = self.nlp(text_standardize(self.fix_text(text))) + for token in text: + split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) + return split_tokens + + def _convert_token_to_id(self, token): + """ Converts a token (str/unicode) in an id using the vocab. """ + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an id in a token (BPE) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + out_string = ''.join(tokens).replace('', ' ').strip() + return out_string + + def save_vocabulary(self, save_directory): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + return vocab_file, merge_file diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py new file mode 100644 index 0000000..6780875 --- /dev/null +++ b/pytorch_transformers/tokenization_roberta.py @@ -0,0 +1,98 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for RoBERTa.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +from .tokenization_gpt2 import GPT2Tokenizer + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
+ def lru_cache(): + return lambda func: func + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'vocab.json', + 'merges_file': 'merges.txt', +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", + }, + 'merges_file': + { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'roberta-base': 512, + 'roberta-large': 512, + 'roberta-large-mnli': 512, +} + + +class RobertaTokenizer(GPT2Tokenizer): + """ + RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => will add a space is there isn't. + As a consequence, this tokenizer `encode` and `decode` method will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello" + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, merges_file, errors='replace', bos_token="", eos_token="", sep_token="", + cls_token="", unk_token="", pad_token='', mask_token='', **kwargs): + super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, + bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, + sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, + mask_token=mask_token, **kwargs) + + def add_special_tokens_single_sentence(self, token_ids): + """ + Adds special tokens to a sequence for sequence classification tasks. + A RoBERTa sequence has the following format: X + """ + return [self.cls_token_id] + token_ids + [self.sep_token_id] + + def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): + """ + Adds special tokens to a sequence pair for sequence classification tasks. + A RoBERTa sequence pair has the following format: A B + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py new file mode 100644 index 0000000..66bc01c --- /dev/null +++ b/pytorch_transformers/tokenization_transfo_xl.py @@ -0,0 +1,575 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for Transformer XL model. + Adapted from https://github.com/kimiyoung/transformer-xl. +""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import glob +import logging +import os +import sys +from collections import Counter, OrderedDict +from io import open + +import torch +import numpy as np + +from .file_utils import cached_path +from .tokenization_utils import PreTrainedTokenizer + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'pretrained_vocab_file': + { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'transfo-xl-wt103': None, +} + +PRETRAINED_CORPUS_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", +} +CORPUS_NAME = 'corpus.bin' + +class TransfoXLTokenizer(PreTrainedTokenizer): + """ + Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False, + delimiter=None, vocab_file=None, pretrained_vocab_file=None, + never_split=None, unk_token="", eos_token="", + additional_special_tokens=[""], **kwargs): + super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token, + additional_special_tokens=additional_special_tokens, + **kwargs) + + self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + + if never_split is None: + never_split = self.all_special_tokens + if special is None: + special = [] + self.counter = Counter() + self.special = special + self.min_freq = min_freq + self.max_size = max_size + self.lower_case = lower_case + self.delimiter = delimiter + self.vocab_file = vocab_file + self.never_split = never_split + + if pretrained_vocab_file is not None: + # Hack because, honestly this tokenizer was not made to be used + # in a library like ours, at all. 
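+            # `pretrained_vocab_file` holds a torch-pickled dict of vocabulary
+            # attributes (e.g. idx2sym, sym2idx, counter); attributes already set
+            # on this instance are left untouched by the loop below.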
+ vocab_dict = torch.load(pretrained_vocab_file) + for key, value in vocab_dict.items(): + if key not in self.__dict__: + self.__dict__[key] = value + + if vocab_file is not None: + self.build_vocab() + + def count_file(self, path, verbose=False, add_eos=False): + if verbose: logger.info('counting file {} ...'.format(path)) + assert os.path.exists(path) + + sents = [] + with open(path, 'r', encoding='utf-8') as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(' line {}'.format(idx)) + symbols = self.tokenize(line, add_eos=add_eos) + self.counter.update(symbols) + sents.append(symbols) + + return sents + + def count_sents(self, sents, verbose=False): + """ + sents : a list of sentences, each a list of tokenized symbols + """ + if verbose: logger.info('counting {} sents ...'.format(len(sents))) + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(' line {}'.format(idx)) + self.counter.update(symbols) + + def _build_from_file(self, vocab_file): + self.idx2sym = [] + self.sym2idx = OrderedDict() + + with open(vocab_file, 'r', encoding='utf-8') as f: + for line in f: + symb = line.strip().split()[0] + self.add_symbol(symb) + if '' in self.sym2idx: + self.unk_idx = self.sym2idx[''] + elif '' in self.sym2idx: + self.unk_idx = self.sym2idx[''] + else: + raise ValueError('No token in vocabulary') + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a directory or file.""" + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file']) + torch.save(self.__dict__, vocab_file) + return (vocab_file,) + + def build_vocab(self): + if self.vocab_file: + logger.info('building vocab from {}'.format(self.vocab_file)) + self._build_from_file(self.vocab_file) + logger.info('final vocab size {}'.format(len(self))) + else: + logger.info('building vocab with min_freq={}, max_size={}'.format( + self.min_freq, self.max_size)) + self.idx2sym = [] + self.sym2idx = OrderedDict() + + for sym in self.special: + self.add_special(sym) + + for sym, cnt in self.counter.most_common(self.max_size): + if cnt < self.min_freq: break + self.add_symbol(sym) + + logger.info('final vocab size {} from {} unique tokens'.format( + len(self), len(self.counter))) + + def encode_file(self, path, ordered=False, verbose=False, add_eos=True, + add_double_eos=False): + if verbose: logger.info('encoding file {} ...'.format(path)) + assert os.path.exists(path) + encoded = [] + with open(path, 'r', encoding='utf-8') as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(' line {}'.format(idx)) + symbols = self.tokenize(line, add_eos=add_eos, + add_double_eos=add_double_eos) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def encode_sents(self, sents, ordered=False, verbose=False): + if verbose: logger.info('encoding {} sents ...'.format(len(sents))) + encoded = [] + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(' line {}'.format(idx)) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def add_special(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym]) + + def add_symbol(self, sym): + if sym not in 
self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + + def _convert_id_to_token(self, idx): + """Converts an id in a token (BPE) using the vocab.""" + assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx) + return self.idx2sym[idx] + + def _convert_token_to_id(self, sym): + """ Converts a token (str/unicode) in an id using the vocab. """ + if sym in self.sym2idx: + return self.sym2idx[sym] + else: + # logger.info('encounter unk {}'.format(sym)) + # assert '' not in sym + if hasattr(self, 'unk_idx'): + return self.sym2idx.get(sym, self.unk_idx) + # Backward compatibility with pre-trained models + elif '' in self.sym2idx: + return self.sym2idx[''] + elif '' in self.sym2idx: + return self.sym2idx[''] + else: + raise ValueError('Token not in vocabulary and no token in vocabulary for replacement') + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + out_string = ' '.join(tokens).strip() + return out_string + + def convert_to_tensor(self, symbols): + return torch.LongTensor(self.convert_tokens_to_ids(symbols)) + + @property + def vocab_size(self): + return len(self.idx2sym) + + def _tokenize(self, line, add_eos=False, add_double_eos=False): + line = line.strip() + # convert to lower case + if self.lower_case: + line = line.lower() + + # empty delimiter '' will evaluate False + if self.delimiter == '': + symbols = line + else: + symbols = line.split(self.delimiter) + + if add_double_eos: # lm1b + return [''] + symbols + [''] + elif add_eos: + return symbols + [''] + else: + return symbols + + +class LMOrderedIterator(object): + def __init__(self, data, bsz, bptt, device='cpu', ext_len=None): + """ + data -- LongTensor -- the LongTensor is strictly ordered + """ + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + + # Work out how cleanly we can divide the dataset into bsz parts. + self.n_step = data.size(0) // bsz + + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, self.n_step * bsz) + + # Evenly divide the data across the bsz batches. + self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i+1:i+1+seq_len] + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + return data_out, target_out, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. 
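For readers skimming the diff, a minimal sketch (not part of the patch) of how the pieces above compose: build a word-level vocabulary from a plain-text file, encode it, and batch it with `LMOrderedIterator`. The file path and hyper-parameters are placeholders, and the `'<eos>'` literal is an assumption — the special-token strings render blank in this view of the diff but are the usual Transformer-XL markers upstream.

```
# Sketch only: paths, sizes and the '<eos>' literal are assumptions.
tokenizer = TransfoXLTokenizer(special=['<eos>'], lower_case=False)
tokenizer.count_file('train.txt', add_eos=True)      # fills tokenizer.counter
tokenizer.build_vocab()                              # counter -> idx2sym / sym2idx
ids = tokenizer.encode_file('train.txt', ordered=True, add_eos=True)  # 1-D LongTensor

train_iter = LMOrderedIterator(ids, bsz=8, bptt=64)
data, target, seq_len = next(iter(train_iter))       # data/target: [bsz, seq_len], target shifted by one token
```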
+ bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ + else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ + streams[i][:n_new] + target[n_filled:n_filled+n_new, i] = \ + streams[i][1:n_new+1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + yield data_out, target_out, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None, + shuffle=False): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return sent_stream + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.paths) + + for path in self.paths: + # sent_stream is an iterator + sent_stream = self.get_sent_stream(path) + for batch in self.stream_iterator(sent_stream): + yield batch + + +class TransfoXLCorpus(object): + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a pre-processed corpus. 
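A hedged sketch of how `LMShuffledIterator` is meant to be fed: a list of already-encoded sentences (1-D LongTensors) that get packed column-wise into fixed `[bsz, bptt]` batches. The random tensors below are purely illustrative.

```
import torch

sents = [torch.randint(0, 100, (n,)) for n in (7, 12, 5, 9)]   # fake encoded sentences
it = LMShuffledIterator(sents, bsz=2, bptt=4, shuffle=True)
for data, target, seq_len in it:
    # data/target: [bsz, bptt]; target is data shifted by one inside each sentence.
    # Iteration stops once the sentence stream can no longer fill a complete batch.
    print(data.shape, target.shape, seq_len)
```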
+ """ + vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP: + corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME) + # redirect to the cache, if necessary + try: + resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Corpus '{}' was not found in corpus list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + corpus_file)) + return None + if resolved_corpus_file == corpus_file: + logger.info("loading corpus file {}".format(corpus_file)) + else: + logger.info("loading corpus file {} from cache at {}".format( + corpus_file, resolved_corpus_file)) + + # Instantiate tokenizer. + corpus = cls(*inputs, **kwargs) + corpus_dict = torch.load(resolved_corpus_file) + for key, value in corpus_dict.items(): + corpus.__dict__[key] = value + corpus.vocab = vocab + if corpus.train is not None: + corpus.train = torch.tensor(corpus.train, dtype=torch.long) + if corpus.valid is not None: + corpus.valid = torch.tensor(corpus.valid, dtype=torch.long) + if corpus.test is not None: + corpus.test = torch.tensor(corpus.test, dtype=torch.long) + return corpus + + def __init__(self, *args, **kwargs): + self.vocab = TransfoXLTokenizer(*args, **kwargs) + self.dataset = None + self.train = None + self.valid = None + self.test = None + + def build_corpus(self, path, dataset): + self.dataset = dataset + + if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']: + self.vocab.count_file(os.path.join(path, 'train.txt')) + self.vocab.count_file(os.path.join(path, 'valid.txt')) + self.vocab.count_file(os.path.join(path, 'test.txt')) + elif self.dataset == 'wt103': + self.vocab.count_file(os.path.join(path, 'train.txt')) + elif self.dataset == 'lm1b': + train_path_pattern = os.path.join( + path, '1-billion-word-language-modeling-benchmark-r13output', + 'training-monolingual.tokenized.shuffled', 'news.en-*') + train_paths = glob.glob(train_path_pattern) + # the vocab will load from file when build_vocab() is called + + self.vocab.build_vocab() + + if self.dataset in ['ptb', 'wt2', 'wt103']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True) + elif self.dataset in ['enwik8', 'text8']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True, add_eos=False) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True, add_eos=False) + elif self.dataset == 'lm1b': + self.train = train_paths + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True) + + def get_iterator(self, split, *args, **kwargs): + if split == 'train': + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(self.train, *args, **kwargs) + elif self.dataset == 'lm1b': + 
kwargs['shuffle'] = True + data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) + elif split in ['valid', 'test']: + data = self.valid if split == 'valid' else self.test + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(data, *args, **kwargs) + elif self.dataset == 'lm1b': + data_iter = LMShuffledIterator(data, *args, **kwargs) + + return data_iter + + +def get_lm_corpus(datadir, dataset): + fn = os.path.join(datadir, 'cache.pt') + fn_pickle = os.path.join(datadir, 'cache.pkl') + if os.path.exists(fn): + logger.info('Loading cached dataset...') + corpus = torch.load(fn_pickle) + elif os.path.exists(fn): + logger.info('Loading cached dataset from pickle...') + with open(fn, "rb") as fp: + corpus = pickle.load(fp) + else: + logger.info('Producing dataset {}...'.format(dataset)) + kwargs = {} + if dataset in ['wt103', 'wt2']: + kwargs['special'] = [''] + kwargs['lower_case'] = False + elif dataset == 'ptb': + kwargs['special'] = [''] + kwargs['lower_case'] = True + elif dataset == 'lm1b': + kwargs['special'] = [] + kwargs['lower_case'] = False + kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt') + elif dataset in ['enwik8', 'text8']: + pass + + corpus = TransfoXLCorpus(datadir, dataset, **kwargs) + torch.save(corpus, fn) + + return corpus diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py new file mode 100644 index 0000000..1e2cd59 --- /dev/null +++ b/pytorch_transformers/tokenization_utils.py @@ -0,0 +1,815 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging +import os +import json +import six +import copy +from io import open + +from .file_utils import cached_path + +logger = logging.getLogger(__name__) + +SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json' +ADDED_TOKENS_FILE = 'added_tokens.json' +TOKENIZER_CONFIG_FILE = 'tokenizer_config.json' + +class PreTrainedTokenizer(object): + """ Base class for all tokenizers. + Handle all the shared methods for tokenization and special tokens as well as methods dowloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. + + This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + + Class attributes (overridden by derived classes): + + - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). 
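As a usage sketch (the model shortcut comes from the archive map above; batch sizes and the training step are assumptions), the pre-built corpus can be pulled down and batched like this:

```
corpus = TransfoXLCorpus.from_pretrained('transfo-xl-wt103')
tr_iter = corpus.get_iterator('train', bsz=16, bptt=128, device='cpu')
va_iter = corpus.get_iterator('valid', bsz=16, bptt=128, device='cpu')
for data, target, seq_len in tr_iter:
    ...  # feed the [bsz, bptt] id tensors to a Transformer-XL language model
```

Separately, note that `get_lm_corpus` tests `os.path.exists(fn)` in both branches, so the `cache.pkl` fallback can never be reached as written; that may be worth a second look in review.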
+ - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. + - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. + - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. + + Parameters: + + - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` + + - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` + + - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id`` + + - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` + + - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` + + - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` + + - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` + + - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` + """ + vocab_files_names = {} + pretrained_vocab_files_map = {} + pretrained_init_configuration = {} + max_model_input_sizes = {} + + SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token", + "pad_token", "cls_token", "mask_token", + "additional_special_tokens"] + + @property + def bos_token(self): + """ Beginning of sentence token (string). Log an error if used while not having been set. """ + if self._bos_token is None: + logger.error("Using bos_token, but it is not set yet.") + return self._bos_token + + @property + def eos_token(self): + """ End of sentence token (string). Log an error if used while not having been set. """ + if self._eos_token is None: + logger.error("Using eos_token, but it is not set yet.") + return self._eos_token + + @property + def unk_token(self): + """ Unknown token (string). Log an error if used while not having been set. """ + if self._unk_token is None: + logger.error("Using unk_token, but it is not set yet.") + return self._unk_token + + @property + def sep_token(self): + """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. 
""" + if self._sep_token is None: + logger.error("Using sep_token, but it is not set yet.") + return self._sep_token + + @property + def pad_token(self): + """ Padding token (string). Log an error if used while not having been set. """ + if self._pad_token is None: + logger.error("Using pad_token, but it is not set yet.") + return self._pad_token + + @property + def cls_token(self): + """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + if self._cls_token is None: + logger.error("Using cls_token, but it is not set yet.") + return self._cls_token + + @property + def mask_token(self): + """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ + if self._mask_token is None: + logger.error("Using mask_token, but it is not set yet.") + return self._mask_token + + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ + if self._additional_special_tokens is None: + logger.error("Using additional_special_tokens, but it is not set yet.") + return self._additional_special_tokens + + @bos_token.setter + def bos_token(self, value): + self._bos_token = value + + @eos_token.setter + def eos_token(self, value): + self._eos_token = value + + @unk_token.setter + def unk_token(self, value): + self._unk_token = value + + @sep_token.setter + def sep_token(self, value): + self._sep_token = value + + @pad_token.setter + def pad_token(self, value): + self._pad_token = value + + @cls_token.setter + def cls_token(self, value): + self._cls_token = value + + @mask_token.setter + def mask_token(self, value): + self._mask_token = value + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + @property + def bos_token_id(self): + """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.bos_token) + + @property + def eos_token_id(self): + """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.eos_token) + + @property + def unk_token_id(self): + """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.unk_token) + + @property + def sep_token_id(self): + """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.sep_token) + + @property + def pad_token_id(self): + """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.pad_token) + + @property + def cls_token_id(self): + """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.cls_token) + + @property + def mask_token_id(self): + """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. 
""" + return self.convert_tokens_to_ids(self.mask_token) + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.additional_special_tokens) + + def __init__(self, max_len=None, **kwargs): + self._bos_token = None + self._eos_token = None + self._unk_token = None + self._sep_token = None + self._pad_token = None + self._cls_token = None + self._mask_token = None + self._additional_special_tokens = [] + + self.max_len = max_len if max_len is not None else int(1e12) + + # Added tokens + self.added_tokens_encoder = {} + self.added_tokens_decoder = {} + + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + self.init_kwargs = {} + + for key, value in kwargs.items(): + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == 'additional_special_tokens': + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) + else: + assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) + setattr(self, key, value) + + + @classmethod + def from_pretrained(cls, *inputs, **kwargs): + r""" + Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. + + Args: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the vocabulary files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. + + kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. + + Examples:: + + # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer + + # Download vocabulary from S3 and cache. + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + # If vocabulary files are in a directory (e.g. 
tokenizer was saved using `save_pretrained('./test/saved_model/')`) + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + + # If the tokenizer uses a single vocabulary file, you can point directly to this file + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') + + # You can link tokens to special vocabulary when instantiating + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='') + # You should be sure '' is in the vocabulary when doing that. + # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) + assert tokenizer.unk_token == '' + + """ + return cls._from_pretrained(*inputs, **kwargs) + + + @classmethod + def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) + + s3_models = list(cls.max_model_input_sizes.keys()) + vocab_files = {} + init_configuration = {} + if pretrained_model_name_or_path in s3_models: + # Get the vocabulary from AWS S3 bucket + for file_id, map_list in cls.pretrained_vocab_files_map.items(): + vocab_files[file_id] = map_list[pretrained_model_name_or_path] + if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration: + init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path] + else: + # Get the vocabulary from local files + logger.info( + "Model name '{}' not found in model shortcut name list ({}). " + "Assuming '{}' is a path or url to a directory containing tokenizer files.".format( + pretrained_model_name_or_path, ', '.join(s3_models), + pretrained_model_name_or_path)) + + # Look for the tokenizer main vocabulary files + for file_id, file_name in cls.vocab_files_names.items(): + if os.path.isdir(pretrained_model_name_or_path): + # If a directory is provided we look for the standard filenames + full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + else: + # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file) + full_file_name = pretrained_model_name_or_path + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) + full_file_name = None + vocab_files[file_id] = full_file_name + + # Look for the additional tokens files + additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, + 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE, + 'tokenizer_config_file': TOKENIZER_CONFIG_FILE, + } + + # If a path to a file was provided, get the parent directory + saved_directory = pretrained_model_name_or_path + if os.path.exists(saved_directory) and not os.path.isdir(saved_directory): + saved_directory = os.path.dirname(saved_directory) + + for file_id, file_name in additional_files_names.items(): + full_file_name = os.path.join(saved_directory, file_name) + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) + full_file_name = None + vocab_files[file_id] = full_file_name + + if all(full_file_name is None for full_file_name in vocab_files.values()): + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find tokenizer files" + "at this path or url.".format( + pretrained_model_name_or_path, ', '.join(s3_models), + pretrained_model_name_or_path, )) + return None + + # Get files from url, cache, or disk depending on the case + try: + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None: + resolved_vocab_files[file_id] = None + else: + resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + except EnvironmentError as e: + if pretrained_model_name_or_path in s3_models: + logger.error("Couldn't reach server to download vocabulary.") + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, ', '.join(s3_models), + pretrained_model_name_or_path, str(vocab_files.keys()))) + raise e + + for file_id, file_path in vocab_files.items(): + if file_path == resolved_vocab_files[file_id]: + logger.info("loading file {}".format(file_path)) + else: + logger.info("loading file {} from cache at {}".format( + file_path, resolved_vocab_files[file_id])) + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None) + if tokenizer_config_file is not None: + init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8")) + saved_init_inputs = init_kwargs.pop('init_inputs', ()) + if not init_inputs: + init_inputs = saved_init_inputs + else: + init_kwargs = init_configuration + + # Update with newly provided kwargs + init_kwargs.update(kwargs) + + # Set max length if needed + if pretrained_model_name_or_path in cls.max_model_input_sizes: + # if we're using a pretrained model, ensure the tokenizer + # wont index sequences longer than the number of positional embeddings + max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] + if max_len is not None and isinstance(max_len, (int, float)): + init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len) + + # Merge resolved_vocab_files arguments in init_kwargs. + added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None) + special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None) + for args_name, file_path in resolved_vocab_files.items(): + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + if special_tokens_map_file is not None: + special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8")) + for key, value in special_tokens_map.items(): + if key not in init_kwargs: + init_kwargs[key] = value + + # Instantiate tokenizer. + tokenizer = cls(*init_inputs, **init_kwargs) + + # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` + tokenizer.init_inputs = init_inputs + tokenizer.init_kwargs = init_kwargs + + # Add supplementary tokens. 
+ if added_tokens_file is not None: + added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8")) + added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} + tokenizer.added_tokens_encoder.update(added_tok_encoder) + tokenizer.added_tokens_decoder.update(added_tok_decoder) + + return tokenizer + + + def save_pretrained(self, save_directory): + """ Save the tokenizer vocabulary files together with: + - added tokens, + - special-tokens-to-class-attributes-mapping, + - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). + + This won't save modifications other than (added tokens and special token mapping) you may have + applied to the tokenizer after the instantion (e.g. modifying tokenizer.do_lower_case after creation). + + This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + if not os.path.isdir(save_directory): + logger.error("Saving directory ({}) should be a directory".format(save_directory)) + return + + special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) + added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) + tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) + + with open(tokenizer_config_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + + with open(special_tokens_map_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) + + with open(added_tokens_file, 'w', encoding='utf-8') as f: + if self.added_tokens_encoder: + out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) + else: + out_str = u"{}" + f.write(out_str) + + vocab_files = self.save_vocabulary(save_directory) + + return vocab_files + (special_tokens_map_file, added_tokens_file) + + + def save_vocabulary(self, save_directory): + """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens + and special token mappings. + + Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + raise NotImplementedError + + + def vocab_size(self): + """ Size of the base vocabulary (without the added tokens) """ + raise NotImplementedError + + + def __len__(self): + """ Size of the full vocabulary with the added tokens """ + return self.vocab_size + len(self.added_tokens_encoder) + + + def add_tokens(self, new_tokens): + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the + vocabulary, they are added to it with indices starting from length of the current vocabulary. + + Args: + new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. 
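Putting `from_pretrained`, `add_tokens` and `save_pretrained` together, the intended round trip looks roughly like this (the directory name is a placeholder and must already exist):

```
tok = BertTokenizer.from_pretrained('bert-base-uncased')
tok.add_tokens(['new_tok1', 'my_new-tok2'])   # appended after the base vocabulary
tok.save_pretrained('./my_tokenizer/')        # writes vocab files + added_tokens.json,
                                              # special_tokens_map.json, tokenizer_config.json
reloaded = BertTokenizer.from_pretrained('./my_tokenizer/')
assert reloaded.convert_tokens_to_ids('new_tok1') == tok.convert_tokens_to_ids('new_tok1')
assert len(reloaded) == len(tok)              # __len__ counts base vocab + added tokens
```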
+ + Examples:: + + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + """ + if not new_tokens: + return 0 + + to_add_tokens = [] + for token in new_tokens: + assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) + if token != self.unk_token and \ + self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token): + to_add_tokens.append(token) + logger.info("Adding %s to the vocabulary", token) + + added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens)) + added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} + self.added_tokens_encoder.update(added_tok_encoder) + self.added_tokens_decoder.update(added_tok_decoder) + + return len(to_add_tokens) + + + def add_special_tokens(self, special_tokens_dict): + """ + Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them + to class attributes. If special tokens are NOT in the vocabulary, they are added + to it (indexed starting from the last index of the current vocabulary). + + Using `add_special_tokens` will ensure your special tokens can be used in several ways: + + - special tokens are carefully handled by the tokenizer (they are never split) + - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '') + + Args: + special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: + [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, + ``additional_special_tokens``]. + + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. + + Examples:: + + # Let's see how to add a new classification token to GPT-2 + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2Model.from_pretrained('gpt2') + + special_tokens_dict = {'cls_token': ''} + + num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
+ + assert tokenizer.cls_token == '' + """ + if not special_tokens_dict: + return 0 + + added_tokens = 0 + for key, value in special_tokens_dict.items(): + assert key in self.SPECIAL_TOKENS_ATTRIBUTES + if key == 'additional_special_tokens': + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) + added_tokens += self.add_tokens(value) + else: + assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) + added_tokens += self.add_tokens([value]) + logger.info("Assigning %s to the %s key of the tokenizer", value, key) + setattr(self, key, value) + + return added_tokens + + def tokenize(self, text, **kwargs): + """ Converts a string in a sequence of tokens (string), using the tokenizer. + Split in words for word-based vocabulary or sub-words for sub-word-based + vocabularies (BPE/SentencePieces/WordPieces). + + Take care of added tokens. + """ + def split_on_token(tok, text): + result = [] + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + sub_text = sub_text.strip() + if i == 0 and not sub_text: + result += [tok] + elif i == len(split_text) - 1: + if sub_text: + result += [sub_text] + else: + pass + else: + if sub_text: + result += [sub_text] + result += [tok] + return result + + def split_on_tokens(tok_list, text): + if not text: + return [] + if not tok_list: + return self._tokenize(text, **kwargs) + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self.added_tokens_encoder \ + and sub_text not in self.all_special_tokens: + tokenized_text += split_on_token(tok, sub_text) + else: + tokenized_text += [sub_text] + text_list = tokenized_text + + return sum((self._tokenize(token, **kwargs) if token not \ + in self.added_tokens_encoder and token not in self.all_special_tokens \ + else [token] for token in tokenized_text), []) + + added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens + tokenized_text = split_on_tokens(added_tokens, text) + return tokenized_text + + def _tokenize(self, text, **kwargs): + """ Converts a string in a sequence of tokens (string), using the tokenizer. + Split in words for word-based vocabulary or sub-words for sub-word-based + vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def convert_tokens_to_ids(self, tokens): + """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id + (resp. a sequence of ids), using the vocabulary. + """ + if tokens is None: + return None + + if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): + return self._convert_token_to_id_with_added_voc(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + if len(ids) > self.max_len: + logger.warning("Token indices sequence length is longer than the specified maximum sequence length " + "for this model ({} > {}). 
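The effect of `split_on_tokens` above, sketched with the GPT-2 tokenizer from the earlier docstring example (the `<new_tok>` literal is made up):

```
tok = GPT2Tokenizer.from_pretrained('gpt2')
tok.add_tokens(['<new_tok>'])
tok.tokenize("hello <new_tok> world")
# -> BPE pieces for "hello" and "world", with '<new_tok>' preserved as a single token,
#    because added/special tokens are split out before _tokenize ever sees the text
```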
Running this sequence through the model will result in " + "indexing errors".format(len(ids), self.max_len)) + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self.added_tokens_encoder: + return self.added_tokens_encoder[token] + return self._convert_token_to_id(token) + + def _convert_token_to_id(self, token): + raise NotImplementedError + + def encode(self, text, text_pair=None, add_special_tokens=False, **kwargs): + """ + Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. + + Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. + + Args: + text: The first sequence to be encoded. + text_pair: Optional second sequence to be encoded. + add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + **kwargs: passed to the `self.tokenize()` method + """ + if text_pair is None: + if add_special_tokens: + return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text, **kwargs))) + else: + return self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) + + first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] + second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] + + if add_special_tokens: + return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens) + else: + return first_sentence_tokens, second_sentence_tokens + + def add_special_tokens_single_sentence(self, token_ids): + logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.") + return token_ids + + def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): + logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") + return token_ids_0 + token_ids_1 + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """ Converts a single index or a sequence of indices (integers) in a token " + (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens. + + Args: + skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False + """ + if isinstance(ids, int): + if ids in self.added_tokens_decoder: + return self.added_tokens_decoder[ids] + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + if skip_special_tokens and index in self.all_special_ids: + continue + if index in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index]) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def _convert_id_to_token(self, index): + raise NotImplementedError + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. + The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) + but we often want to remove sub-word tokenization artifacts at the same time. + """ + return ' '.join(self.convert_ids_to_tokens(tokens)) + + def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): + """ + Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary + with options to remove special tokens and clean up tokenization spaces. + Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. 
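A short sketch of the encode path and its inverse, again with `BertTokenizer` (the exact ids depend on the vocabulary):

```
tok = BertTokenizer.from_pretrained('bert-base-uncased')
ids = tok.encode("Hello, world!", add_special_tokens=True)  # ids wrapped with [CLS] ... [SEP]
text = tok.decode(ids, skip_special_tokens=True)            # back to a cleaned-up string
```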
+ """ + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separatly for added tokens and byte-level tokens + # cf. https://github.com/huggingface/pytorch-transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(" " + token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + text = ''.join(sub_texts) + + if self._sep_token is not None and self._sep_token in text: + text = text.replace(self._cls_token, self._sep_token) + split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token))) + if clean_up_tokenization_spaces: + clean_text = [self.clean_up_tokenization(text) for text in split_text] + return clean_text + else: + return split_text + else: + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + @property + def special_tokens_map(self): + """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their + values ('', ''...) + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self): + """ List all the special tokens ('', ''...) mapped to class attributes + (cls_token, unk_token...). + """ + all_toks = [] + set_attr = self.special_tokens_map + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) + all_toks = list(set(all_toks)) + return all_toks + + @property + def all_special_ids(self): + """ List the vocabulary indices of the special tokens ('', ''...) mapped to + class attributes (cls_token, unk_token...). + """ + all_toks = self.all_special_tokens + all_ids = list(self._convert_token_to_id(t) for t in all_toks) + return all_ids + + @staticmethod + def clean_up_tokenization(out_string): + """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. + """ + out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") + return out_string diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py new file mode 100644 index 0000000..f723138 --- /dev/null +++ b/pytorch_transformers/tokenization_xlm.py @@ -0,0 +1,794 @@ +# coding=utf-8 +# Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import json +import logging +import os +import re +import sys +import unicodedata +from io import open + +import sacremoses as sm + +from .tokenization_utils import PreTrainedTokenizer +from .tokenization_bert import BasicTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'vocab.json', + 'merges_file': 'merges.txt', +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json", + }, + 'merges_file': + { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'xlm-mlm-en-2048': 512, + 'xlm-mlm-ende-1024': 512, + 'xlm-mlm-enfr-1024': 512, + 'xlm-mlm-enro-1024': 512, + 'xlm-mlm-tlm-xnli15-1024': 512, + 'xlm-mlm-xnli15-1024': 512, + 'xlm-clm-enfr-1024': 512, + 'xlm-clm-ende-1024': 512, + 'xlm-mlm-17-1280': 512, + 'xlm-mlm-100-1280': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'xlm-mlm-en-2048': 
{"do_lowercase_and_remove_accent": True}, + 'xlm-mlm-ende-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "de", + "1": "en"}, + "lang2id": { "de": 0, + "en": 1 }}, + 'xlm-mlm-enfr-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "en", + "1": "fr"}, + "lang2id": { "en": 0, + "fr": 1 }}, + 'xlm-mlm-enro-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "en", + "1": "ro"}, + "lang2id": { "en": 0, + "ro": 1 }}, + 'xlm-mlm-tlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh"}, + "lang2id": { "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14 }}, + 'xlm-mlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh"}, + "lang2id": { "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14 }}, + 'xlm-clm-enfr-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "en", + "1": "fr"}, + "lang2id": { "en": 0, + "fr": 1 }}, + 'xlm-clm-ende-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "de", + "1": "en"}, + "lang2id": { "de": 0, + "en": 1 }}, + 'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "ar", + "1": "de", + "2": "en", + "3": "es", + "4": "fr", + "5": "hi", + "6": "it", + "7": "ja", + "8": "ko", + "9": "nl", + "10": "pl", + "11": "pt", + "12": "ru", + "13": "sv", + "14": "tr", + "15": "vi", + "16": "zh" + }, + "lang2id": { + "ar": 0, + "de": 1, + "en": 2, + "es": 3, + "fr": 4, + "hi": 5, + "it": 6, + "ja": 7, + "ko": 8, + "nl": 9, + "pl": 10, + "pt": 11, + "ru": 12, + "sv": 13, + "tr": 14, + "vi": 15, + "zh": 16}}, + 'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "af", + "1": "als", + "2": "am", + "3": "an", + "4": "ang", + "5": "ar", + "6": "arz", + "7": "ast", + "8": "az", + "9": "bar", + "10": "be", + "11": "bg", + "12": "bn", + "13": "br", + "14": "bs", + "15": "ca", + "16": "ceb", + "17": "ckb", + "18": "cs", + "19": "cy", + "20": "da", + "21": "de", + "22": "el", + "23": "en", + "24": "eo", + "25": "es", + "26": "et", + "27": "eu", + "28": "fa", + "29": "fi", + "30": "fr", + "31": "fy", + "32": "ga", + "33": "gan", + "34": "gl", + "35": "gu", + "36": "he", + "37": "hi", + "38": "hr", + "39": "hu", + "40": "hy", + "41": "ia", + "42": "id", + "43": "is", + "44": "it", + "45": "ja", + "46": "jv", + "47": "ka", + "48": "kk", + "49": "kn", + "50": "ko", + "51": "ku", + "52": "la", + "53": "lb", + "54": "lt", + "55": "lv", + "56": "mk", + "57": "ml", + "58": "mn", + "59": "mr", + "60": "ms", + "61": "my", + "62": "nds", + "63": "ne", + "64": "nl", + "65": "nn", + "66": "no", + "67": "oc", + "68": "pl", + "69": "pt", + "70": "ro", + "71": "ru", + "72": "scn", + "73": "sco", + "74": "sh", + "75": "si", + "76": "simple", + "77": "sk", + "78": "sl", + "79": "sq", + "80": "sr", + "81": "sv", + "82": "sw", + "83": "ta", + "84": "te", + "85": "th", + "86": "tl", + "87": "tr", + "88": "tt", + "89": 
"uk", + "90": "ur", + "91": "uz", + "92": "vi", + "93": "war", + "94": "wuu", + "95": "yi", + "96": "zh", + "97": "zh_classical", + "98": "zh_min_nan", + "99": "zh_yue" + }, + "lang2id": { + "af": 0, + "als": 1, + "am": 2, + "an": 3, + "ang": 4, + "ar": 5, + "arz": 6, + "ast": 7, + "az": 8, + "bar": 9, + "be": 10, + "bg": 11, + "bn": 12, + "br": 13, + "bs": 14, + "ca": 15, + "ceb": 16, + "ckb": 17, + "cs": 18, + "cy": 19, + "da": 20, + "de": 21, + "el": 22, + "en": 23, + "eo": 24, + "es": 25, + "et": 26, + "eu": 27, + "fa": 28, + "fi": 29, + "fr": 30, + "fy": 31, + "ga": 32, + "gan": 33, + "gl": 34, + "gu": 35, + "he": 36, + "hi": 37, + "hr": 38, + "hu": 39, + "hy": 40, + "ia": 41, + "id": 42, + "is": 43, + "it": 44, + "ja": 45, + "jv": 46, + "ka": 47, + "kk": 48, + "kn": 49, + "ko": 50, + "ku": 51, + "la": 52, + "lb": 53, + "lt": 54, + "lv": 55, + "mk": 56, + "ml": 57, + "mn": 58, + "mr": 59, + "ms": 60, + "my": 61, + "nds": 62, + "ne": 63, + "nl": 64, + "nn": 65, + "no": 66, + "oc": 67, + "pl": 68, + "pt": 69, + "ro": 70, + "ru": 71, + "scn": 72, + "sco": 73, + "sh": 74, + "si": 75, + "simple": 76, + "sk": 77, + "sl": 78, + "sq": 79, + "sr": 80, + "sv": 81, + "sw": 82, + "ta": 83, + "te": 84, + "th": 85, + "tl": 86, + "tr": 87, + "tt": 88, + "uk": 89, + "ur": 90, + "uz": 91, + "vi": 92, + "war": 93, + "wuu": 94, + "yi": 95, + "zh": 96, + "zh_classical": 97, + "zh_min_nan": 98, + "zh_yue": 99 + }}, +} + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + word is represented as tuple of symbols (symbols being variable-length strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def lowercase_and_remove_accent(text): + """ + Lowercase and strips accents from a piece of text based on + https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py + """ + text = ' '.join(text) + text = text.lower() + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output).lower().split(' ') + + +def replace_unicode_punct(text): + ''' + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl + ''' + text = text.replace(',', ',') + text = re.sub(r'。\s*', '. ', text) + text = text.replace('、', ',') + text = text.replace('”', '"') + text = text.replace('“', '"') + text = text.replace('∶', ':') + text = text.replace(':', ':') + text = text.replace('?', '?') + text = text.replace('《', '"') + text = text.replace('》', '"') + text = text.replace(')', ')') + text = text.replace('!', '!') + text = text.replace('(', '(') + text = text.replace(';', ';') + text = text.replace('1', '"') + text = text.replace('」', '"') + text = text.replace('「', '"') + text = text.replace('0', '0') + text = text.replace('3', '3') + text = text.replace('2', '2') + text = text.replace('5', '5') + text = text.replace('6', '6') + text = text.replace('9', '9') + text = text.replace('7', '7') + text = text.replace('8', '8') + text = text.replace('4', '4') + text = re.sub(r'.\s*', '. 
', text) + text = text.replace('~', '~') + text = text.replace('’', '\'') + text = text.replace('…', '...') + text = text.replace('━', '-') + text = text.replace('〈', '<') + text = text.replace('〉', '>') + text = text.replace('【', '[') + text = text.replace('】', ']') + text = text.replace('%', '%') + return text + + +def remove_non_printing_char(text): + ''' + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl + ''' + output = [] + for char in text: + cat = unicodedata.category(char) + if cat.startswith('C'): + continue + output.append(char) + return "".join(output) + + +def romanian_preprocessing(text): + '''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`''' + # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py + text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219") + text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b") + # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py + text = text.replace("\u0218", "S").replace("\u0219", "s") #s-comma + text = text.replace("\u021a", "T").replace("\u021b", "t") #t-comma + text = text.replace("\u0102", "A").replace("\u0103", "a") + text = text.replace("\u00C2", "A").replace("\u00E2", "a") + text = text.replace("\u00CE", "I").replace("\u00EE", "i") + return text + + +class XLMTokenizer(PreTrainedTokenizer): + """ + BPE tokenizer for XLM + + - Moses preprocessing & tokenization for most supported languages + + - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP) + + - (optionally) lower case & normalize all inputs text + + - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ + (ex: "__classify__") to a vocabulary + + - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies) + + - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies) + + - `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies) + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, merges_file, unk_token="", bos_token="", + sep_token="", pad_token="", cls_token="", + mask_token="", additional_special_tokens=["", + "", "", "", "", "", + "", "", "", ""], + lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True, + **kwargs): + super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token, + sep_token=sep_token, pad_token=pad_token, + cls_token=cls_token, mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs) + + # cache of sm.MosesPunctNormalizer instance + self.cache_moses_punct_normalizer = dict() + # cache of sm.MosesTokenizer instance + self.cache_moses_tokenizer = dict() + self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja']) + # True for current supported model (v1.2.0), False for XLM-17 & 100 + self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent + self.lang2id = lang2id + self.id2lang = id2lang + if lang2id is not None and id2lang is not None: + assert len(lang2id) == len(id2lang) + + self.ja_word_tokenizer = None + 
+        self.zh_word_tokenizer = None
+
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
+        merges = [tuple(merge.split()[:2]) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+    def moses_punct_norm(self, text, lang):
+        if lang not in self.cache_moses_punct_normalizer:
+            punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
+            self.cache_moses_punct_normalizer[lang] = punct_normalizer
+        else:
+            punct_normalizer = self.cache_moses_punct_normalizer[lang]
+        return punct_normalizer.normalize(text)
+
+    def moses_tokenize(self, text, lang):
+        if lang not in self.cache_moses_tokenizer:
+            moses_tokenizer = sm.MosesTokenizer(lang=lang)
+            self.cache_moses_tokenizer[lang] = moses_tokenizer
+        else:
+            moses_tokenizer = self.cache_moses_tokenizer[lang]
+        return moses_tokenizer.tokenize(text, return_str=False, escape=False)
+
+    def moses_pipeline(self, text, lang):
+        text = replace_unicode_punct(text)
+        text = self.moses_punct_norm(text, lang)
+        text = remove_non_printing_char(text)
+        return text
+
+    def ja_tokenize(self, text):
+        if self.ja_word_tokenizer is None:
+            try:
+                import Mykytea
+                self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
+            except (AttributeError, ImportError) as e:
+                logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and its Python wrapper (https://github.com/chezou/Mykytea-python) with the following steps")
+                logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
+                logger.error("2. autoreconf -i")
+                logger.error("3. ./configure --prefix=$HOME/local")
+                logger.error("4. make && make install")
+                logger.error("5. pip install kytea")
+                raise e
+        return list(self.ja_word_tokenizer.getWS(text))
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def bpe(self, token):
+        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        if token in self.cache:
+            return self.cache[token]
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + '</w>'
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        if word == '\n  </w>':
+            word = '\n</w>'
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text, lang='en', bypass_tokenizer=False):
+        """
+        Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language-specific tokenizer. Otherwise, we use Moses.
+ + Details of tokenization: + - [sacremoses](https://github.com/alvations/sacremoses): port of Moses + - Install with `pip install sacremoses` + - [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer + - Install with `pip install pythainlp` + - [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea) + - Install with the following steps: + ``` + git clone git@github.com:neubig/kytea.git && cd kytea + autoreconf -i + ./configure --prefix=$HOME/local + make && make install + pip install kytea + ``` + - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer * + - Install with `pip install jieba` + + \* The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). + However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated. + Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine + if you fine-tune the model with Chinese supervisionself. If you want the same exact behaviour, use the original XLM + [preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally, + and set `bypass_tokenizer=True` to bypass the tokenizer. + + Args: + - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported languages. However, we don't enforce it. + - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. + + Returns: + List of tokens. + """ + if lang and self.lang2id and lang not in self.lang2id: + logger.error("Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model.") + if bypass_tokenizer: + text = text.split() + elif lang not in self.lang_with_custom_tokenizer: + text = self.moses_pipeline(text, lang=lang) + # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step + if lang == 'ro': + text = romanian_preprocessing(text) + text = self.moses_tokenize(text, lang=lang) + elif lang == 'th': + text = self.moses_pipeline(text, lang=lang) + try: + if 'pythainlp' not in sys.modules: + from pythainlp.tokenize import word_tokenize as th_word_tokenize + else: + th_word_tokenize = sys.modules['pythainlp'].word_tokenize + except (AttributeError, ImportError) as e: + logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps") + logger.error("1. pip install pythainlp") + raise e + text = th_word_tokenize(text) + elif lang == 'zh': + try: + if 'jieba' not in sys.modules: + import jieba + else: + jieba = sys.modules['jieba'] + except (AttributeError, ImportError) as e: + logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps") + logger.error("1. 
pip install jieba")
+                raise e
+            text = ' '.join(jieba.cut(text))
+            text = self.moses_pipeline(text, lang=lang)
+            text = text.split()
+        elif lang == 'ja':
+            text = self.moses_pipeline(text, lang=lang)
+            text = self.ja_tokenize(text)
+        else:
+            raise ValueError('It should not reach here')
+
+        if self.do_lowercase_and_remove_accent and not bypass_tokenizer:
+            text = lowercase_and_remove_accent(text)
+
+        split_tokens = []
+        for token in text:
+            if token:
+                split_tokens.extend([t for t in self.bpe(token).split(' ')])
+
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) to an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (string/unicode) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) into a single string. """
+        out_string = ''.join(tokens).replace('</w>', ' ').strip()
+        return out_string
+
+    def add_special_tokens_single_sentence(self, token_ids):
+        """
+        Adds special tokens to a sequence for sequence classification tasks.
+        An XLM sequence has the following format: [CLS] X [SEP]
+        """
+        return [self.cls_token_id] + token_ids + [self.sep_token_id]
+
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def save_vocabulary(self, save_directory):
+        """Save the tokenizer vocabulary and merge files to a directory."""
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
+
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.encoder, ensure_ascii=False))
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    index = token_index
+                writer.write(' '.join(bpe_tokens) + u'\n')
+                index += 1
+
+        return vocab_file, merge_file
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
new file mode 100644
index 0000000..230095d
--- /dev/null
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -0,0 +1,214 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization classes for XLNet model."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+import os
+from shutil import copyfile
+
+import unicodedata
+import six
+
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model",
+        'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xlnet-base-cased': None,
+    'xlnet-large-cased': None,
+}
+
+SPIECE_UNDERLINE = u'▁'
+
+# Segments (not really needed)
+SEG_ID_A = 0
+SEG_ID_B = 1
+SEG_ID_CLS = 2
+SEG_ID_SEP = 3
+SEG_ID_PAD = 4
+
+class XLNetTokenizer(PreTrainedTokenizer):
+    """
+    SentencePiece based tokenizer. Peculiarities:
+
+    - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file,
+                 do_lower_case=False, remove_space=True, keep_accents=False,
+                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
+                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
+                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
+        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
+                                             unk_token=unk_token, sep_token=sep_token,
+                                             pad_token=pad_token, cls_token=cls_token,
+                                             mask_token=mask_token, additional_special_tokens=
+                                             additional_special_tokens, **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
+                           " pip install sentencepiece")
+
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.keep_accents = keep_accents
+        self.vocab_file = vocab_file
+
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
+
+    @property
+    def vocab_size(self):
+        return len(self.sp_model)
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
+                           " pip install sentencepiece")
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
+    def preprocess_text(self, inputs):
+        if self.remove_space:
+            outputs = ' '.join(inputs.strip().split())
+        else:
+            outputs = inputs
+        outputs = outputs.replace("``", '"').replace("''", '"')
+
+        if six.PY2 and isinstance(outputs, str):
+            outputs = outputs.decode('utf-8')
+
+        if not self.keep_accents:
+            outputs = unicodedata.normalize('NFKD', outputs)
+            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
+        if self.do_lower_case:
+            outputs = outputs.lower()
+
+        return outputs
+
+    def _tokenize(self, text, return_unicode=True, sample=False):
+        """ Tokenize a string.
+ return_unicode is used only for py2 + """ + text = self.preprocess_text(text) + # note(zhiliny): in some systems, sentencepiece only accepts str for py2 + if six.PY2 and isinstance(text, unicode): + text = text.encode('utf-8') + + if not sample: + pieces = self.sp_model.EncodeAsPieces(text) + else: + pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces( + piece[:-1].replace(SPIECE_UNDERLINE, '')) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + # note(zhiliny): convert back to unicode for py2 + if six.PY2 and return_unicode: + ret_pieces = [] + for piece in new_pieces: + if isinstance(piece, str): + piece = piece.decode('utf-8') + ret_pieces.append(piece) + new_pieces = ret_pieces + + return new_pieces + + def _convert_token_to_id(self, token): + """ Converts a token (str/unicode) in an id using the vocab. """ + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index, return_unicode=True): + """Converts an index (integer) in a token (string/unicode) using the vocab.""" + token = self.sp_model.IdToPiece(index) + if six.PY2 and return_unicode and isinstance(token, str): + token = token.decode('utf-8') + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + return out_string + + def add_special_tokens_single_sentence(self, token_ids): + """ + Adds special tokens to a sequence pair for sequence classification tasks. + An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS] + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + return token_ids + sep + cls + + def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): + """ + Adds special tokens to a sequence for sequence classification tasks. + An XLNet sequence has the following format: X [SEP][CLS] + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + return token_ids_0 + sep + token_ids_1 + sep + cls + + def save_vocabulary(self, save_directory): + """ Save the sentencepiece vocabulary (copy original file) and special tokens file + to a directory. + """ + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/pytorch_transformers/utils_glue.py b/pytorch_transformers/utils_glue.py new file mode 100644 index 0000000..f7203e5 --- /dev/null +++ b/pytorch_transformers/utils_glue.py @@ -0,0 +1,672 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT classification fine-tuning: utilities to work with GLUE tasks """ + +from __future__ import absolute_import, division, print_function + +import csv +import logging +import os +import sys +from io import open + +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import matthews_corrcoef, f1_score +from .tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) +from typing import List + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None, tokenized_text_a: List[str]=None, tokenized_text_b: List[str]=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = 
"%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class SearchProcessor(DataProcessor): + """Processor for the Search data set (BEN version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logger.info("LOOKING AT {}".format(os.path.join(data_dir, "Snopes.train.tsv"))) + return self._create_examples( + self._read_tsv2(os.path.join(data_dir, "Snopes.train.tsv")), "train") + + def get_dev_examples(self, data_dir, tokenizer: PreTrainedTokenizer=None): + """See base class.""" + return self._create_examples( + self._read_tsv2(os.path.join(data_dir, "Snopes.dev.tsv")), "dev", tokenizer=tokenizer) + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _read_tsv2(cls, input_file, quotechar=None, tokenizer:PreTrainedTokenizer=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + def _create_examples(self, lines, set_type, tokenizer:PreTrainedTokenizer=None): + """Creates examples for the training and dev sets.""" + examples = [] + from tqdm import tqdm + for (i, line) in tqdm(enumerate(lines)): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[1] + # tokenized_text_a = tokenizer.tokenize(text_a) + text_b = line[3] + # tokenized_text_b = tokenizer.tokenize(text_b) + label = line[4] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), + "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + 
"""Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), + "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + 
for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, output_mode, + cls_token_at_end=False, + cls_token='[CLS]', + cls_token_segment_id=1, + sep_token='[SEP]', + sep_token_extra=False, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + sequence_a_segment_id=0, + sequence_b_segment_id=1, + mask_padding_with_zero=True, + tokenize_text=True): + """ Loads a data file into a list of `InputBatch`s + `cls_token_at_end` define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) + """ + + label_map = {label : i for i, label in enumerate(label_list)} + from tqdm import tqdm + features = [] + ex_index = -1 + for example in tqdm(examples): + ex_index += 1 + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + if tokenize_text: tokens_a = tokenizer.tokenize(example.text_a) + else: tokens_a = example.text_a.split() + + tokens_b = None + if example.text_b: + if tokenize_text: tokens_b = tokenizer.tokenize(example.text_b) + else: tokens_b = example.text_b.split() + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa. 
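+            # i.e. a BERT-style pair reserves 3 special tokens ([CLS] A [SEP] B [SEP]);
+            # RoBERTa-style models add an extra separator between A and B, hence 4.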
+ special_tokens_count = 4 if sep_token_extra else 3 + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) + else: + # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. + special_tokens_count = 3 if sep_token_extra else 2 + if len(tokens_a) > max_seq_length - special_tokens_count: + tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = tokens_a + [sep_token] + if sep_token_extra: + # roberta uses an extra separator b/w pairs of sentences + tokens += [sep_token] + segment_ids = [sequence_a_segment_id] * len(tokens) + + if tokens_b: + tokens += tokens_b + [sep_token] + segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) + + if cls_token_at_end: + tokens = tokens + [cls_token] + segment_ids = segment_ids + [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + segment_ids = [cls_token_segment_id] + segment_ids + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
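+        # pad_on_left is typically used with XLNet-style inputs; BERT-style inputs are padded on the right.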
+ padding_length = max_seq_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + else: + input_ids = input_ids + ([pad_token] * padding_length) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: %s (id = %d)" % (example.label, label_id)) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
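+    # Example: with max_length=8, len(tokens_a)=7 and len(tokens_b)=4, three tokens are
+    # popped from tokens_a, leaving a 4 + 4 = 8 token pair.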
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "search": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "search": SearchProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, +} + +output_modes = { + "cola": "classification", + "mnli": "classification", + "mnli-mm": "classification", + "mrpc": "classification", + "search": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} + +GLUE_TASKS_NUM_LABELS = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "wnli": 2, +}
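A minimal usage sketch for the GLUE utilities above, wiring a processor, convert_examples_to_features and compute_metrics together for MRPC with a BERT-style tokenizer. The checkpoint name, data path and placeholder predictions are illustrative assumptions, not part of the patch:

import numpy as np
import torch
from pytorch_transformers import BertTokenizer
from pytorch_transformers.utils_glue import (compute_metrics, convert_examples_to_features,
                                             output_modes, processors)

task = "mrpc"
processor = processors[task]()                  # MrpcProcessor
label_list = processor.get_labels()             # ["0", "1"]
output_mode = output_modes[task]                # "classification"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

# Assumes the GLUE MRPC dev.tsv lives under this (illustrative) directory.
examples = processor.get_dev_examples("/path/to/glue/MRPC")
features = convert_examples_to_features(
    examples, label_list, max_seq_length=128, tokenizer=tokenizer, output_mode=output_mode,
    cls_token_at_end=False,                     # BERT pattern: [CLS] A [SEP] B [SEP]
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=0,
    sep_token=tokenizer.sep_token,
    pad_on_left=False,
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0])

input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
labels = np.array([f.label_id for f in features])

# After running a model over input_ids and arg-maxing its logits into `preds`:
preds = np.zeros_like(labels)                   # placeholder predictions for the sketch
print(compute_metrics(task, preds, labels))     # {'acc': ..., 'f1': ..., 'acc_and_f1': ...}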