diff --git a/examples/emotion/README.md b/examples/emotion/README.md
new file mode 100644
index 0000000..ce5f7f6
--- /dev/null
+++ b/examples/emotion/README.md
@@ -0,0 +1,49 @@
+# Emotion
+
+## Authors
+
+**Armando Fortes**
+
+Homepage: https://atfortes.github.io/
+
+Contact: fmq22@mails.tsinghua.edu.cn
+
+## Task Description
+
+Emotion is a dataset of English Twitter messages labeled with six basic emotions: anger, fear, joy, love, sadness, and surprise. We follow the train-validation-test split configuration from [HuggingFace](https://huggingface.co/datasets/emotion), which gives 16000 samples for training, 2000 samples for validation, and 2000 samples for testing. The goal of the task is: given an English Twitter message, classify whether it expresses sadness, joy, love, anger, fear, or surprise.
+
+We perform prompt-based fine-tuning on the ```glm-roberta-large``` model and use prompt templates from [promptsource](https://github.com/bigscience-workshop/promptsource).
+
+## Running Commands
+
+You can run `python finetune.py --help` to see the usage of all supported configurations. Running the following command with the default configuration will reproduce the [reported results](#results).
+
+```bash
+python finetune.py
+```
+
+## Results
+
+The command above evaluates the test set with the model checkpoint from the best-performing epoch on the validation set. Accordingly, accuracy for ```glm-roberta-large``` on the ```emotion``` dataset increased from **25.85%** before fine-tuning to **93.35%** after fine-tuning, while the corresponding accuracy on the validation set was **94.45%**.
+
+## Reference
+
+```bibtex
+@inproceedings{saravia-etal-2018-carer,
+    title = "{CARER}: Contextualized Affect Representations for Emotion Recognition",
+    author = "Saravia, Elvis  and
+      Liu, Hsien-Chi Toby  and
+      Huang, Yen-Hao  and
+      Wu, Junlin  and
+      Chen, Yi-Shin",
+    booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
+    month = oct # "-" # nov,
+    year = "2018",
+    address = "Brussels, Belgium",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/D18-1404",
+    doi = "10.18653/v1/D18-1404",
+    pages = "3687--3697",
+    abstract = "Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks.
Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.", +} +``` diff --git a/examples/emotion/dataset.py b/examples/emotion/dataset.py new file mode 100644 index 0000000..7afeffb --- /dev/null +++ b/examples/emotion/dataset.py @@ -0,0 +1,35 @@ +import torch +from tqdm import tqdm +from datasets import load_dataset +from promptsource.templates import DatasetTemplates + + +class MultipleChoiceDataset(torch.utils.data.Dataset): + def __init__(self, dataset_name, split, prompt_name, tokenizer): + super(MultipleChoiceDataset, self).__init__() + self.dataset_name = dataset_name + self.split = split + self.prompt = DatasetTemplates(self.dataset_name)[prompt_name] + self.tokenizer = tokenizer + + self.data = [] + if '/' in self.dataset_name: + iters = load_dataset(self.dataset_name.split('/')[0], self.dataset_name.split('/')[1], split=self.split) + else: + iters = load_dataset(self.dataset_name, split=self.split) + for sample in tqdm(iters): + self.data.append(dict(zip( + ['inputs_pretokenized', 'choices_pretokenized', 'label'], + self.prompting_single_sample(sample) + ))) + + def prompting_single_sample(self, sample): + inputs_pretokenized, _ = tuple(self.prompt.apply(sample)) + choices_pretokenized = self.prompt.answer_choices.split(' ||| ') + return inputs_pretokenized + f" {self.tokenizer.mask_token}", choices_pretokenized, sample['label'] + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] diff --git a/examples/emotion/eval_utils.py b/examples/emotion/eval_utils.py new file mode 100644 index 0000000..17191cc --- /dev/null +++ b/examples/emotion/eval_utils.py @@ -0,0 +1,30 @@ +import torch +import numpy as np +import torch.nn.functional as F +from tqdm import tqdm +from sklearn.metrics import accuracy_score +from multiple_choice_utils import cond_log_prob, flatten_labels + + +def evaluate(model, tokenizer, data_loader, split): + valid_loss = 0. 
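+    # Accumulators: summed NLL loss over batches, predicted class ids, and gold labels for accuracy.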
+ valid_labels = [] + valid_preds = [] + + model.eval() + + with torch.no_grad(): + for _, sample in tqdm(enumerate(data_loader, start=1), desc=split, total=len(data_loader)): + logits = cond_log_prob(model, tokenizer, sample["inputs_pretokenized"], flatten_labels(sample['choices_pretokenized'])) + + labels = sample["label"].cuda() + loss = F.nll_loss(logits, labels) + valid_loss += loss.item() + valid_preds.extend(torch.argmax(logits, dim=-1).cpu().numpy().tolist()) + valid_labels.extend(np.array(sample["label"]).tolist()) + + valid_loss = valid_loss / len(data_loader) + valid_acc = accuracy_score(valid_preds, valid_labels) + print(f"[{split.upper()}] loss={valid_loss}, acc={valid_acc}") + + return valid_loss, valid_acc diff --git a/examples/emotion/finetune.py b/examples/emotion/finetune.py new file mode 100644 index 0000000..f84ebdb --- /dev/null +++ b/examples/emotion/finetune.py @@ -0,0 +1,61 @@ +import torch +import argparse +import warnings +from train_utils import train +from eval_utils import evaluate +from dataset import MultipleChoiceDataset +from torch.utils.data import Dataset, DataLoader, random_split +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, get_linear_schedule_with_warmup + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-mt', '--model_type', type=str, default='BAAI/glm-roberta-large') + parser.add_argument('-dn', '--dataset_name', type=str, default='emotion') + parser.add_argument('-pn', '--prompt_name', type=str, default='select_emotion_label_from_list') + parser.add_argument('-bs', '--batch_size', type=int, default=16) + parser.add_argument('-lr', '--learning_rate', type=float, default=1e-5) + parser.add_argument('-en', '--epoch_num', type=int, default=10) + parser.add_argument('-es', '--early_stopping', type=int, default=2) + parser.add_argument('-cd', '--ckpt_dir', type=str, default='./') + args = parser.parse_args() + print(args) + + # Load model + tokenizer = AutoTokenizer.from_pretrained(args.model_type, trust_remote_code=True, revision='main') + model = AutoModelForSeq2SeqLM.from_pretrained(args.model_type, trust_remote_code=True, revision='main').cuda() + + # Load data + train_dataset = MultipleChoiceDataset(args.dataset_name, 'train', args.prompt_name, tokenizer) + valid_dataset = MultipleChoiceDataset(args.dataset_name, 'validation', args.prompt_name, tokenizer) + test_dataset = MultipleChoiceDataset(args.dataset_name, 'test', args.prompt_name, tokenizer) + + train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) + valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False) + test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) + + # Configure training model, optimizer, and scheduler + model = model.float() + model.train() + num_training_steps = args.epoch_num * (len(train_dataset) // args.batch_size) + optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate) + scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps=int(num_training_steps * 0.06), + num_training_steps=num_training_steps) + + print('Performance on test set BEFORE fine-tuning:') + evaluate(model, tokenizer, test_loader, 'test') + + print('TRAINING...') + ckpt_path = args.ckpt_dir + \ + f"{args.model_type.split('/')[1] if '/' in args.model_type else args.model_type}-" + \ + f"{args.dataset_name.split('/')[1] if '/' in args.dataset_name else args.dataset_name}.ckpt" + model = train(model, tokenizer, 
train_loader, valid_loader, optimizer, scheduler, ckpt_path, + args.epoch_num, args.early_stopping) + + print('Performance on test set AFTER fine-tuning:') + evaluate(model, tokenizer, test_loader, 'test') + +if __name__ == '__main__': + warnings.filterwarnings('ignore') + main() diff --git a/examples/emotion/multiple_choice_utils.py b/examples/emotion/multiple_choice_utils.py new file mode 100644 index 0000000..5bd4103 --- /dev/null +++ b/examples/emotion/multiple_choice_utils.py @@ -0,0 +1,121 @@ +''' +Acknowledgement: Code adapted from Aohan Zeng and Xiao Liu. +''' + +import torch +import numpy as np +import torch.nn.functional as F +from typing import List +from scipy.linalg import block_diag + + +def flatten_labels(compacted_labels): + batch_size = len(compacted_labels[0]) + num_of_classes = len(compacted_labels) + return [[compacted_labels[i][idx] for i in range(num_of_classes)] for idx in range(batch_size)] + + +def build_multiple_choice_sample(tokenizer, context, choices): + context_id = tokenizer(context)['input_ids'] + + division = len(context_id) + mask_position = context_id.index(tokenizer.mask_token_id) + + token = np.array(context_id, dtype=np.int64) + attention_mask = [np.ones((division, division), dtype=np.int64)] + position_id = np.arange(division, dtype=np.int64) + block_position_id = np.zeros(division, dtype=np.int64) + + choice_target_id = [] + choice_id = [] + + for choice_str in choices: + choice = np.array(tokenizer(choice_str)['input_ids'][1:-1], dtype=np.int64) + + choice_id.append(choice) + choice_target_id.append(np.arange(len(token), len(token) + len(choice), dtype=np.int64)) + attention_mask.append(np.tril(np.ones((len(choice), len(choice)), dtype=np.int64))) + + token = np.concatenate((token, [tokenizer.sop_token_id], choice[:-1])) + position_id = np.concatenate((position_id, [mask_position] * len(choice))) + block_position_id = np.concatenate((block_position_id, np.arange(1, 1 + len(choice), dtype=np.int64))) + + attention_mask = block_diag(*attention_mask) + attention_mask[division:, :division] = 1 + + return { + "token": token, + "position_id": np.stack((position_id, block_position_id)), + "attention_mask": attention_mask, + "choices": choice_id, + "choice_target_ids": choice_target_id + } + + +def pad_batch(tokens, position_ids, attention_mask, max_seq_length): + pad_length = max_seq_length - len(tokens) + attention_mask = np.pad( + attention_mask, + pad_width=((0, pad_length),), + mode="constant", + constant_values=0, + ) + tokens = np.concatenate((tokens, np.zeros(pad_length, dtype=np.int64))) + position_ids = np.concatenate((position_ids, position_ids[..., -1:].repeat(pad_length, -1)), axis=-1) + return tokens, position_ids, attention_mask + + +def collate_fn(samples): + TILE = 16 + length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE + + token_batch, position_id_batch, attention_mask_batch = [], [], [] + choices_batch, choice_target_ids_batch = [], [] + + for sample in samples: + token, position_id, attention_mask = pad_batch( + sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad + ) + token_batch.append(token) + position_id_batch.append(position_id) + attention_mask_batch.append(attention_mask) + choices_batch.append(sample["choices"]) + choice_target_ids_batch.append(sample["choice_target_ids"]) + + return { + "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), + "position_ids": torch.tensor(np.array(position_id_batch), dtype=torch.int64), + "attention_mask": 
torch.tensor(np.array(attention_mask_batch), dtype=torch.int64),
+        "choices": choices_batch,
+        "choice_target_ids": choice_target_ids_batch,
+    }
+
+
+def cond_log_prob(model, tokenizer, context: List[str], choices: List[List[str]]) -> torch.Tensor:
+    """
+    Compute the conditional log-probability of one or more continuation/infilling options.
+    :return: The log probability of each option, summed over its tokens.
+    """
+    if not isinstance(context, list):
+        context = [context]
+        choices = [choices]
+    choices = [[(' ' + choice) for choice in choice_pair] for choice_pair in choices]  # Leading space is required by the SentencePiece tokenizer's word-boundary handling
+
+    samples = [build_multiple_choice_sample(tokenizer, ctx, ch) for ctx, ch in zip(context, choices)]
+
+    batch = collate_fn(samples)
+
+    logits = model.forward(input_ids=batch['tokens'].cuda(),
+                           attention_mask=batch['attention_mask'].cuda().unsqueeze(1),
+                           position_ids=batch['position_ids'].cuda())['logits']
+
+    log_probs = []
+
+    for output, choices, choice_target_ids in zip(F.log_softmax(logits, dim=-1), batch['choices'], batch['choice_target_ids']):
+        log_probs_single = []
+        for choice, choice_target_id in zip(choices, choice_target_ids):
+            tmp = output[choice_target_id, choice]
+            log_probs_single.append(tmp.sum())
+        log_probs.append(torch.stack(log_probs_single))
+
+    return torch.stack(log_probs)
diff --git a/examples/emotion/requirements.txt b/examples/emotion/requirements.txt
new file mode 100644
index 0000000..7118ad2
--- /dev/null
+++ b/examples/emotion/requirements.txt
@@ -0,0 +1,7 @@
+transformers
+scipy
+datasets
+promptsource
+scikit_learn
+sentencepiece
+tqdm
diff --git a/examples/emotion/train_utils.py b/examples/emotion/train_utils.py
new file mode 100644
index 0000000..ed25260
--- /dev/null
+++ b/examples/emotion/train_utils.py
@@ -0,0 +1,53 @@
+import torch
+import numpy as np
+import torch.nn.functional as F
+from tqdm import tqdm
+from eval_utils import evaluate
+from multiple_choice_utils import cond_log_prob, flatten_labels
+
+
+def train(model, tokenizer, train_loader, valid_loader, optimizer, scheduler, ckpt_path, epoch_num, early_stopping=-1):
+
+    best_acc = 0.
+    early_stopping_counter = early_stopping
+
+    for e in range(1, epoch_num + 1):
+        print(f"EPOCH {e}")
+        train_loss_value = 0.
+        tqdm_vars = {"lr": np.nan, "loss": np.nan}
+        tbar = tqdm(enumerate(train_loader, start=1), desc="train", total=len(train_loader),
+                    postfix=tqdm_vars)
+
+        model.train()
+
+        for _, sample in tbar:
+            logits = cond_log_prob(model, tokenizer, sample["inputs_pretokenized"], flatten_labels(sample['choices_pretokenized']))
+            labels = sample["label"].cuda()
+            loss = F.nll_loss(logits, labels)
+            train_loss_value += loss.item()
+
+            loss.backward()
+            optimizer.step()
+            scheduler.step()
+            optimizer.zero_grad()
+
+            tqdm_vars["lr"] = optimizer.state_dict()["param_groups"][0]["lr"]
+            tqdm_vars["loss"] = train_loss_value
+            tbar.set_postfix(tqdm_vars)
+            train_loss_value = 0.
+
+        _, valid_acc = evaluate(model, tokenizer, valid_loader, 'valid')
+
+        if early_stopping >= 0:
+            if valid_acc > best_acc:
+                best_acc = valid_acc
+                early_stopping_counter = early_stopping
+                torch.save(model, ckpt_path)
+            else:
+                early_stopping_counter -= 1
+
+            if early_stopping_counter <= 0:
+                print('EARLY STOPPING...')
+                break
+
+    return torch.load(ckpt_path)
diff --git a/examples/rotten_tomatoes/README.md b/examples/rotten_tomatoes/README.md
new file mode 100644
index 0000000..d1984e3
--- /dev/null
+++ b/examples/rotten_tomatoes/README.md
@@ -0,0 +1,39 @@
+# Rotten Tomatoes
+
+## Authors
+
+**Armando Fortes**
+
+Homepage: https://atfortes.github.io/
+
+Contact: fmq22@mails.tsinghua.edu.cn
+
+## Task Description
+
+Rotten Tomatoes is a movie-review dataset containing 5,331 positive and 5,331 negative processed sentences from Rotten Tomatoes movie reviews. We follow the train-validation-test split configuration from [HuggingFace](https://huggingface.co/datasets/rotten_tomatoes), which gives 8530 samples for training, 1066 samples for validation, and 1066 samples for testing. The goal of the task is: given a Rotten Tomatoes movie review, classify whether it is positive or negative.
+
+We perform prompt-based fine-tuning on the ```glm-roberta-large``` model and use prompt templates from [promptsource](https://github.com/bigscience-workshop/promptsource).
+
+## Running Commands
+
+You can run `python finetune.py --help` to see the usage of all supported configurations. Running the following command with the default configuration will reproduce the [reported results](#results).
+
+```bash
+python finetune.py
+```
+
+## Results
+
+The command above evaluates the test set with the model checkpoint from the best-performing epoch on the validation set. Accordingly, accuracy for ```glm-roberta-large``` on the ```rotten_tomatoes``` dataset increased from **50.75%** before fine-tuning to **88.93%** after fine-tuning, while the corresponding accuracy on the validation set was **90.24%**.
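+
+The reported results correspond to the default configuration. Individual settings can be overridden on the command line with the flags defined in `finetune.py`, for example (illustrative values):
+
+```bash
+python finetune.py --batch_size 8 --learning_rate 2e-5 --epoch_num 5
+```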
+ +## Reference + +```bibtex +@InProceedings{Pang+Lee:05a, + author = {Bo Pang and Lillian Lee}, + title = {Seeing stars: Exploiting class relationships for sentiment + categorization with respect to rating scales}, + booktitle = {Proceedings of the ACL}, + year = 2005 +} +``` diff --git a/examples/rotten_tomatoes/dataset.py b/examples/rotten_tomatoes/dataset.py new file mode 100644 index 0000000..7afeffb --- /dev/null +++ b/examples/rotten_tomatoes/dataset.py @@ -0,0 +1,35 @@ +import torch +from tqdm import tqdm +from datasets import load_dataset +from promptsource.templates import DatasetTemplates + + +class MultipleChoiceDataset(torch.utils.data.Dataset): + def __init__(self, dataset_name, split, prompt_name, tokenizer): + super(MultipleChoiceDataset, self).__init__() + self.dataset_name = dataset_name + self.split = split + self.prompt = DatasetTemplates(self.dataset_name)[prompt_name] + self.tokenizer = tokenizer + + self.data = [] + if '/' in self.dataset_name: + iters = load_dataset(self.dataset_name.split('/')[0], self.dataset_name.split('/')[1], split=self.split) + else: + iters = load_dataset(self.dataset_name, split=self.split) + for sample in tqdm(iters): + self.data.append(dict(zip( + ['inputs_pretokenized', 'choices_pretokenized', 'label'], + self.prompting_single_sample(sample) + ))) + + def prompting_single_sample(self, sample): + inputs_pretokenized, _ = tuple(self.prompt.apply(sample)) + choices_pretokenized = self.prompt.answer_choices.split(' ||| ') + return inputs_pretokenized + f" {self.tokenizer.mask_token}", choices_pretokenized, sample['label'] + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] diff --git a/examples/rotten_tomatoes/eval_utils.py b/examples/rotten_tomatoes/eval_utils.py new file mode 100644 index 0000000..17191cc --- /dev/null +++ b/examples/rotten_tomatoes/eval_utils.py @@ -0,0 +1,30 @@ +import torch +import numpy as np +import torch.nn.functional as F +from tqdm import tqdm +from sklearn.metrics import accuracy_score +from multiple_choice_utils import cond_log_prob, flatten_labels + + +def evaluate(model, tokenizer, data_loader, split): + valid_loss = 0. 
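+    # Accumulators: summed NLL loss over batches, predicted class ids, and gold labels for accuracy.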
+ valid_labels = [] + valid_preds = [] + + model.eval() + + with torch.no_grad(): + for _, sample in tqdm(enumerate(data_loader, start=1), desc=split, total=len(data_loader)): + logits = cond_log_prob(model, tokenizer, sample["inputs_pretokenized"], flatten_labels(sample['choices_pretokenized'])) + + labels = sample["label"].cuda() + loss = F.nll_loss(logits, labels) + valid_loss += loss.item() + valid_preds.extend(torch.argmax(logits, dim=-1).cpu().numpy().tolist()) + valid_labels.extend(np.array(sample["label"]).tolist()) + + valid_loss = valid_loss / len(data_loader) + valid_acc = accuracy_score(valid_preds, valid_labels) + print(f"[{split.upper()}] loss={valid_loss}, acc={valid_acc}") + + return valid_loss, valid_acc diff --git a/examples/rotten_tomatoes/finetune.py b/examples/rotten_tomatoes/finetune.py new file mode 100644 index 0000000..ea7e39d --- /dev/null +++ b/examples/rotten_tomatoes/finetune.py @@ -0,0 +1,61 @@ +import torch +import argparse +import warnings +from train_utils import train +from eval_utils import evaluate +from dataset import MultipleChoiceDataset +from torch.utils.data import Dataset, DataLoader, random_split +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, get_linear_schedule_with_warmup + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-mt', '--model_type', type=str, default='BAAI/glm-roberta-large') + parser.add_argument('-dn', '--dataset_name', type=str, default='rotten_tomatoes') + parser.add_argument('-pn', '--prompt_name', type=str, default='Sentiment with choices ') + parser.add_argument('-bs', '--batch_size', type=int, default=16) + parser.add_argument('-lr', '--learning_rate', type=float, default=1e-5) + parser.add_argument('-en', '--epoch_num', type=int, default=10) + parser.add_argument('-es', '--early_stopping', type=int, default=2) + parser.add_argument('-cd', '--ckpt_dir', type=str, default='./') + args = parser.parse_args() + print(args) + + # Load model + tokenizer = AutoTokenizer.from_pretrained(args.model_type, trust_remote_code=True, revision='main') + model = AutoModelForSeq2SeqLM.from_pretrained(args.model_type, trust_remote_code=True, revision='main').cuda() + + # Load data + train_dataset = MultipleChoiceDataset(args.dataset_name, 'train', args.prompt_name, tokenizer) + valid_dataset = MultipleChoiceDataset(args.dataset_name, 'validation', args.prompt_name, tokenizer) + test_dataset = MultipleChoiceDataset(args.dataset_name, 'test', args.prompt_name, tokenizer) + + train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) + valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False) + test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) + + # Configure training model, optimizer, and scheduler + model = model.float() + model.train() + num_training_steps = args.epoch_num * (len(train_dataset) // args.batch_size) + optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate) + scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps=int(num_training_steps * 0.06), + num_training_steps=num_training_steps) + + print('Performance on test set BEFORE fine-tuning:') + evaluate(model, tokenizer, test_loader, 'test') + + print('TRAINING...') + ckpt_path = args.ckpt_dir + \ + f"{args.model_type.split('/')[1] if '/' in args.model_type else args.model_type}-" + \ + f"{args.dataset_name.split('/')[1] if '/' in args.dataset_name else args.dataset_name}.ckpt" + model = 
train(model, tokenizer, train_loader, valid_loader, optimizer, scheduler, ckpt_path, + args.epoch_num, args.early_stopping) + + print('Performance on test set AFTER fine-tuning:') + evaluate(model, tokenizer, test_loader, 'test') + +if __name__ == '__main__': + warnings.filterwarnings('ignore') + main() diff --git a/examples/rotten_tomatoes/multiple_choice_utils.py b/examples/rotten_tomatoes/multiple_choice_utils.py new file mode 100644 index 0000000..5bd4103 --- /dev/null +++ b/examples/rotten_tomatoes/multiple_choice_utils.py @@ -0,0 +1,121 @@ +''' +Acknowledgement: Code adapted from Aohan Zeng and Xiao Liu. +''' + +import torch +import numpy as np +import torch.nn.functional as F +from typing import List +from scipy.linalg import block_diag + + +def flatten_labels(compacted_labels): + batch_size = len(compacted_labels[0]) + num_of_classes = len(compacted_labels) + return [[compacted_labels[i][idx] for i in range(num_of_classes)] for idx in range(batch_size)] + + +def build_multiple_choice_sample(tokenizer, context, choices): + context_id = tokenizer(context)['input_ids'] + + division = len(context_id) + mask_position = context_id.index(tokenizer.mask_token_id) + + token = np.array(context_id, dtype=np.int64) + attention_mask = [np.ones((division, division), dtype=np.int64)] + position_id = np.arange(division, dtype=np.int64) + block_position_id = np.zeros(division, dtype=np.int64) + + choice_target_id = [] + choice_id = [] + + for choice_str in choices: + choice = np.array(tokenizer(choice_str)['input_ids'][1:-1], dtype=np.int64) + + choice_id.append(choice) + choice_target_id.append(np.arange(len(token), len(token) + len(choice), dtype=np.int64)) + attention_mask.append(np.tril(np.ones((len(choice), len(choice)), dtype=np.int64))) + + token = np.concatenate((token, [tokenizer.sop_token_id], choice[:-1])) + position_id = np.concatenate((position_id, [mask_position] * len(choice))) + block_position_id = np.concatenate((block_position_id, np.arange(1, 1 + len(choice), dtype=np.int64))) + + attention_mask = block_diag(*attention_mask) + attention_mask[division:, :division] = 1 + + return { + "token": token, + "position_id": np.stack((position_id, block_position_id)), + "attention_mask": attention_mask, + "choices": choice_id, + "choice_target_ids": choice_target_id + } + + +def pad_batch(tokens, position_ids, attention_mask, max_seq_length): + pad_length = max_seq_length - len(tokens) + attention_mask = np.pad( + attention_mask, + pad_width=((0, pad_length),), + mode="constant", + constant_values=0, + ) + tokens = np.concatenate((tokens, np.zeros(pad_length, dtype=np.int64))) + position_ids = np.concatenate((position_ids, position_ids[..., -1:].repeat(pad_length, -1)), axis=-1) + return tokens, position_ids, attention_mask + + +def collate_fn(samples): + TILE = 16 + length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE + + token_batch, position_id_batch, attention_mask_batch = [], [], [] + choices_batch, choice_target_ids_batch = [], [] + + for sample in samples: + token, position_id, attention_mask = pad_batch( + sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad + ) + token_batch.append(token) + position_id_batch.append(position_id) + attention_mask_batch.append(attention_mask) + choices_batch.append(sample["choices"]) + choice_target_ids_batch.append(sample["choice_target_ids"]) + + return { + "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), + "position_ids": torch.tensor(np.array(position_id_batch), 
dtype=torch.int64),
+        "attention_mask": torch.tensor(np.array(attention_mask_batch), dtype=torch.int64),
+        "choices": choices_batch,
+        "choice_target_ids": choice_target_ids_batch,
+    }
+
+
+def cond_log_prob(model, tokenizer, context: List[str], choices: List[List[str]]) -> torch.Tensor:
+    """
+    Compute the conditional log-probability of one or more continuation/infilling options.
+    :return: The log probability of each option, summed over its tokens.
+    """
+    if not isinstance(context, list):
+        context = [context]
+        choices = [choices]
+    choices = [[(' ' + choice) for choice in choice_pair] for choice_pair in choices]  # Leading space is required by the SentencePiece tokenizer's word-boundary handling
+
+    samples = [build_multiple_choice_sample(tokenizer, ctx, ch) for ctx, ch in zip(context, choices)]
+
+    batch = collate_fn(samples)
+
+    logits = model.forward(input_ids=batch['tokens'].cuda(),
+                           attention_mask=batch['attention_mask'].cuda().unsqueeze(1),
+                           position_ids=batch['position_ids'].cuda())['logits']
+
+    log_probs = []
+
+    for output, choices, choice_target_ids in zip(F.log_softmax(logits, dim=-1), batch['choices'], batch['choice_target_ids']):
+        log_probs_single = []
+        for choice, choice_target_id in zip(choices, choice_target_ids):
+            tmp = output[choice_target_id, choice]
+            log_probs_single.append(tmp.sum())
+        log_probs.append(torch.stack(log_probs_single))
+
+    return torch.stack(log_probs)
diff --git a/examples/rotten_tomatoes/requirements.txt b/examples/rotten_tomatoes/requirements.txt
new file mode 100644
index 0000000..7118ad2
--- /dev/null
+++ b/examples/rotten_tomatoes/requirements.txt
@@ -0,0 +1,7 @@
+transformers
+scipy
+datasets
+promptsource
+scikit_learn
+sentencepiece
+tqdm
diff --git a/examples/rotten_tomatoes/train_utils.py b/examples/rotten_tomatoes/train_utils.py
new file mode 100644
index 0000000..ed25260
--- /dev/null
+++ b/examples/rotten_tomatoes/train_utils.py
@@ -0,0 +1,53 @@
+import torch
+import numpy as np
+import torch.nn.functional as F
+from tqdm import tqdm
+from eval_utils import evaluate
+from multiple_choice_utils import cond_log_prob, flatten_labels
+
+
+def train(model, tokenizer, train_loader, valid_loader, optimizer, scheduler, ckpt_path, epoch_num, early_stopping=-1):
+
+    best_acc = 0.
+    early_stopping_counter = early_stopping
+
+    for e in range(1, epoch_num + 1):
+        print(f"EPOCH {e}")
+        train_loss_value = 0.
+        tqdm_vars = {"lr": np.nan, "loss": np.nan}
+        tbar = tqdm(enumerate(train_loader, start=1), desc="train", total=len(train_loader),
+                    postfix=tqdm_vars)
+
+        model.train()
+
+        for _, sample in tbar:
+            logits = cond_log_prob(model, tokenizer, sample["inputs_pretokenized"], flatten_labels(sample['choices_pretokenized']))
+            labels = sample["label"].cuda()
+            loss = F.nll_loss(logits, labels)
+            train_loss_value += loss.item()
+
+            loss.backward()
+            optimizer.step()
+            scheduler.step()
+            optimizer.zero_grad()
+
+            tqdm_vars["lr"] = optimizer.state_dict()["param_groups"][0]["lr"]
+            tqdm_vars["loss"] = train_loss_value
+            tbar.set_postfix(tqdm_vars)
+            train_loss_value = 0.
+
+        _, valid_acc = evaluate(model, tokenizer, valid_loader, 'valid')
+
+        if early_stopping >= 0:
+            if valid_acc > best_acc:
+                best_acc = valid_acc
+                early_stopping_counter = early_stopping
+                torch.save(model, ckpt_path)
+            else:
+                early_stopping_counter -= 1
+
+            if early_stopping_counter <= 0:
+                print('EARLY STOPPING...')
+                break
+
+    return torch.load(ckpt_path)