aoyulong · Oct 8, 2022
diff --git a/‎examples/README.md
+1 b/‎examples/README.md
+1
diff --git a/‎examples/question_generation/README.md
+5 b/‎examples/question_generation/README.md
+5
diff --git a/‎examples/question_generation/t5/README.md
+208 b/‎examples/question_generation/t5/README.md
+208
diff --git a/‎examples/question_generation/t5/finetune.py
+324 b/‎examples/question_generation/t5/finetune.py
+324
diff --git a/‎examples/question_generation/t5/finetune_run.sh
+29 b/‎examples/question_generation/t5/finetune_run.sh
+29
diff --git a/‎examples/question_generation/t5/generate.py
+240 b/‎examples/question_generation/t5/generate.py
+240
diff --git a/‎examples/question_generation/t5/generate_run.sh
+29 b/‎examples/question_generation/t5/generate_run.sh
+29
diff --git a/‎examples/question_generation/t5/requirements.txt
+2 b/‎examples/question_generation/t5/requirements.txt
+2
diff --git a/‎examples/question_generation/t5/utils.py
+187 b/‎examples/question_generation/t5/utils.py
+187
diff --git a/‎examples/question_generation/unimo-text/README.md
+309 b/‎examples/question_generation/unimo-text/README.md
+309
diff --git a/‎examples/question_generation/unimo-text/deploy/paddle_inference/README.md
+54 b/‎examples/question_generation/unimo-text/deploy/paddle_inference/README.md
+54
diff --git a/‎examples/question_generation/unimo-text/deploy/paddle_inference/infer_utils.py
+289 b/‎examples/question_generation/unimo-text/deploy/paddle_inference/infer_utils.py
+289
diff --git a/‎examples/question_generation/unimo-text/deploy/paddle_inference/inference.py
+266 b/‎examples/question_generation/unimo-text/deploy/paddle_inference/inference.py
+266
diff --git a/‎examples/question_generation/unimo-text/deploy/paddle_serving/README.md
+150 b/‎examples/question_generation/unimo-text/deploy/paddle_serving/README.md
+150
diff --git a/‎examples/question_generation/unimo-text/deploy/paddle_serving/config.yml
+59 b/‎examples/question_generation/unimo-text/deploy/paddle_serving/config.yml
+59
diff --git a/‎examples/question_generation/unimo-text/deploy/paddle_serving/infer_utils.py
+289 b/‎examples/question_generation/unimo-text/deploy/paddle_serving/infer_utils.py
+289
diff --git a/‎examples/question_generation/unimo-text/deploy/paddle_serving/pipeline_client.py
+54 b/‎examples/question_generation/unimo-text/deploy/paddle_serving/pipeline_client.py
+54
diff --git a/‎examples/question_generation/unimo-text/deploy/paddle_serving/pipeline_service.py
+82 b/‎examples/question_generation/unimo-text/deploy/paddle_serving/pipeline_service.py
+82
@@ -20,6 +20,7 @@ PaddleNLP provides rich application examples covering mainstream NLP task to hel
 | text_correction  |[文本纠错 (Text Correction)](./text_correction/):star: |
 | semantic_indexing | [语义索引 (Semantic Indexing)](./semantic_indexing/)|
 | information_extraction | [信息抽取 (Information Extraction)](./information_extraction/) |
+| question_generation | [问题生成 (Question Generation)](./question_generation/) |
 
 ## NLP 系统应用 (NLP System Applications)
 
 
@@ -0,0 +1,5 @@
+# 问题生成
+
+Question Generation（QG），即问题生成，指的是给定一段上下文和答案，自动生成一个流畅且符合上下文主题的问句。问题生成技术在教育、咨询、搜索、问答等多个领域均有着巨大的应用价值。
+
+PaddleNLP提供英文和中文问题生成任务示例，分别基于英文预训练语言模型[t5](./t5)和中文预训练语言模型[unimo-text](./unimo-text)。
@@ -0,0 +1,208 @@
+# 问题生成(Question Generation)
+
+## 简介
+
+Question Generation（QG），即问题生成，指的是给定一段上下文（passage或sentence），自动生成一个流畅且符合上下文主题的问句。问题生成通常可以分为两个分支，即无答案问题生成（answer-agnostic question generation）和有答案问题生成（answer-aware question generation）。
+
+本项目是T5在 PaddlePaddle上开源实现的有答案问题生成的例子，包含了在SQuAD数据集上微调和生成的代码。
+
+## 快速开始
+
+### 环境依赖
+
+- nltk
+- evaluate
+
+
+安装方式：`pip install -r requirements.txt`
+
+### 代码结构说明
+
+以下是本项目主要代码结构及说明：
+
+```text
+.
+├── finetune.py # 模型微调主程序入口
+├── generate.py # 模型生成主程序入口
+├── utils.py # 定义参数及一些工具函数
+├── requirements.txt # 环境依赖文件
+└── README.md # 文档说明
+```
+
+### 数据准备
+
+#### 数据加载
+**SQuAD**（Stanford Question Answering Dataset）数据集是一个英文问答数据集，现有的问题生成研究主要在该数据集上进行评价。**SQuAD**中的数据由段落、问题、答案3个主要部分组成，其中段落从维基百科中获取，问题和答案通过众包的方式由人工标注。
+
+为了方便用户快速测试，PaddleNLP Dataset API内置了Squad数据集，一键即可完成数据集加载，示例代码如下：
+
+```python
+from paddlenlp.datasets import load_dataset
+train_set, dev_set = load_dataset("squad", splits=["train_v1", "dev_v1"])
+```
+
+#### 数据处理
+针对**SQuAD**数据集，我们需要将QA任务格式的数据进行转换从而得到text2text形式的数据，默认构造方式如下，其他形式输入数据用户可以在convert_example函数中自行定义
+```text
+answer: {answer_text} context: {context_text}
+question: {question_text}
+```
+具体案例如下，
+```text
+answer: the Miller–Rabin primality test context: The property of being prime (or not) is called primality. A simple but slow method of verifying the primality of a given number n is known as trial division. It consists of testing whether n is a multiple of any integer between 2 and . Algorithms much more efficient than trial division have been devised to test the primality of large numbers. These include the Miller–Rabin primality test, which is fast but has a small probability of error, and the AKS primality test, which always produces the correct answer in polynomial time but is too slow to be practical. Particularly fast methods are available for numbers of special forms, such as Mersenne numbers. As of January 2016[update], the largest known prime number has 22,338,618 decimal digits.
+
+question: What is the name of the process which confirms the primality of a number n?
+```
+
+### 模型训练
+
+运行如下命令即可在训练集上进行finetune，并在验证集上进行验证
+
+```shell
+# GPU启动，参数`--gpus`指定训练所用的GPU卡号，可以是单卡，也可以多卡
+# 例如使用1号和2号卡，则：`--gpus 1,2`
+unset CUDA_VISIBLE_DEVICES
+python -m paddle.distributed.launch --gpus 1,2 finetune.py \
+    --model_name_or_path=t5-base \
+    --dataset_name=squad \
+    --output_dir=output \
+    --max_source_length=1024 \
+    --max_target_length=142 \
+    --learning_rate=1e-4 \
+    --num_train_epochs=6 \
+    --logging_steps=100 \
+    --save_steps=1000 \
+    --seed=42 \
+    --train_batch_size=20 \
+    --eval_batch_size=64 \
+    --warmup_proportion=0.1 \
+    --ignore_pad_token_for_loss=True \
+    --device=gpu
+```
+
+其中参数释义如下：
+- `gpus` 指示了训练所用的GPU
+
+- `model_name_or_path` 指示了finetune使用的预训练模型，可以是PaddleNLP提供的预训练模型，或者是本地的模型。如果使用本地的模型，则配置为本地模型的目录地址，例如: ./checkpoints/model_xx/，目录中需包含paddle模型参数model_state.pdparams。如果使用PaddleNLP提供的预训练模型，可以选择下面其中之一。
+
+   | PaddleNLP提供的预训练模型        |
+   |---------------------------------|
+   | t5-base |
+   | t5-large |
+
+- `dataset_name` 表示训练的数据集。
+
+- `output_dir` 表示模型的保存路径。
+
+- `max_source_length` 表示输入序列的长度，超过该长度将被截断。
+
+- `max_target_length` 表示输出的最大长度。
+
+- `learning_rate` 表示基础学习率大小，将与learning rate scheduler产生的值相乘作为当前学习率。
+
+- `num_train_epochs` 表示训练轮数。
+
+- `epochs` 表示训练轮数。
+
+- `logging_steps` 表示日志打印间隔。
+
+- `save_steps` 表示模型保存及评估间隔。
+
+- `seed` 表示随机数生成器的种子。
+
+- `train_batch_size` 表示训练每张卡上的样本数目。
+
+- `eval_batch_size` 表示预测单卡上的样本数目。
+
+- `warmup_proportion` 表示warmup_steps所占总步数的比例。学习率逐渐升高到基础学习率（即上面配置的learning_rate）所需要的迭代数。
+
+- `device` 表示使用的设备。
+
+程序运行时将会自动进行训练和验证，训练过程中会自动保存模型在指定的`output_dir`中。如：
+
+```text
+./output/
+├── t5_model_1000.pdparams
+│   ├── model_config.json
+│   ├── model_state.pdparams
+│   ├── special_tokens_map.json
+│   ├── spiece.model
+│   └── tokenizer_config.json
+└── ...
+```
+
+**NOTE:** 如需恢复模型训练，只需指定`model_name_or_path`为本地微调模型的路径即可。
+
+### 模型预测
+
+运行如下命令即可在验证集上进行测试
+
+```shell
+# GPU启动，预测仅支持单卡
+export CUDA_VISIBLE_DEVICES=0
+python generate.py \
+    --model_name_or_path=mrm8488/t5-base-finetuned-question-generation-ap \
+    --dataset_name=squad \
+    --output_path=generate.txt \
+    --max_source_length=1024 \
+    --max_target_length=142 \
+    --decode_strategy=greedy_search \
+    --top_k=2 \
+    --top_p=1.0 \
+    --num_beams=1 \
+    --length_penalty=0.0 \
+    --batch_size=64 \
+    --seed=42 \
+    --ignore_pad_token_for_loss=True \
+    --logging_steps=100 \
+    --device=gpu
+```
+
+其中参数释义如下：
+- `model_name_or_path` 指示了预测使用的模型，可以是PaddleNLP提供的预训练模型，或者是本地的模型。如果使用本地的模型，则配置为本地模型的目录地址，例如: ./checkpoints/model_xx/，目录中需包含paddle模型参数model_state.pdparams。如果使用PaddleNLP提供的预训练模型，可以选择下面其中之一。
+
+   | PaddleNLP提供的预训练模型        |
+   |---------------------------------|
+   | t5-base |
+   | t5-large |
+   | mrm8488/t5-base-finetuned-question-generation-ap |
+
+- `dataset_name` 表示预测的数据集。
+
+- `output_path` 表示预测结果的保存路径。
+
+- `max_source_length` 表示输入序列的长度，超过该长度将被截断。
+
+- `max_target_length` 表示输出的最大长度。
+
+- `decode_strategy` 表示预测解码时采取的策略，可选"sampling"、"greedy_search"和"beam_search"之一。
+
+- `top_k` 表示采用"sampling"解码策略时，token的概率按从大到小排序，生成的token只从前`top_k`个中进行采样。
+
+- `top_p` 表示采用"sampling"解码策略时，从词表中采样并选择概率之和大于给定阈值`top_p`的token。
+
+- `num_beams` 表示beam search的beam size。
+
+- `length_penalty` 表示beam search生成长度的指数惩罚。
+
+- `batch_size` 表示每次迭代**单卡**上的样本数目。
+
+- `seed` 表示随机数生成器的种子。
+
+- `logging_steps` 表示日志打印间隔。
+
+- `device` 表示使用的设备。
+
+程序运行结束后会将预测生成的问题保存在`output_path`中。同时终端中会输出评估结果。
+
+采用社区微调模型mrm8488/t5-base-finetuned-question-generation-ap在验证集上有如下结果：
+
+|   model_name_or_path    |     BLEU-1     |     BLEU-2     |    BLEU-3    |    BLEU-4    |
+| :----------------------: | :-------------: | :-------------: |:-------------: |:-------------: |
+|        [mrm8488/t5-base-finetuned-question-generation-ap](https://huggingface.co/mrm8488/t5-base-finetuned-question-generation-ap )      | 50.11 | 35.83 | 27.68 |  22.03 |
+
+
+
+
+## 参考文献
+1. Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W. and Liu, P.J., 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res., 21(140), pp.1-67.
@@ -0,0 +1,324 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import argparse
+import random
+import time
+import distutils.util
+from pprint import pprint
+from functools import partial
+from tqdm import tqdm
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+from paddle.io import BatchSampler, DistributedBatchSampler, DataLoader
+from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer
+from paddlenlp.transformers import LinearDecayWithWarmup
+from paddlenlp.utils.log import logger
+from paddlenlp.datasets import load_dataset
+from paddlenlp.data import Tuple, Stack, Pad
+from utils import convert_example, compute_metrics
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument("--model_name_or_path",
+                        default="t5-base",
+                        type=str,
+                        required=True,
+                        help="Path to pre-trained model. ")
+    parser.add_argument(
+        "--dataset_name",
+        default="squad",
+        type=str,
+        required=True,
+        help="The name of the dataset to use. Selected in the list: " + "squad")
+    parser.add_argument(
+        "--output_dir",
+        default="output",
+        type=str,
+        required=True,
+        help=
+        "The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--max_source_length",
+        default=1024,
+        type=int,
+        help="The maximum total input sequence length after "
+        "tokenization.Sequences longer than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument(
+        "--min_target_length",
+        default=0,
+        type=int,
+        help=
+        "The minimum total sequence length for target text when generating. ")
+    parser.add_argument(
+        "--max_target_length",
+        default=142,
+        type=int,
+        help="The maximum total sequence length for target text after "
+        "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded."
+        "during ``evaluate`` and ``predict``.",
+    )
+    parser.add_argument("--learning_rate",
+                        default=1e-4,
+                        type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--num_train_epochs",
+        default=3,
+        type=int,
+        help="Total number of training epochs to perform.",
+    )
+    parser.add_argument("--logging_steps",
+                        type=int,
+                        default=100,
+                        help="Log every X updates steps.")
+    parser.add_argument("--save_steps",
+                        type=int,
+                        default=100,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--train_batch_size",
+        default=20,
+        type=int,
+        help="Batch size per GPU/CPU for training.",
+    )
+    parser.add_argument(
+        "--eval_batch_size",
+        default=12,
+        type=int,
+        help="Batch size per GPU/CPU for evaluation.",
+    )
+    parser.add_argument("--weight_decay",
+                        default=0.0,
+                        type=float,
+                        help="Weight decay if we apply some.")
+    parser.add_argument(
+        "--warmup_steps",
+        default=0,
+        type=int,
+        help=
+        "Linear warmup over warmup_steps. If > 0: Override warmup_proportion")
+    parser.add_argument("--warmup_proportion",
+                        default=0.1,
+                        type=float,
+                        help="Linear warmup proportion over total steps.")
+    parser.add_argument("--adam_epsilon",
+                        default=1e-6,
+                        type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help=
+        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--seed",
+                        default=42,
+                        type=int,
+                        help="random seed for initialization")
+    parser.add_argument(
+        "--device",
+        default="gpu",
+        type=str,
+        choices=["cpu", "gpu", "xpu"],
+        help="The device to select to train the model, is must be cpu/gpu/xpu.")
+    parser.add_argument("--use_amp",
+                        default=False,
+                        type=distutils.util.strtobool,
+                        help="Enable mixed precision training.")
+    parser.add_argument("--scale_loss",
+                        default=2**15,
+                        type=float,
+                        help="The value of scale_loss for fp16.")
+    args = parser.parse_args()
+    return args
+
+
+def set_seed(args):
+    # Use the same data seed(for data shuffle) for all procs to guarantee data
+    # consistency after sharding.
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    # Maybe different op seeds(for dropout) for different procs is better. By:
+    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
+    paddle.seed(args.seed)
+
+
+@paddle.no_grad()
+def evaluate(model, data_loader, tokenizer, ignore_pad_token_for_loss,
+             min_target_length, max_target_length):
+    model.eval()
+    all_preds = []
+    all_labels = []
+    model = model._layers if isinstance(model, paddle.DataParallel) else model
+    for batch in tqdm(data_loader, total=len(data_loader), desc="Eval step"):
+        input_ids, _, _, labels = batch
+        preds = model.generate(input_ids=input_ids,
+                               min_length=min_target_length,
+                               max_length=max_target_length,
+                               use_cache=True)[0]
+        all_preds.extend(preds.numpy())
+        all_labels.extend(labels.numpy())
+    bleu_result, decoded_preds, decoded_labels = compute_metrics(
+        all_preds, all_labels, tokenizer, ignore_pad_token_for_loss)
+    logger.info(bleu_result)
+    model.train()
+
+
+def do_train(args):
+    """Fine-tune a T5 model on the configured dataset, evaluating as it goes.
+
+    The function loads the tokenizer and model named by
+    ``args.model_name_or_path``, builds train/dev data loaders from the
+    ``train_v1``/``dev_v1`` splits, then runs the optimisation loop. Every
+    ``args.save_steps`` steps (and at the last step) it evaluates on the dev
+    set and rank 0 writes a checkpoint directory under ``args.output_dir``.
+
+    Args:
+        args: Parsed command-line options (see ``parse_args``).
+            NOTE(review): this function reads ``args.ignore_pad_token_for_loss``;
+            verify that ``parse_args`` defines that option.
+    """
+    paddle.set_device(args.device)
+    # Initialise the parallel environment only for multi-process launches.
+    if paddle.distributed.get_world_size() > 1:
+        paddle.distributed.init_parallel_env()
+
+    set_seed(args)
+    tokenizer = T5Tokenizer.from_pretrained(args.model_name_or_path)
+    model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
+    # Bind the fixed conversion settings so the dataset can map raw examples.
+    trans_func = partial(
+        convert_example,
+        tokenizer=tokenizer,
+        decoder_start_token_id=model.t5.bos_token_id,
+        max_source_length=args.max_source_length,
+        max_target_length=args.max_target_length,
+        ignore_pad_token_for_loss=args.ignore_pad_token_for_loss)
+    logger.info("Loading train and dev dataset: %s" % args.dataset_name)
+    train_set, dev_set = load_dataset(args.dataset_name,
+                                      splits=["train_v1", "dev_v1"])
+    logger.info("Loaded train and dev dataset: %s" % args.dataset_name)
+    train_set = train_set.map(trans_func, lazy=True)
+    train_batch_sampler = DistributedBatchSampler(
+        train_set, batch_size=args.train_batch_size, shuffle=True)
+
+    # Collate function: pads each of the four per-example fields with the
+    # tokenizer's pad token id.
+    # NOTE(review): attention_mask is padded with pad_token_id as well;
+    # presumably that id is 0 for T5 — confirm.
+    batchify_fn = lambda samples, fn=Tuple(
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input_ids
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
+            ),  # attention_mask
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
+            ),  # decoder_input_ids
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # labels
+    ): fn(samples)
+    train_data_loader = DataLoader(dataset=train_set,
+                                   batch_sampler=train_batch_sampler,
+                                   num_workers=0,
+                                   collate_fn=batchify_fn,
+                                   return_list=True)
+    dev_set = dev_set.map(trans_func, lazy=True)
+    # The dev set uses a plain (non-distributed) sampler, so every process
+    # iterates the whole dev set during evaluation.
+    dev_batch_sampler = BatchSampler(dev_set,
+                                     batch_size=args.eval_batch_size,
+                                     shuffle=False)
+    dev_data_loader = DataLoader(dataset=dev_set,
+                                 batch_sampler=dev_batch_sampler,
+                                 num_workers=0,
+                                 collate_fn=batchify_fn,
+                                 return_list=True)
+
+    if paddle.distributed.get_world_size() > 1:
+        model = paddle.DataParallel(model)
+
+    # A positive max_steps overrides the epoch-derived step count.
+    num_training_steps = args.max_steps if args.max_steps > 0 else (
+        len(train_data_loader) * args.num_train_epochs)
+    # Either an absolute step count (warmup_steps > 0) or the proportion.
+    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
+
+    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
+                                         warmup)
+
+    # Generate parameter names needed to perform weight decay.
+    # All bias and LayerNorm parameters are excluded.
+    decay_params = [
+        p.name for n, p in model.named_parameters()
+        if not any(nd in n for nd in ["bias", "norm"])
+    ]
+    optimizer = paddle.optimizer.AdamW(
+        learning_rate=lr_scheduler,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=args.adam_epsilon,
+        parameters=model.parameters(),
+        weight_decay=args.weight_decay,
+        apply_decay_param_fun=lambda x: x in decay_params)
+
+    # The gradient scaler is only created when mixed precision is enabled.
+    if args.use_amp:
+        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
+    global_step = 0
+    tic_train = time.time()
+    for epoch in tqdm(range(args.num_train_epochs), desc="Epoch"):
+        for step, batch in tqdm(enumerate(train_data_loader),
+                                desc="Train step",
+                                total=len(train_data_loader)):
+            global_step += 1
+            input_ids, attention_mask, decoder_input_ids, labels = batch
+            with paddle.amp.auto_cast(
+                    args.use_amp,
+                    custom_white_list=["layer_norm", "softmax", "gelu"]):
+                output = model(input_ids,
+                               attention_mask,
+                               decoder_input_ids,
+                               labels=labels)
+                # The first element of the model output is used as the loss.
+                loss = output[0]
+            if args.use_amp:
+                scaled_loss = scaler.scale(loss)
+                scaled_loss.backward()
+                scaler.minimize(optimizer, scaled_loss)
+            else:
+                loss.backward()
+                optimizer.step()
+            lr_scheduler.step()
+            optimizer.clear_grad()
+            if global_step % args.logging_steps == 0:
+                logger.info(
+                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
+                    % (global_step, num_training_steps, epoch, step,
+                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
+                       args.logging_steps / (time.time() - tic_train)))
+                # Restart the timer so the speed covers one logging window.
+                tic_train = time.time()
+            # Evaluate and checkpoint periodically and at the very last step.
+            if global_step % args.save_steps == 0 or global_step == num_training_steps:
+                tic_eval = time.time()
+                evaluate(model, dev_data_loader, tokenizer,
+                         args.ignore_pad_token_for_loss, args.min_target_length,
+                         args.max_target_length)
+                logger.info("eval done total : %s s" % (time.time() - tic_eval))
+                # Only rank 0 writes the checkpoint directory.
+                if paddle.distributed.get_rank() == 0:
+                    output_dir = os.path.join(
+                        args.output_dir, "t5_model_%d.pdparams" % global_step)
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    # Need better way to get inner model of DataParallel
+                    model_to_save = model._layers if isinstance(
+                        model, paddle.DataParallel) else model
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+            # Stop as soon as the planned number of steps has been reached.
+            if global_step >= num_training_steps:
+                return
+    # Reached only when the epoch loop finishes without the early return above
+    # (i.e. fewer than num_training_steps steps were run).
+    if paddle.distributed.get_rank() == 0:
+        output_dir = os.path.join(args.output_dir,
+                                  "t5_model_final_%d.pdparams" % global_step)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+        # Need better way to get inner model of DataParallel
+        model_to_save = model._layers if isinstance(
+            model, paddle.DataParallel) else model
+        model_to_save.save_pretrained(output_dir)
+        tokenizer.save_pretrained(output_dir)
+
+
+# Script entry point: parse the options, print them, then run training.
+if __name__ == "__main__":
+    args = parse_args()
+    pprint(args)
+    do_train(args)
@@ -0,0 +1,29 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+python -m paddle.distributed.launch --gpus 4,5,6,7 finetune.py \
+    --model_name_or_path=t5-base \
+    --dataset_name=squad \
+    --output_dir=output \
+    --max_source_length=1024 \
+    --max_target_length=142 \
+    --learning_rate=1e-4 \
+    --num_train_epochs=6 \
+    --logging_steps=100 \
+    --save_steps=1000 \
+    --seed=42 \
+    --train_batch_size=8 \
+    --eval_batch_size=64 \
+    --warmup_proportion=0.1 \
+    --device=gpu
@@ -0,0 +1,240 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import argparse
+import random
+import time
+from functools import partial
+from pprint import pprint
+import numpy as np
+import paddle
+from paddle.io import BatchSampler, DataLoader
+from paddlenlp.datasets import load_dataset
+from paddlenlp.data import Tuple, Stack, Pad
+from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer
+from utils import convert_example, compute_metrics
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument("--model_name_or_path",
+                        default="t5-base",
+                        type=str,
+                        required=True,
+                        help="Path to pre-trained model. ")
+    parser.add_argument(
+        "--dataset_name",
+        default="squad",
+        type=str,
+        required=True,
+        help="The name of the dataset to use. Selected in the list: " + "squad")
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default='generate.txt',
+        help='The file path where the infer result will be saved.')
+    parser.add_argument(
+        "--max_source_length",
+        default=1024,
+        type=int,
+        help="The maximum total input sequence length after "
+        "tokenization.Sequences longer than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument(
+        "--min_target_length",
+        default=0,
+        type=int,
+        help=
+        "The minimum total sequence length for target text when generating. ")
+    parser.add_argument(
+        "--max_target_length",
+        default=142,
+        type=int,
+        help="The maximum total sequence length for target text after "
+        "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded."
+        "during ``evaluate`` and ``predict``.",
+    )
+    parser.add_argument('--decode_strategy',
+                        default='greedy_search',
+                        type=str,
+                        help='The decode strategy in generation.')
+    parser.add_argument(
+        '--top_k',
+        default=2,
+        type=int,
+        help=
+        'The number of highest probability vocabulary tokens to keep for top-k sampling.'
+    )
+    parser.add_argument('--top_p',
+                        default=1.0,
+                        type=float,
+                        help='The cumulative probability for top-p sampling.')
+    parser.add_argument('--num_beams',
+                        default=1,
+                        type=int,
+                        help='The number of beams for beam search.')
+    parser.add_argument(
+        '--length_penalty',
+        default=0.6,
+        type=float,
+        help='The exponential penalty to the sequence length for beam search.')
+    parser.add_argument(
+        '--early_stopping',
+        default=False,
+        type=eval,
+        help=
+        'Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.'
+    )
+    parser.add_argument("--diversity_rate",
+                        default=0.0,
+                        type=float,
+                        help="The diversity of beam search. ")
+    parser.add_argument(
+        '--faster',
+        action='store_true',
+        help='Whether to process inference using faster transformer. ')
+    parser.add_argument(
+        '--use_fp16_decoding',
+        action='store_true',
+        help=
+        'Whether to use fp16 when using faster transformer. Only works when using faster transformer. '
+    )
+    parser.add_argument(
+        "--batch_size",
+        default=64,
+        type=int,
+        help="Batch size per GPU/CPU for testing or evaluation.")
+    parser.add_argument("--seed",
+                        default=42,
+                        type=int,
+                        help="random seed for initialization")
+    parser.add_argument(
+        "--device",
+        default="gpu",
+        type=str,
+        choices=["cpu", "gpu", "xpu"],
+        help="The device to select to train the model, is must be cpu/gpu/xpu.")
+    parser.add_argument("--logging_steps",
+                        type=int,
+                        default=100,
+                        help="Log every X updates steps.")
+    parser.add_argument("--is_debug",
+                        default=False,
+                        type=bool,
+                        help="Whether to debug.")
+    args = parser.parse_args()
+    return args
+
+
+def set_seed(args):
+    # Use the same data seed(for data shuffle) for all procs to guarantee data
+    # consistency after sharding.
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    # Maybe different op seeds(for dropout) for different procs is better. By:
+    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
+    paddle.seed(args.seed)
+
+
+@paddle.no_grad()
+def generate(args):
+    """Generate questions for the ``dev_v1`` split and write them to disk.
+
+    Loads the tokenizer and model named by ``args.model_name_or_path``, runs
+    ``model.generate`` over the whole split, prints the BLEU result returned
+    by ``compute_metrics`` and writes the decoded predictions to
+    ``args.output_path`` and the decoded references to
+    ``args.output_path + '.reference.txt'``.
+
+    Args:
+        args: Parsed command-line options (see ``parse_args``).
+            NOTE(review): this function reads ``args.ignore_pad_token_for_loss``;
+            verify that ``parse_args`` defines that option.
+            NOTE(review): ``args.use_fp16_decoding`` is not read here.
+    """
+    paddle.set_device(args.device)
+    set_seed(args)
+    tokenizer = T5Tokenizer.from_pretrained(args.model_name_or_path)
+    model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
+    dataset = load_dataset(args.dataset_name, splits=["dev_v1"])
+    # dataset = load_dataset(args.dataset_name, splits=["dev_v2"])
+    # Bind the fixed conversion settings; is_train=False selects the
+    # inference-time example layout (five fields, see batchify_fn below).
+    trans_func = partial(
+        convert_example,
+        tokenizer=tokenizer,
+        decoder_start_token_id=model.t5.bos_token_id,
+        max_source_length=args.max_source_length,
+        max_target_length=args.max_target_length,
+        ignore_pad_token_for_loss=args.ignore_pad_token_for_loss,
+        is_train=False)
+
+    # Collate function: pads each of the five per-example fields; the third
+    # field is padded with -100, the others with the tokenizer's pad token id.
+    batchify_fn = lambda samples, fn=Tuple(
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input_ids
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
+            ),  # attention_mask
+        Pad(axis=0, pad_val=-100, dtype="int64"),  # mem_seq_lens
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
+            ),  # decoder_input_ids
+        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # labels
+    ): fn(samples)
+
+    dataset = dataset.map(trans_func, lazy=True)
+
+    # debug
+    # In debug mode only the first 20 examples are kept.
+    if args.is_debug:
+        dataset.data = dataset.data[:20]
+        dataset.new_data = dataset.new_data[:20]
+
+    batch_sampler = BatchSampler(dataset,
+                                 batch_size=args.batch_size,
+                                 shuffle=False)
+    data_loader = DataLoader(dataset=dataset,
+                             batch_sampler=batch_sampler,
+                             num_workers=0,
+                             collate_fn=batchify_fn,
+                             return_list=True)
+    data_loader.pin_memory = False
+
+    model.eval()
+    total_time = 0.0
+    start_time = time.time()
+    all_preds = []
+    all_labels = []
+    for step, batch in enumerate(data_loader):
+        # Only input_ids and labels are used; mem_seq_lens is unpacked but unused.
+        input_ids, _, mem_seq_lens, _, labels = batch
+        preds, _ = model.generate(input_ids=input_ids,
+                                  max_length=args.max_target_length,
+                                  min_length=args.min_target_length,
+                                  decode_strategy=args.decode_strategy,
+                                  top_k=args.top_k,
+                                  top_p=args.top_p,
+                                  num_beams=args.num_beams,
+                                  length_penalty=args.length_penalty,
+                                  early_stopping=args.early_stopping,
+                                  diversity_rate=args.diversity_rate,
+                                  use_faster=args.faster)
+        # Accumulate generation time since the previous batch finished.
+        total_time += (time.time() - start_time)
+        # NOTE(review): the report at step 0 divides one step's time by
+        # logging_steps, so the first printed value is not a true per-step time.
+        if step % args.logging_steps == 0:
+            print('step %d - %.3fs/step' %
+                  (step, total_time / args.logging_steps))
+            total_time = 0.0
+        all_preds.extend(preds.numpy())
+        all_labels.extend(labels.numpy())
+        # Restart the timer after post-processing so it is excluded.
+        start_time = time.time()
+
+    bleu_result, decoded_preds, decoded_labels = compute_metrics(
+        all_preds, all_labels, tokenizer, args.ignore_pad_token_for_loss)
+    print("BLEU result: ", bleu_result)
+    # Each decoded item is joined with single spaces and written on its own line.
+    with open(args.output_path, 'w', encoding='utf-8') as fout:
+        for decoded_pred in decoded_preds:
+            fout.write(' '.join(decoded_pred) + '\n')
+    print('Save generated result into: %s' % args.output_path)
+    with open(args.output_path + '.reference.txt', 'w',
+              encoding='utf-8') as fout:
+        for decoded_label in decoded_labels:
+            fout.write(' '.join(decoded_label) + '\n')
+    print('Save referenced labels into: %s' % args.output_path +
+          '.reference.txt')
+
+
+# Script entry point: parse the options, print them, then run generation.
+if __name__ == '__main__':
+    args = parse_args()
+    pprint(args)
+    generate(args)
@@ -0,0 +1,29 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+python generate.py \
+    --model_name_or_path=mrm8488/t5-base-finetuned-question-generation-ap \
+    --dataset_name=squad \
+    --output_path=generate.txt \
+    --max_source_length=1024 \
+    --max_target_length=142 \
+    --decode_strategy=greedy_search \
+    --top_k=2 \
+    --top_p=1.0 \
+    --num_beams=1 \
+    --length_penalty=0.0 \
+    --batch_size=64 \
+    --seed=42 \
+    --logging_steps=20 \
+    --device=gpu
@@ -0,0 +1,2 @@
+nltk==3.6.2
+evaluate==0.2.2
@@ -0,0 +1,187 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import nltk
+from paddlenlp.metrics import BLEU
+import evaluate
+
+
+def convert_example(example,
+                    tokenizer,
+                    decoder_start_token_id,
+                    max_source_length,
+                    max_target_length,
+                    ignore_pad_token_for_loss=True,
+                    is_train=True):
+    """
+    Convert a example into necessary features.
+    """
+    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+    # in one example possible giving several features when a context is long, each of those features having a
+    # context that overlaps a bit the context of the previous feature.
+    #NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
+    # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead.
+    context = example['context']
+    question = example['question']
+    try:
+        answer = example['answers'][0]
+    except:
+        print(example['context'])
+        print(example['question'])
+        print(example['answers'])
+        print(example['answer_starts'])
+        print(example['is_impossible'])
+
+    input_seq = f'answer: {answer} context: {context} </s>'
+    output_seq = f'question: {question} </s>'
+
+    labels = tokenizer(
+        output_seq,
+        max_seq_len=max_target_length,
+        pad_to_max_seq_len=True,
+        truncation_strategy="longest_first",
+    )
+
+    output_ids = [decoder_start_token_id] + labels["input_ids"][:-1]
+
+    if ignore_pad_token_for_loss:
+        labels["input_ids"] = [(l if l != tokenizer.pad_token_id else -100)
+                               for l in labels["input_ids"]]
+
+    if is_train:
+        input_ids = tokenizer(input_seq,
+                              max_seq_len=max_source_length,
+                              pad_to_max_seq_len=True,
+                              truncation_strategy="longest_first",
+                              return_attention_mask=True,
+                              return_length=False)
+        return input_ids["input_ids"], input_ids[
+            "attention_mask"], output_ids, labels["input_ids"]
+    else:
+        input_ids = tokenizer(input_seq,
+                              max_seq_len=max_source_length,
+                              pad_to_max_seq_len=True,
+                              truncation_strategy="longest_first",
+                              return_attention_mask=True,
+                              return_length=True)
+        return input_ids["input_ids"], input_ids["attention_mask"], \
+        input_ids["length"], output_ids, labels["input_ids"]
+
+
+def compute_metrics(preds, labels, tokenizer, ignore_pad_token_for_loss=True):
+
+    def compute_bleu(predictions,
+                     references,
+                     rouge_types=None,
+                     use_stemmer=True):
+        bleu1 = BLEU(n_size=1)
+        bleu2 = BLEU(n_size=2)
+        bleu3 = BLEU(n_size=3)
+        bleu4 = BLEU(n_size=4)
+        assert len(predictions) == len(references)
+        for i in range(len(predictions)):
+            bleu1.add_inst(predictions[i], [references[i]])
+            bleu2.add_inst(predictions[i], [references[i]])
+            bleu3.add_inst(predictions[i], [references[i]])
+            bleu4.add_inst(predictions[i], [references[i]])
+        result = {
+            'BLEU-1': bleu1.score() * 100,
+            'BLEU-2': bleu2.score() * 100,
+            'BLEU-3': bleu3.score() * 100,
+            'BLEU-4': bleu4.score() * 100
+        }
+        return result
+
+    def compute_bleu_hf(predictions,
+                        references,
+                        rouge_types=None,
+                        use_stemmer=True):
+        predictions = [' '.join(prediction) for prediction in predictions]
+        references = [[' '.join(reference)] for reference in references]
+
+        bleu = evaluate.load("bleu")
+        assert len(predictions) == len(references)
+        bleu1_results = bleu.compute(predictions=predictions,
+                                     references=references,
+                                     max_order=1)
+        bleu2_results = bleu.compute(predictions=predictions,
+                                     references=references,
+                                     max_order=2)
+        bleu3_results = bleu.compute(predictions=predictions,
+                                     references=references,
+                                     max_order=3)
+        bleu4_results = bleu.compute(predictions=predictions,
+                                     references=references,
+                                     max_order=4)
+
+        result = {
+            'BLEU-1': bleu1_results['bleu'] * 100,
+            'BLEU-2': bleu2_results['bleu'] * 100,
+            'BLEU-3': bleu3_results['bleu'] * 100,
+            'BLEU-4': bleu4_results['bleu'] * 100
+        }
+        return result
+
+    def post_process_text(preds, labels):
+        preds = [pred.strip() for pred in preds]
+        labels = [label.strip() for label in labels]
+        preds = [pred.strip('question:') for pred in preds]
+        labels = [label.strip('question:') for label in labels]
+        spreds = [pred.strip() for pred in preds]
+        labels = [label.strip() for label in labels]
+
+        #  expects newline after each sentence
+        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
+        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
+
+        preds = [pred.split() for pred in preds]
+        labels = [label.split() for label in labels]
+
+        return preds, labels
+
+    def post_process_seq(seq,
+                         bos_idx,
+                         eos_idx,
+                         output_bos=False,
+                         output_eos=False):
+        """
+        Post-process the decoded sequence.
+        """
+        eos_pos = len(seq) - 1
+        for i, idx in enumerate(seq):
+            if idx == eos_idx:
+                eos_pos = i
+                break
+        seq = [
+            idx for idx in seq[:eos_pos + 1]
+            if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)
+        ]
+        return seq
+
+    if ignore_pad_token_for_loss:
+        labels = np.asarray(labels)
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+    decoded_preds, decoded_labels = [], []
+    for pred, label in zip(preds, labels):
+        pred_id = post_process_seq(pred, tokenizer.bos_token_id,
+                                   tokenizer.eos_token_id)
+        label_id = post_process_seq(label, tokenizer.bos_token_id,
+                                    tokenizer.eos_token_id)
+        decoded_preds.append(tokenizer.decode(pred_id))
+        decoded_labels.append(tokenizer.decode(label_id))
+    decoded_preds, decoded_labels = post_process_text(decoded_preds,
+                                                      decoded_labels)
+    # bleu_result = compute_bleu(decoded_preds, decoded_labels)
+    bleu_result = compute_bleu_hf(decoded_preds, decoded_labels)
+    return bleu_result, decoded_preds, decoded_labels
@@ -0,0 +1,309 @@
+# 问题生成
+
+
+**目录**
+- [问题生成](#问题生成)
+  - [简介](#简介)
+    - [基于预训练语言模型的问题生成](#基于预训练语言模型的问题生成)
+  <!-- - [效果展示](#效果展示)
+  - [开箱即用](#开箱即用) -->
+  - [训练定制](#训练定制)
+    - [环境依赖](#环境依赖)
+    - [代码结构说明](#代码结构说明)
+    - [问题生成应用定制训练全流程介绍](#问题生成定制训练全流程介绍)
+    - [数据准备](#数据准备)
+      - [数据加载](#数据加载)
+      - [数据处理](#数据处理)
+      - [从本地文件创建数据集（可选）](#从本地文件创建数据集（可选）)
+    - [模型训练](#模型训练)
+    - [模型预测](#模型预测)
+    - [模型转换部署](#模型转换部署)
+      - [FasterTransformer加速及模型静态图导出](#fastertransformer加速及模型静态图导出)
+      - [模型部署](#模型部署)
+  - [References](#references)
+
+## 简介
+Question Generation（QG），即问题生成，指的是给定一段上下文，自动生成一个流畅且符合上下文主题的问句。问题生成通常可以分为，无答案问题生成和有答案问题生成，这里只关注应用更广的有答案问题生成。
+
+问题生成技术在教育、咨询、搜索、推荐等多个领域均有着巨大的应用价值。具体来说，问题生成可广泛应用于问答系统语料库构建，事实性问题生成，教育行业题库生成，对话提问，聊天机器人意图理解，对话式搜索意图提问，闲聊机器人主动提问等等场景。
+
+### 基于预训练语言模型的问题生成
+
+基于预训练语言模型（Pretrained Language Models, PLMs）范式的问题生成是目前最常用、效果最好(SOTA)的方式。
+预训练模型是在超大规模的语料采用无监督或者弱监督的方式进行预训练，能够学习如何准确地理解自然语言并以自然语言的形式流畅表达，这两项都是完成文本生成任务的重要能力。
+
+PaddleNLP提供了方便易用的接口，可指定模型名或模型参数文件路径通过from_pretrained()方法加载不同网络结构的预训练模型，且相应预训练模型权重下载速度快速、稳定。
+Transformer预训练模型汇总包含了如 ERNIE、BERT、T5、UNIMO等主流预训练模型。下面以中文unimo-text-1.0模型为例，演示如何加载预训练模型和分词器：
+```
+from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOTokenizer
+model_name = "unimo-text-1.0"
+model = UNIMOLMHeadModel.from_pretrained(model_name)
+tokenizer = UNIMOTokenizer.from_pretrained(model_name)
+```
+<!--
+## 效果展示
+
+## 开箱即用 -->
+
+## 训练定制
+
+### 环境依赖
+- nltk
+- evaluate
+- tqdm
+
+安装方式：`pip install -r requirements.txt`
+
+### 代码结构说明
+
+以下是本项目主要代码结构及说明：
+
+```text
+├── deploy # 部署
+│   ├── paddle_inference # PaddleInference高性能推理部署
+│   │   ├── inference.py # 推理部署脚本
+│   │   ├── infer_utils.py # 推理部署工具函数
+│   │   └── README.md # 说明文档
+│   └── paddle_serving
+│       ├── config.yml # 配置文件
+│       ├── pipeline_client.py # 客户端程序
+│       ├── pipeline_service.py # 服务器程序
+│       └── README.md # 说明文档
+├── export_model.py # 动态图参数导出静态图参数脚本
+├── train.py # 训练评估脚本
+├── utils.py # 工具函数脚本
+└── README.md # 说明文档
+```
+
+### 问题生成定制训练全流程介绍
+接下来，我们将按数据准备、训练、预测、推理部署等四个阶段对问题生成应用的全流程进行介绍。
+1. **数据准备**
+- 如果没有已标注的数据集，我们推荐doccano数据标注工具([doccano](https://github.com/doccano/doccano))。
+- 如果已有标注好的本地数据集，我们需要将数据集整理为文档要求的格式，请参考[从本地文件创建数据集（可选）](#从本地文件创建数据集（可选）)。
+
+2. **模型训练**
+
+- 数据准备完成后，可以开始使用我们的数据集对预训练模型进行微调训练。我们可以根据任务需求，调整可配置参数，选择使用GPU或CPU进行模型训练，脚本默认保存在开发集最佳表现模型。中文任务默认使用"unimo-text-1.0"模型，unimo-text-1.0还支持large模型，详见[UNIMO模型汇总](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers/UNIMO/contents.html)，可以根据任务和设备需求进行选择。
+
+
+3. **模型预测**
+
+- 训练结束后，我们可以加载保存的最佳模型进行模型测试，打印模型预测结果。
+
+4. **模型转换部署**
+- 在现实部署场景中，我们通常不仅对模型的精度表现有要求，也需要考虑模型性能上的表现。我们可以使用模型裁剪进一步压缩模型体积，问题生成应用已提供裁剪API对上一步微调后的模型进行裁剪，模型裁剪之后会默认导出静态图模型。
+
+- 模型部署需要将保存的最佳模型参数（动态图）导出成静态图参数，用于后续的推理部署。
+
+- 问题生成应用提供了基于Paddle Inference的本地部署predictor，并且支持在GPU设备使用Faster Generation进行加速。
+
+- 问题生成应用提供了基于Paddle Serving的服务端部署方案。
+
+### 数据准备
+#### 数据加载
+[**DuReader_QG**数据集](https://www.luge.ai/#/luge/dataDetail?id=8)是一个中文问答数据集，我们使用该数据集作为应用案例进行实验。**DuReader_QG**中的数据主要由上下文、问题、答案3个主要部分组成，其任务描述为给定上下文p和答案a，生成自然语言表述的问题q，且该问题符合段落和上下文的限制。
+
+为了方便用户快速测试，PaddleNLP Dataset API内置了DuReader_QG数据集，一键即可完成数据集加载，示例代码如下：
+
+```python
+from paddlenlp.datasets import load_dataset
+train_ds, dev_ds = load_dataset('dureader_qg', splits=('train', 'dev'))
+```
+
+#### 数据处理
+针对**DuReader_QG**数据集，我们需要将QA任务格式的数据进行转换从而得到text2text形式的数据，我们默认使用模版的方式构造输入数据，默认模版如下，其他形式输入数据用户可以在convert_example函数中自行定义。
+```text
+答案: <answer_text> 上下文: <context_text>
+问题: <question_text>
+```
+
+#### 从本地文件创建数据集（可选）
+在许多情况下，我们需要使用本地数据集来训练我们的问题生成模型，本项目支持使用固定格式本地数据集文件进行训练。
+使用本地文件，只需要在模型训练时指定`train_file` 为本地训练数据地址，`predict_file` 为本地测试数据地址即可。
+
+本地数据集目录结构如下：
+
+```text
+data/
+├── train.json # 训练数据集文件
+├── dev.json # 开发数据集文件
+└── test.json # 可选，待预测数据文件
+```
+本地数据集文件格式如下：
+- train.json/dev.json/test.json 文件格式：
+```text
+{
+  "source": <context_text>,
+  "title": <answer_text>,
+  "target": <question_text>,
+}
+...
+```
+- train.json/dev.json/test.json 文件样例：
+```text
+{
+  "source": "欠条是永久有效的,未约定还款期限的借款合同纠纷,诉讼时效自债权人主张债权之日起计算,时效为2年。 根据《中华人民共和国民法通则》第一百三十五条:向人民法院请求保护民事权利的诉讼时效期间为二年,法律另有规定的除外。 第一百三十七条:诉讼时效期间从知道或者应当知道权利被侵害时起计算。但是,从权利被侵害之日起超过二十年的,人民法院不予保护。有特殊情况的,人民法院可以延长诉讼时效期间。 第六十二条第(四)项:履行期限不明确的,债务人可以随时履行,债权人也可以随时要求履行,但应当给对方必要的准备时间。",
+  "title": "永久有效",
+  "target": "欠条的有效期是多久"
+}
+...
+```
+
+更多数据集读取格式详见[数据集加载](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_load.html#)和[自定义数据集](https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_self_defined.html)。
+
+### 模型训练
+运行如下命令即可在样例训练集上进行finetune，并在样例验证集上进行验证。
+```shell
+# GPU启动，参数`--gpus`指定训练所用的GPU卡号，可以是单卡，也可以多卡
+# 例如使用1号和2号卡，则：`--gpus "1,2"`
+unset CUDA_VISIBLE_DEVICES
+python -m paddle.distributed.launch --gpus "1,2" --log_dir ./unimo/finetune/log run_gen.py \
+    --dataset_name=dureader_qg \
+    --model_name_or_path="unimo-text-1.0" \
+    --save_dir=./unimo/finetune/checkpoints \
+    --output_path ./unimo/finetune/predict.txt \
+    --logging_steps=100 \
+    --save_steps=500 \
+    --epochs=20 \
+    --batch_size=16 \
+    --learning_rate=1e-5 \
+    --warmup_propotion=0.02 \
+    --weight_decay=0.01 \
+    --max_seq_len=512 \
+    --max_target_len=30 \
+    --do_train \
+    --do_predict \
+    --max_dec_len=20 \
+    --min_dec_len=3 \
+    --num_return_sequences=1 \
+    --adversarial_training=None \
+    --template=1 \
+    --device=gpu
+```
+
+
+关键参数释义如下：
+- `gpus` 指示了训练所用的GPU，使用多卡训练可以指定多个GPU卡号，例如 --gpus "0,1"。
+- `dataset_name` 数据集名称，默认为`dureader_qg`。
+- `train_file` 本地训练数据地址，数据格式必须与`dataset_name`所指数据集格式相同，默认为None。
+- `predict_file` 本地测试数据地址，数据格式必须与`dataset_name`所指数据集格式相同，默认为None。
+- `model_name_or_path` 指示了finetune使用的具体预训练模型，可以是PaddleNLP提供的预训练模型，或者是本地的预训练模型。如果使用本地的预训练模型，可以配置本地模型的目录地址，例如: ./checkpoints/model_xx/，目录中需包含paddle预训练模型model_state.pdparams。如果使用PaddleNLP提供的预训练模型，可以选择下面其中之一。
+   | 可选预训练模型        |
+   |---------------------------------|
+   | unimo-text-1.0      |
+   | unimo-text-1.0-large |
+
+   <!-- | T5-PEGASUS |
+   | ernie-1.0 |
+   | ernie-gen-base-en |
+   | ernie-gen-large-en |
+   | ernie-gen-large-en-430g | -->
+
+- `save_dir` 表示模型的保存路径。
+- `output_path` 表示预测结果的保存路径。
+- `logging_steps` 表示日志打印间隔。
+- `save_steps` 表示模型保存及评估间隔。
+- `seed` 表示随机数生成器的种子。
+- `epochs` 表示训练轮数。
+- `batch_size` 表示每次迭代**每张卡**上的样本数目。
+- `learning_rate` 表示基础学习率大小，将与learning rate scheduler产生的值相乘作为当前学习率。
+- `weight_decay` 表示AdamW优化器中使用的weight_decay的系数。
+- `warmup_propotion` 表示学习率逐渐升高到基础学习率（即上面配置的learning_rate）所需要的迭代数占总步数的比例。
+- `max_seq_len` 模型输入序列的最大长度。
+- `max_target_len` 模型训练时标签的最大长度。
+- `min_dec_len` 模型生成序列的最小长度。
+- `max_dec_len` 模型生成序列的最大长度。
+- `do_train` 是否进行训练。
+- `do_predict` 是否进行预测，在验证集上会自动评估。
+- `device` 表示使用的设备，从gpu和cpu中选择。
+- `adversarial_training` 表示使用何种对抗训练策略，从['None', 'fgm', 'pgd']中选择。
+- `template` 表示使用的模版，从[0, 1, 2, 3]中选择，0表示不选择模版，1表示使用默认模版。
+
+程序运行时将会自动进行训练和验证，训练过程中会自动保存模型在指定的`save_dir`中。如：
+
+```text
+./unimo/finetune/checkpoints
+├── model_1000
+│   ├── model_config.json
+│   ├── model_state.pdparams
+│   ├── special_tokens_map.json
+│   ├── tokenizer_config.json
+│   └── vocab.txt
+└── ...
+```
+
+**NOTE:** 如需恢复模型训练，`model_name_or_path`配置本地模型的目录地址即可。
+
+### 模型预测
+
+运行下方脚本可以使用训练好的模型进行预测。
+
+```shell
+export CUDA_VISIBLE_DEVICES=0
+python -u run_gen.py \
+    --dataset_name=dureader_qg \
+    --model_name_or_path=your_model_path \
+    --output_path=./predict.txt \
+    --logging_steps=100 \
+    --batch_size=16 \
+    --max_seq_len=512 \
+    --max_target_len=30 \
+    --do_predict \
+    --max_dec_len=20 \
+    --min_dec_len=3 \
+    --template=1 \
+    --device=gpu
+```
+关键参数释义如下：
+- `output_path` 表示预测输出结果保存的文件路径，默认为./predict.txt。
+
+
+Finetuned baseline的模型在DuReaderQG验证集上有如下结果(指标为BLEU-4)：
+
+|       model_name        | DuReaderQG |
+| :-----------------------------: | :-----------: |
+|   finetuned unimo-text-1.0    | 41.08 |
+
+### 模型转换部署
+
+#### FasterTransformer加速及模型静态图导出
+
+使用动态图训练结束之后，可以通过[静态图导出脚本](export_model.py)实现基于FasterTransformer的高性能预测加速，并将动态图参数导出成静态图参数，静态图参数保存在`inference_model_dir`指定路径中。运行方式：
+
+```shell
+python export_model.py \
+    --model_name_or_path ./checkpoint \
+    --inference_model_dir ./export_checkpoint \
+    --max_out_len 64 \
+    --use_fp16_decoding
+```
+关键参数释义如下：
+
+* `model_name_or_path`：动态图训练保存的参数路径；默认为"./checkpoint"。
+* `inference_model_dir`：静态图保存的参数路径；默认为"./export_checkpoint"。
+* `max_out_len`：最大输出长度。
+* `use_fp16_decoding`:是否使用fp16解码进行预测。
+
+执行命令后将会自动导出模型到指定的 `inference_model_dir` 中，保存模型文件结构如下所示：
+
+```text
+├── unimo_text.pdiparams
+├── unimo_text.pdiparams.info
+└── unimo_text.pdmodel
+```
+
+#### 模型部署
+本项目提供多种不同场景的部署方案，请根据实际情况进行选择：
+|部署方案|特色|场景|硬件|
+|-|-|-|-|
+|Paddle Inference<br>服务端／云端|通用性|模型算法复杂<br>硬件高性能|X86 CPU<br>NVIDIA 全系列 GPU<br>龙芯／飞腾等国产CPU<br>昆仑／昇腾／海光DCU等AI加速芯片
+|Paddle Serving<br>服务化|高并发|大流量、高并发、低延时、高吞吐<br>资源弹性调控应对服务流量变化<br>支持模型组合、加密、热更新等|X86/Arm CPU<br>NVIDIA GPU<br>昆仑／昇腾等
+
+
+问题生成应用已打通多种场景部署方案，点击链接获取具体的使用教程。
+- [Paddle Inference 推理 (Python)](./deploy/paddle_inference/README.md)
+- [Paddle Serving 服务化部署（Python）](./deploy/paddle_serving/README.md)
+
+
+## References
+Zheng, Chujie, and Minlie Huang. "Exploring prompt-based few-shot learning for grounded dialog generation." arXiv preprint arXiv:2109.06513 (2021).
+Li, Wei, et al. "Unimo: Towards unified-modal understanding and generation via cross-modal contrastive learning." arXiv preprint arXiv:2012.15409 (2020).
@@ -0,0 +1,54 @@
+# Paddle Inference部署
+本文档将介绍如何使用[Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/guides/introduction/index_intro.html#paddle-inference)工具进行问题生成应用高性能推理部署。
+
+**目录**
+   * [背景介绍](#背景介绍)
+   * [导出预测部署模型](#导出预测部署模型)
+   * [基于Python预测](#基于Python预测)
+
+
+## 背景介绍
+Paddle inference和主框架的Model.predict均可实现推理预测，Paddle Inference 是飞桨的原生推理库， 作用于服务器端和云端，提供高性能的推理能力，主框架的Model 对象是一个具备训练、测试、推理的神经网络。相比于Model.predict，inference可使用MKLDNN、CUDNN、TensorRT进行预测加速。Model.predict适用于训练好的模型直接进行预测，paddle inference适用于对推理性能、通用性有要求的用户，针对不同平台不同的应用场景进行了深度的适配优化，保证模型在服务器端即训即用，快速部署。由于 Paddle Inference 能力直接基于飞桨的训练算子，因此它支持飞桨训练出的所有模型的推理。
+
+
+
+Paddle Inference Python端预测部署主要包含两个步骤：
+- 导出预测部署模型
+- 基于Python预测
+
+
+## 导出预测部署模型
+部署时需要使用预测格式的模型（即动态图转静态图操作）。预测格式模型相对训练格式模型而言，在拓扑上裁剪掉了预测不需要的算子，并且会做特定部署优化。具体操作详见[FasterTransformer加速及模型静态图导出](../../README.md)。
+
+## 基于Python预测
+<!-- 同上，高性能预测的默认输入和输出形式也为文件，可分别通过 test_path 和 save_path 进行指定，通过如下命令便可以基于Paddle Inference 进行高性能预测： -->
+
+在终端输入以下命令可在GPU上进行预测：
+```shell
+python deploy/paddle_inference/inference.py \
+               --inference_model_dir ./export_checkpoint \
+               --model_name_or_path "unimo-text-1.0" \
+               --predict_file predict_file_name \
+               --output_path output_path_name \
+               --device gpu
+```
+
+<!-- 在终端输入以下命令可在CPU上进行预测：
+```shell
+python deploy/paddle_inference/inference_unimo_text.py --inference_model_dir ./export_checkpoint --device cpu
+``` -->
+经静态图转换，FasterTransformer性能优化，Paddle Inference加速后的部署模型在dureader_qg devset的预测时间为27.74秒，相较于未优化前169.24秒，耗时缩减为原来的16.39%。
+关键参数释义如下：
+* `inference_model_dir`：用于高性能推理的静态图模型参数路径，默认为"./export_checkpoint"。
+* `model_name_or_path`：tokenizer对应模型或路径，默认为"unimo-text-1.0"。
+* `dataset_name`：数据集名称，默认为`dureader_qg`。
+* `predict_file`：本地预测数据地址，数据格式必须与`dataset_name`所指数据集格式相同，默认为None，当为None时默认加载`dataset_name`的dev集。
+* `output_path`：表示预测结果的保存路径。
+* `device`：推理时使用的设备，可选项["gpu"]，默认为"gpu"。
+* `batch_size`：进行推理时的批大小，默认为16。
+* `precision`：当使用TensorRT进行加速推理时，所使用的TensorRT精度，可选项["fp32", "fp16"]，默认为"fp32"。
+<!-- * `precision`：当使用TensorRT进行加速推理时，所使用的TensorRT精度，可选项["fp32", "fp16", "int8"]，默认为"fp32"。 -->
+<!-- * `device`：推理时使用的设备，可选项["gpu", "cpu", "xpu"]，默认为"gpu"。 -->
+<!-- * `enable_mkldnn`：当使用cpu时，选择是否使用MKL-DNN(oneDNN)进行加速推理，默认为False。 -->
+<!-- * `cpu_threads`：当使用cpu时，推理所用的进程数，默认为10。 -->
+<!-- * `use_tensorrt`：当使用gpu时，选择是否使用TensorRT进行加速推理，默认为False。 -->
@@ -0,0 +1,289 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+from functools import partial
+
+import numpy as np
+from numpy import array
+
+import paddle
+import paddle.distributed as dist
+from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler
+from paddlenlp.data import Pad
+
+
+def postprocess_response(token_ids, tokenizer):
+    """Post-process the decoded sequence. Truncate from the first <eos>."""
+    eos_pos = len(token_ids)
+    for i, tok_id in enumerate(token_ids):
+        if tok_id == tokenizer.mask_token_id:
+            eos_pos = i
+            break
+    token_ids = token_ids[:eos_pos]
+    tokens = tokenizer.convert_ids_to_tokens(token_ids)
+    tokens = tokenizer.merge_subword(tokens)
+    return tokens
+
+
+def print_args(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def set_seed(seed):
+    # Use the same data seed(for data shuffle) for all procs to guarantee data
+    # consistency after sharding.
+    random.seed(seed)
+    np.random.seed(seed)
+    # Maybe different op seeds(for dropout) for different procs is better.
+    paddle.seed(seed + dist.get_rank())
+
+
+def convert_example(example,
+                    tokenizer,
+                    max_seq_len=512,
+                    max_target_len=128,
+                    max_title_len=256,
+                    mode='test',
+                    template=0):
+    """Convert all examples into necessary features."""
+    if mode == 'pretrain' or mode == 'pretrain_test':
+        context = example['context']
+        answer = example['answer']
+        target = example['target']
+
+        source = '答案：' + answer + tokenizer.sep_token + '上下文：' + context
+        title = None
+
+    elif mode == 'train' or mode == 'test':
+        target = None
+        if 'source' in example and 'title' in example:
+            source = example['source']
+            title = None
+            if 'title' in example.keys():
+                title = example['title']
+        elif 'context' in example and 'answer' in example:
+            source = example['context']
+            title = None
+            if 'answer' in example.keys():
+                title = example['answer']
+        else:
+            assert False, "Source and title are not in the input dictionary, nor are context and answer."
+        if 'target' in example.keys():
+            target = example['target']
+
+        if template == 1:
+            source = '答案：' + title + tokenizer.sep_token + '上下文：' + source
+            title = None
+            if target:
+                target = '问题：' + target
+        elif template == 2:
+            source = '答案：' + title + tokenizer.sep_token + '上下文：' + source
+            title = None
+            if target:
+                target = '在已知答案的前提下，问题：' + target
+        elif template == 3:
+            source = '这是一个问题生成任务，根据提供的答案和上下文，来生成问题。' + title + tokenizer.sep_token + '上下文：' + source
+            title = None
+            if target:
+                target = '问题：' + target
+
+    if mode == 'train' or mode == 'pretrain':
+        tokenized_example = tokenizer.gen_encode(source,
+                                                 title=title,
+                                                 target=target,
+                                                 max_seq_len=max_seq_len,
+                                                 max_target_len=max_target_len,
+                                                 max_title_len=max_title_len,
+                                                 return_position_ids=True,
+                                                 return_length=True)
+        target_start = tokenized_example['input_ids'].index(
+            tokenizer.cls_token_id, 1)
+        target_end = tokenized_example['seq_len']
+        # Use to gather the logits corresponding to the labels during training
+        tokenized_example['masked_positions'] = list(
+            range(target_start, target_end - 1))
+        tokenized_example['labels'] = tokenized_example['input_ids'][
+            target_start + 1:target_end]
+
+        return tokenized_example
+
+    elif mode == 'test' or mode == 'pretrain_test':
+        tokenized_example = tokenizer.gen_encode(
+            source,
+            title=title,
+            max_seq_len=max_seq_len,
+            max_title_len=max_title_len,
+            add_start_token_for_decoding=True,
+            return_position_ids=True,
+            return_length=True,
+        )
+
+        if 'target' in example and example['target']:
+            tokenized_example['target'] = example['target']
+        return tokenized_example
+
+
+def batchify_fn(batch_examples, pad_val, mode='test'):
+    """
+    Collate a list of tokenized examples into left-padded NumPy batches.
+
+    Args:
+        batch_examples: List of dicts with ``input_ids``, ``token_type_ids``,
+            ``position_ids``, ``attention_mask`` and ``seq_len``; training
+            modes also read ``masked_positions`` and ``labels``.
+        pad_val: Value used to pad the id sequences.
+        mode: ``'train'`` / ``'pretrain'`` or ``'test'`` / ``'pretrain_test'``.
+
+    Returns:
+        Training modes: ``(input_ids, token_type_ids, position_ids,
+        attention_mask, masked_positions, labels)``.
+        Test modes: ``(input_ids, token_type_ids, position_ids,
+        attention_mask, seq_len)``.
+        ``None`` for any other ``mode``.
+    """
+
+    def pad_mask(batch_attention_mask):
+        # Build a (batch, 1, max_len, max_len) float mask pre-filled with
+        # -1e9 and copy each example's own mask into the bottom-right corner,
+        # matching the left padding applied to the id sequences below.
+        batch_size = len(batch_attention_mask)
+        max_len = max(map(len, batch_attention_mask))
+        attention_mask = np.ones(
+            (batch_size, max_len, max_len), dtype='float32') * -1e9
+        for i, mask_data in enumerate(attention_mask):
+            seq_len = len(batch_attention_mask[i])
+            mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i],
+                                                       dtype='float32')
+        # In order to ensure the correct broadcasting mechanism, expand one
+        # dimension to the second dimension (n_head of Transformer).
+        attention_mask = np.expand_dims(attention_mask, axis=1)
+        return attention_mask
+
+    # pad_right=False: padding is added on the left of every sequence.
+    pad_func = Pad(pad_val=pad_val, pad_right=False, dtype='int64')
+
+    input_ids = pad_func([example['input_ids'] for example in batch_examples])
+    token_type_ids = pad_func(
+        [example['token_type_ids'] for example in batch_examples])
+    position_ids = pad_func(
+        [example['position_ids'] for example in batch_examples])
+
+    attention_mask = pad_mask(
+        [example['attention_mask'] for example in batch_examples])
+
+    seq_len = np.asarray([example['seq_len'] for example in batch_examples],
+                         dtype='int32')
+
+    if mode == 'train' or mode == 'pretrain':
+        max_len = max([example['seq_len'] for example in batch_examples])
+        # Shift each example's positions by its left-padding amount and by
+        # i * max_len, then concatenate across the batch.
+        # NOTE(review): presumably these index a flattened
+        # (batch * max_len) view of the model output - confirm with caller.
+        masked_positions = np.concatenate([
+            np.array(example['masked_positions']) +
+            (max_len - example['seq_len']) + i * max_len
+            for i, example in enumerate(batch_examples)
+        ])
+        labels = np.concatenate([
+            np.array(example['labels'], dtype='int64')
+            for example in batch_examples
+        ])
+        return input_ids, token_type_ids, position_ids, attention_mask, masked_positions, labels
+    elif mode == 'test' or mode == 'pretrain_test':
+        return input_ids, token_type_ids, position_ids, attention_mask, seq_len
+
+
+def create_data_loader(dataset, tokenizer, args, mode='test'):
+    trans_func = partial(convert_example,
+                         tokenizer=tokenizer,
+                         mode='test',
+                         template=1)
+    dataset = dataset.map(trans_func, lazy=True)
+    if mode == 'pretrain':
+        batch_sampler = DistributedBatchSampler(dataset,
+                                                batch_size=args.batch_size,
+                                                shuffle=True)
+    elif mode == 'train':
+        batch_sampler = DistributedBatchSampler(dataset,
+                                                batch_size=args.batch_size,
+                                                shuffle=True)
+    elif mode == 'test' or mode == 'pretrain_test':
+        batch_sampler = BatchSampler(dataset,
+                                     batch_size=args.batch_size // 2,
+                                     shuffle=False)
+    collate_fn = partial(batchify_fn, pad_val=tokenizer.pad_token_id, mode=mode)
+    data_loader = DataLoader(dataset,
+                             batch_sampler=batch_sampler,
+                             collate_fn=collate_fn,
+                             return_list=True)
+    return dataset, data_loader
+
+
+def post_process_sum(token_ids, tokenizer):
+    """Post-process the decoded sequence. Truncate from the first <eos>."""
+    eos_pos = len(token_ids)
+    for i, tok_id in enumerate(token_ids):
+        if tok_id == tokenizer.mask_token_id:
+            eos_pos = i
+            break
+    token_ids = token_ids[:eos_pos]
+    tokens = tokenizer.convert_ids_to_tokens(token_ids)
+    tokens = tokenizer.merge_subword(tokens)
+    special_tokens = ['[UNK]']
+    tokens = [token for token in tokens if token not in special_tokens]
+    return token_ids, tokens
+
+
+def remove_template(instr):
+    """Remove template prefix of decoded sequence."""
+    outstr = instr.strip('问题：')
+    outstr = instr.strip('在已知答案的前提下，问题：')
+    return outstr
+
+
+def select_sum(ids,
+               scores,
+               tokenizer,
+               max_dec_len=None,
+               num_return_sequences=1):
+    results = []
+    group = []
+    tmp = []
+    if scores is not None:
+        ids = ids.numpy()
+        scores = scores.numpy()
+
+        if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0:
+            raise ValueError(
+                "the length of `ids` is {}, but the `num_return_sequences` is {}"
+                .format(len(ids), num_return_sequences))
+
+        for pred, score in zip(ids, scores):
+            pred_token_ids, pred_tokens = post_process_sum(pred, tokenizer)
+            num_token = len(pred_token_ids)
+
+            target = "".join(pred_tokens)
+            target = remove_template(target)
+
+            # not ending
+            if max_dec_len is not None and num_token >= max_dec_len:
+                score -= 1e3
+
+            tmp.append([target, score])
+            if len(tmp) == num_return_sequences:
+                group.append(tmp)
+                tmp = []
+
+        for preds in group:
+            preds = sorted(preds, key=lambda x: -x[1])
+            results.append(preds[0][0])
+    else:
+        ids = ids.numpy()
+
+        for pred in ids:
+            pred_token_ids, pred_tokens = post_process_sum(pred, tokenizer)
+            num_token = len(pred_token_ids)
+            response = "".join(pred_tokens)
+            response = remove_template(response)
+
+            # TODO: Support return scores in FT.
+            tmp.append([response])
+            if len(tmp) == num_return_sequences:
+                group.append(tmp)
+                tmp = []
+
+        for preds in group:
+            results.append(preds[0][0])
+
+    return results
@@ -0,0 +1,266 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import numpy as np
+from pprint import pprint
+
+import paddle
+from paddle import inference
+from paddlenlp.datasets import load_dataset
+
+from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOTokenizer
+from paddlenlp.ops.ext_utils import load
+from infer_utils import print_args, set_seed, create_data_loader, select_sum, postprocess_response, convert_example
+import os
+import time
+
+
+def setup_args():
+    """Setup arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--inference_model_dir",
+                        default="./infer_model",
+                        type=str,
+                        help="Path to save inference model of UNIMOText. ")
+    parser.add_argument('--model_name_or_path',
+                        type=str,
+                        default='unimo-text-1.0',
+                        help='The path or shortcut name of the tokenizer.')
+    parser.add_argument("--device",
+                        default="gpu",
+                        choices=["gpu", "cpu", "xpu"],
+                        help="Device selected for inference.")
+    parser.add_argument(
+        "--use_tensorrt",
+        default=False,
+        type=eval,
+        choices=[True, False],
+        help="Whether to use inference engin TensorRT when using gpu.")
+    parser.add_argument('--enable_mkldnn',
+                        default=False,
+                        type=eval,
+                        choices=[True, False],
+                        help='Enable to use mkldnn to speed up when using cpu.')
+    parser.add_argument('--cpu_threads',
+                        default=10,
+                        type=int,
+                        help='Number of threads to predict when using cpu.')
+    parser.add_argument("--precision",
+                        default="fp32",
+                        type=str,
+                        choices=["fp32", "fp16", "int8"],
+                        help='The tensorrt precision.')
+    parser.add_argument("--batch_size",
+                        type=int,
+                        default=16,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default='./predict.txt',
+        help='The file path where the infer result will be saved.')
+    parser.add_argument('--logging_steps',
+                        type=int,
+                        default=100,
+                        help='Log every X updates steps.')
+    parser.add_argument('--dataset_name',
+                        type=str,
+                        default='dureader_qg',
+                        help='The name of the dataset to load.')
+    parser.add_argument("--predict_file",
+                        type=str,
+                        required=False,
+                        default=None,
+                        help="Predict data path.")
+    parser.add_argument('--max_dec_len',
+                        type=int,
+                        default=20,
+                        help='The maximum sequence length of decoding.')
+    parser.add_argument(
+        '--num_return_sequences',
+        type=int,
+        default=1,
+        help='The numbers of returned sequences for one input in generation.')
+
+    args = parser.parse_args()
+    return args
+
+
+def setup_predictor(args):
+    """Setup inference predictor."""
+    # Load FasterTransformer lib.
+    load("FasterTransformer", verbose=True)
+    model_file = os.path.join(args.inference_model_dir, "unimo_text.pdmodel")
+    params_file = os.path.join(args.inference_model_dir, "unimo_text.pdiparams")
+    if not os.path.exists(model_file):
+        raise ValueError("not find model file path {}".format(model_file))
+    if not os.path.exists(params_file):
+        raise ValueError("not find params file path {}".format(params_file))
+    config = inference.Config(model_file, params_file)
+    if args.device == "gpu":
+        config.enable_use_gpu(100, 0)
+        config.switch_ir_optim()
+        config.enable_memory_optim()
+        config.disable_glog_info()
+
+        precision_map = {
+            "fp16": inference.PrecisionType.Half,
+            "fp32": inference.PrecisionType.Float32,
+            "int8": inference.PrecisionType.Int8
+        }
+        precision_mode = precision_map[args.precision]
+        if args.use_tensorrt:
+            config.enable_tensorrt_engine(max_batch_size=args.batch_size,
+                                          min_subgraph_size=30,
+                                          precision_mode=precision_mode)
+    elif args.device == "cpu":
+        config.disable_gpu()
+        if args.enable_mkldnn:
+            config.enable_mkldnn()
+            config.set_mkldnn_cache_capacity(10)
+
+        config.set_cpu_math_library_num_threads(args.cpu_threads)
+    elif args.device == "xpu":
+        config.enable_xpu(100)
+    predictor = inference.create_predictor(config)
+    return predictor
+
+
+@paddle.no_grad()
+def infer_one(args, predictor, inputs=None):
+    """Use predictor to inference."""
+    tokenizer = UNIMOTokenizer.from_pretrained('unimo-text-1.0')
+
+    if not inputs:
+        inputs = {
+            "context":
+            "奇峰黄山千米以上的山峰有77座，整座黄山就是一座花岗岩的峰林，自古有36大峰，36小峰，最高峰莲花峰、最险峰天都峰和观日出的最佳点光明顶构成黄山的三大主峰。",
+            "answer": "莲花峰"
+        }
+
+    inputs = '答案：' + inputs['answer'] + tokenizer.sep_token + '上下文：' + inputs[
+        'context']
+    data = tokenizer.gen_encode(inputs,
+                                add_start_token_for_decoding=True,
+                                return_length=True,
+                                is_split_into_words=False)
+
+    input_handles = {}
+    for name in predictor.get_input_names():
+        input_handles[name] = predictor.get_input_handle(name)
+        if name == "attention_mask":
+            input_handles[name].copy_from_cpu(
+                np.expand_dims(np.asarray(data[name], dtype="float32"),
+                               axis=(0, 1)))
+        else:
+            input_handles[name].copy_from_cpu(
+                np.asarray(data[name], dtype="int32").reshape([1, -1]))
+
+    output_handles = [
+        predictor.get_output_handle(name)
+        for name in predictor.get_output_names()
+    ]
+
+    predictor.run()
+
+    output = [output_handle.copy_to_cpu() for output_handle in output_handles]
+
+    for sample in output[0][:, :, 0].tolist():
+        print("".join(postprocess_response(sample, tokenizer)))
+
+
+@paddle.no_grad()
+def infer(args, predictor, data_loader, tokenizer):
+    """Run batched inference over ``data_loader`` and save the predictions.
+
+    For every batch the inputs are copied into the predictor's input handles,
+    the predictor is run, and ``select_sum`` turns the first two outputs
+    (ids and scores) into one text per example.  All texts are written to
+    ``args.output_path`` (one per line).  If the first dataset example has a
+    ``'target'`` key, the targets are written to
+    ``args.output_path + '.reference.txt'`` as well.
+
+    Args:
+        args: Namespace providing ``logging_steps``, ``max_dec_len``,
+            ``num_return_sequences`` and ``output_path``.
+        predictor: Predictor whose inputs are fed and which is then run.
+        data_loader: Iterable yielding 5-tuples ``(input_ids, token_type_ids,
+            position_ids, attention_mask, seq_len)``; must expose ``dataset``.
+        tokenizer: Tokenizer forwarded to ``select_sum``.
+    """
+    print('Infer begin...')
+    pred_ref = []
+    total_time = 0.0
+    start_time = time.time()
+    for step, inputs in enumerate(data_loader, 1):
+        input_ids, token_type_ids, position_ids, attention_mask, seq_len = inputs
+        # Index the batch by the names the predictor asks for.
+        data = {
+            'input_ids': input_ids,
+            'token_type_ids': token_type_ids,
+            'position_ids': position_ids,
+            'attention_mask': attention_mask,
+            'seq_len': seq_len
+        }
+
+        input_handles = {}
+        for name in predictor.get_input_names():
+            input_handles[name] = predictor.get_input_handle(name)
+            # The attention mask is fed as float32, everything else as int32.
+            if name == "attention_mask":
+                input_handles[name].copy_from_cpu(
+                    np.asarray(data[name], dtype="float32"))
+            else:
+                input_handles[name].copy_from_cpu(
+                    np.asarray(data[name], dtype="int32"))
+
+        output_handles = [
+            predictor.get_output_handle(name)
+            for name in predictor.get_output_names()
+        ]
+
+        predictor.run()
+
+        output = [
+            output_handle.copy_to_cpu() for output_handle in output_handles
+        ]
+
+        # First output holds the generated ids, second output the scores.
+        ids = output[0]
+        scores = output[1]
+
+        # Keep only index 0 along the second axis of the ids.
+        # NOTE(review): presumably selects the top candidate — confirm the
+        # layout of the exported model's output.
+        ids = paddle.to_tensor(ids, dtype='int32')[:, 0, :]
+        scores = paddle.to_tensor(scores, dtype='float32')
+
+        # The measured interval covers fetching the batch and running the
+        # predictor; the post-processing below is excluded because
+        # ``start_time`` is reset only after it.
+        total_time += (time.time() - start_time)
+        if step % args.logging_steps == 0:
+            print('step %d - %.3fs/step' %
+                  (step, total_time / args.logging_steps))
+            total_time = 0.0
+
+        results = select_sum(ids, scores, tokenizer, args.max_dec_len,
+                             args.num_return_sequences)
+
+        pred_ref.extend(results)
+        start_time = time.time()
+
+    # One prediction per line.
+    with open(args.output_path, 'w', encoding='utf-8') as fout:
+        for ref in pred_ref:
+            fout.write(ref + '\n')
+
+    print('\nSave inference result into: %s' % args.output_path)
+
+    # Only the first example is inspected to decide whether references exist.
+    if 'target' in data_loader.dataset[0].keys():
+        with open(args.output_path + '.reference.txt', 'w',
+                  encoding='utf-8') as fout:
+            targets = [example['target'] for example in data_loader.dataset]
+            for target in targets:
+                fout.write(target + '\n')
+
+
+if __name__ == "__main__":
+    # Parse the command line and echo the resulting namespace.
+    args = setup_args()
+    pprint(args)
+
+    # Build the predictor and the tokenizer named by --model_name_or_path.
+    predictor = setup_predictor(args)
+    tokenizer = UNIMOTokenizer.from_pretrained(args.model_name_or_path)
+    # Load the 'dev' split; --predict_file is forwarded as ``data_files``.
+    ds = load_dataset(args.dataset_name,
+                      splits='dev',
+                      data_files=args.predict_file)
+    ds, data_loader = create_data_loader(ds, tokenizer, args, 'test')
+
+    # Time the whole run, including writing the result files.
+    time_begin = time.time()
+    infer(args, predictor, data_loader, tokenizer)
+    print('inference cost time:', time.time() - time_begin)
@@ -0,0 +1,150 @@
+# Paddle Serving服务化部署
+
+本文档将介绍如何使用[Paddle Serving](https://github.com/PaddlePaddle/Serving/blob/develop/README_CN.md)工具部署问题生成在线服务。
+
+## 目录
+- [Paddle Serving服务化部署](#paddle-serving服务化部署)
+  - [目录](#目录)
+  - [背景介绍](#背景介绍)
+  - [环境准备](#环境准备)
+    - [安装Paddle Serving](#安装paddle-serving)
+    <!-- - [安装FasterTokenizer文本处理加速库（可选）](#安装fastertokenizer文本处理加速库可选) -->
+  - [模型转换](#模型转换)
+  - [pipeline部署](#pipeline部署)
+    - [修改配置文件](#修改配置文件)
+    - [server启动服务](#server启动服务)
+    - [client发送服务请求](#client发送服务请求)
+
+## 背景介绍
+Paddle Serving 依托深度学习框架 PaddlePaddle 旨在帮助深度学习开发者和企业提供高性能、灵活易用的工业级在线推理服务。Paddle Serving 支持 RESTful、gRPC、bRPC 等多种协议，提供多种异构硬件和多种操作系统环境下推理解决方案，和多种经典预训练模型示例。集成高性能服务端推理引擎 Paddle Inference 和端侧引擎 Paddle Lite。设计并实现基于有向无环图(DAG) 的异步流水线高性能推理框架，具有多模型组合、异步调度、并发推理、动态批量、多卡多流推理、请求缓存等特性。
+
+Paddle Serving Python端预测部署主要包含以下步骤：
+- 环境准备
+- 模型转换
+- 部署模型
+
+## 环境准备
+### 安装Paddle Serving
+安装client和serving app，用于向服务发送请求:
+```shell
+pip install paddle_serving_app paddle_serving_client
+```
+安装server，用于启动服务，根据服务器设备选择安装CPU server或GPU server：
+
+- 安装CPU server
+```shell
+pip install paddle_serving_server
+```
+- 安装GPU server, 注意选择跟本地环境一致的命令
+```shell
+# CUDA10.2 + Cudnn7 + TensorRT6
+pip install paddle-serving-server-gpu==0.8.3.post102 # -i https://pypi.tuna.tsinghua.edu.cn/simple
+# CUDA10.1 + TensorRT6
+pip install paddle-serving-server-gpu==0.8.3.post101 # -i https://pypi.tuna.tsinghua.edu.cn/simple
+# CUDA11.2 + TensorRT8
+pip install paddle-serving-server-gpu==0.8.3.post112 # -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+**NOTE:**
+- 可以开启国内清华镜像源来加速下载
+- 如果要安装最新版本的PaddleServing参考[链接](https://github.com/PaddlePaddle/Serving/blob/develop/doc/Latest_Packages_CN.md)。
+
+
+<!-- ### 安装FasterTokenizer文本处理加速库（可选）
+如果部署环境是Linux，推荐安装faster_tokenizer可以得到更极致的文本处理效率，进一步提升服务性能。目前暂不支持Windows设备安装，将会在下个版本支持。
+```shell
+pip install faster_tokenizer
+``` -->
+
+
+## 模型转换
+
+使用Paddle Serving做服务化部署时，需要将保存的inference模型转换为serving易于部署的模型。
+
+用已安装的paddle_serving_client将静态图参数模型转换成serving格式。关于如何将训练后的动态图模型转为静态图模型，详见[FasterTransformer加速及模型静态图导出](../../README.md)。
+
+模型转换命令如下：
+```shell
+python -m paddle_serving_client.convert --dirname ./export_checkpoint \
+                                        --model_filename unimo_text.pdmodel \
+                                        --params_filename unimo_text.pdiparams \
+                                        --serving_server ./deploy/paddle_serving/export_checkpoint_server \
+                                        --serving_client ./deploy/paddle_serving/export_checkpoint_client
+```
+关键参数释义如下：
+* `dirname`：静态图模型文件夹地址。
+* `model_filename`：模型文件名。
+* `params_filename`：模型参数名。
+* `serving_server`：server的模型文件和配置文件路径，默认"serving_server"。
+* `serving_client`：client的配置文件路径，默认"serving_client"。
+
+更多参数可通过以下命令查询：
+```shell
+python -m paddle_serving_client.convert --help
+```
+模型转换完成后，会在./deploy/paddle_serving文件夹多出export_checkpoint_server和export_checkpoint_client的文件夹，文件夹目录格式如下：
+```
+export_checkpoint_server/
+├── unimo_text.pdiparams
+├── unimo_text.pdmodel
+├── serving_server_conf.prototxt
+└── serving_server_conf.stream.prototxt
+export_checkpoint_client/
+├── serving_client_conf.prototxt
+└── serving_client_conf.stream.prototxt
+```
+
+## pipeline部署
+
+paddle_serving目录包含启动pipeline服务和发送预测请求的代码，包括：
+```
+paddle_serving/
+├──config.yml        # 启动服务端的配置文件
+├──pipeline_client.py     # 发送pipeline预测请求的脚本
+└──pipeline_service.py        # 启动pipeline服务端的脚本
+```
+
+### 修改配置文件
+目录中的`config.yml`文件解释了每一个参数的含义，可以根据实际需要修改其中的配置。
+
+### server启动服务
+修改好配置文件后，执行下面命令启动服务:
+```shell
+cd deploy/paddle_serving
+# 启动服务，运行日志保存在log.txt
+python pipeline_service.py &> log.txt &
+```
+成功启动服务后，log.txt中会打印类似如下日志
+```
+--- Running analysis [ir_graph_to_program_pass]
+I0901 12:09:27.248943 12190 analysis_predictor.cc:1035] ======= optimize end =======
+I0901 12:09:27.249596 12190 naive_executor.cc:102] ---  skip [feed], feed -> seq_len
+I0901 12:09:27.249608 12190 naive_executor.cc:102] ---  skip [feed], feed -> attention_mask
+I0901 12:09:27.249614 12190 naive_executor.cc:102] ---  skip [feed], feed -> token_type_ids
+I0901 12:09:27.249617 12190 naive_executor.cc:102] ---  skip [feed], feed -> input_ids
+I0901 12:09:27.250080 12190 naive_executor.cc:102] ---  skip [_generated_var_3], fetch -> fetch
+I0901 12:09:27.250090 12190 naive_executor.cc:102] ---  skip [transpose_0.tmp_0], fetch -> fetch
+[2022-09-01 12:09:27,251] [    INFO] - Already cached /root/.paddlenlp/models/unimo-text-1.0/unimo-text-1.0-vocab.txt
+[2022-09-01 12:09:27,269] [    INFO] - tokenizer config file saved in /root/.paddlenlp/models/unimo-text-1.0/tokenizer_config.json
+[2022-09-01 12:09:27,269] [    INFO] - Special tokens file saved in /root/.paddlenlp/models/unimo-text-1.0/special_tokens_map.json
+[PipelineServicer] succ init
+[OP Object] init success
+2022/09/01 12:09:27 start proxy service
+```
+
+### client发送服务请求
+执行以下命令发送问题生成服务请求：
+```shell
+cd deploy/paddle_serving
+python pipeline_client.py
+```
+注意执行客户端请求时关闭代理，并根据实际情况修改server_url地址(启动服务所在的机器)
+
+成功运行后，输出打印如下:
+```
+time cost :0.03429532051086426 seconds
+--------------------
+input:  {'context': '平安银行95511电话按9转报案人工服务。 1.寿险 :95511转1 2.信用卡 95511转2 3.平安银行 95511转3 4.一账通 95511转4转8 5.产险 95511转5 6.养老险团体险 95511转6 7.健康险 95511转7 8.证券 95511转8 9.车险报案95511转9 0.重听', 'answer': '95511'}
+output:  问题：平安银行人工服务电话
+--------------------
+```
@@ -0,0 +1,59 @@
+#rpc端口, rpc_port和http_port不允许同时为空。当rpc_port为空且http_port不为空时，会自动将rpc_port设置为http_port+1
+rpc_port: 18011
+
+#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时，不自动生成http_port
+http_port: 9999
+
+#worker_num, 最大并发数。
+#当build_dag_each_worker=True时, 框架会创建worker_num个进程，每个进程内构建grpcServer和DAG
+#当build_dag_each_worker=False时，框架会设置主线程grpc线程池的max_workers=worker_num
+worker_num: 10
+
+#build_dag_each_worker, False，框架在进程内创建一条DAG；True，框架会每个进程内创建多个独立的DAG
+build_dag_each_worker: false
+
+dag:
+    #op资源类型, True, 为线程模型；False，为进程模型
+    is_thread_op: True
+
+    #重试次数
+    retry: 1
+
+    #使用性能分析, True，生成Timeline性能数据，对性能有一定影响；False为不使用
+    use_profile: false
+    tracer:
+        interval_s: 10
+
+op:
+    question_generation:
+        #并发数，is_thread_op=True时，为线程并发；否则为进程并发
+        concurrency: 11
+
+        #当op配置没有server_endpoints时，从local_service_conf读取本地服务配置
+        local_service_conf:
+            #client类型，包括brpc, grpc和local_predictor。local_predictor不启动Serving服务，进程内预测
+            client_type: local_predictor
+
+            #模型路径
+            model_config: ../../unimo/serving/export_checkpoint_server
+
+            #Fetch结果列表，以client_config中fetch_var的alias_name为准，不设置默认取全部输出变量
+            # fetch_list: ["_generated_var_3", "slice_0.tmp_0"]
+            
+            # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+            device_type: 1
+            
+            #计算硬件ID，当devices为""或不写时为CPU预测；当devices为"0", "0,1,2"时为GPU预测，表示使用的GPU卡
+            devices: "0"
+
+            #开启MKLDNN加速
+            use_mkldnn: False
+
+            #thread_num
+            thread_num: 12
+
+            #ir_optim
+            ir_optim: False
+            
+            #开启tensorrt后，进行优化的子图包含的最少节点数
+            #min_subgraph_size: 10
@@ -0,0 +1,289 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+from functools import partial
+
+import numpy as np
+from numpy import array
+
+import paddle
+import paddle.distributed as dist
+from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler
+from paddlenlp.data import Pad
+
+
+def postprocess_response(token_ids, tokenizer):
+    """Post-process the decoded sequence. Truncate from the first <eos>."""
+    eos_pos = len(token_ids)
+    for i, tok_id in enumerate(token_ids):
+        if tok_id == tokenizer.mask_token_id:
+            eos_pos = i
+            break
+    token_ids = token_ids[:eos_pos]
+    tokens = tokenizer.convert_ids_to_tokens(token_ids)
+    tokens = tokenizer.merge_subword(tokens)
+    return tokens
+
+
+def print_args(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def set_seed(seed):
+    # Use the same data seed(for data shuffle) for all procs to guarantee data
+    # consistency after sharding.
+    random.seed(seed)
+    np.random.seed(seed)
+    # Maybe different op seeds(for dropout) for different procs is better.
+    paddle.seed(seed + dist.get_rank())
+
+
+def convert_example(example,
+                    tokenizer,
+                    max_seq_len=512,
+                    max_target_len=128,
+                    max_title_len=256,
+                    mode='test',
+                    template=0):
+    """Convert one raw example into the features produced by ``gen_encode``.
+
+    Args:
+        example (dict): Raw example.  ``'pretrain'``/``'pretrain_test'`` read
+            ``context``, ``answer`` and ``target``; ``'train'``/``'test'``
+            accept either ``source``/``title`` or ``context``/``answer``
+            (plus an optional ``target``).
+        tokenizer: Tokenizer providing ``gen_encode``, ``sep_token`` and
+            ``cls_token_id``.
+        max_seq_len (int): Forwarded to ``gen_encode``.
+        max_target_len (int): Forwarded to ``gen_encode`` in training modes.
+        max_title_len (int): Forwarded to ``gen_encode``.
+        mode (str): One of ``'pretrain'``, ``'pretrain_test'``, ``'train'``
+            or ``'test'``.
+        template (int): Prompt template used in ``'train'``/``'test'`` mode;
+            values other than 1, 2 and 3 leave source/title/target unchanged.
+
+    Returns:
+        dict: The tokenized example.  Training modes add ``masked_positions``
+        and ``labels``; test modes add the raw ``target`` when the example
+        has a non-empty one.  Any other ``mode`` falls through and returns
+        None.
+    """
+    if mode == 'pretrain' or mode == 'pretrain_test':
+        context = example['context']
+        answer = example['answer']
+        target = example['target']
+
+        # Fixed prompt: answer, separator token, then the context.
+        source = '答案：' + answer + tokenizer.sep_token + '上下文：' + context
+        title = None
+
+    elif mode == 'train' or mode == 'test':
+        target = None
+        # Two input schemas are accepted: source/title or context/answer.
+        if 'source' in example and 'title' in example:
+            source = example['source']
+            title = None
+            if 'title' in example.keys():
+                title = example['title']
+        elif 'context' in example and 'answer' in example:
+            # The answer takes the role of the title.
+            source = example['context']
+            title = None
+            if 'answer' in example.keys():
+                title = example['answer']
+        else:
+            assert False, "Source and title are not in the input dictionary, nor are context and answer."
+        if 'target' in example.keys():
+            target = example['target']
+
+        # Templates 1-3 fold the title into the source text and prefix the
+        # target (when present) with the matching prompt.
+        if template == 1:
+            source = '答案：' + title + tokenizer.sep_token + '上下文：' + source
+            title = None
+            if target:
+                target = '问题：' + target
+        elif template == 2:
+            source = '答案：' + title + tokenizer.sep_token + '上下文：' + source
+            title = None
+            if target:
+                target = '在已知答案的前提下，问题：' + target
+        elif template == 3:
+            source = '这是一个问题生成任务，根据提供的答案和上下文，来生成问题。' + title + tokenizer.sep_token + '上下文：' + source
+            title = None
+            if target:
+                target = '问题：' + target
+
+    if mode == 'train' or mode == 'pretrain':
+        tokenized_example = tokenizer.gen_encode(source,
+                                                 title=title,
+                                                 target=target,
+                                                 max_seq_len=max_seq_len,
+                                                 max_target_len=max_target_len,
+                                                 max_title_len=max_title_len,
+                                                 return_position_ids=True,
+                                                 return_length=True)
+        # Search for the cls token id starting at index 1, i.e. skipping the
+        # first position; the match is taken as the start of the target.
+        target_start = tokenized_example['input_ids'].index(
+            tokenizer.cls_token_id, 1)
+        target_end = tokenized_example['seq_len']
+        # Use to gather the logits corresponding to the labels during training
+        tokenized_example['masked_positions'] = list(
+            range(target_start, target_end - 1))
+        # Labels are the input ids shifted by one relative to masked_positions.
+        tokenized_example['labels'] = tokenized_example['input_ids'][
+            target_start + 1:target_end]
+
+        return tokenized_example
+
+    elif mode == 'test' or mode == 'pretrain_test':
+        tokenized_example = tokenizer.gen_encode(
+            source,
+            title=title,
+            max_seq_len=max_seq_len,
+            max_title_len=max_title_len,
+            add_start_token_for_decoding=True,
+            return_position_ids=True,
+            return_length=True,
+        )
+
+        # Keep the original (un-templated) target for later reference output.
+        if 'target' in example and example['target']:
+            tokenized_example['target'] = example['target']
+        return tokenized_example
+
+
+def batchify_fn(batch_examples, pad_val, mode='test'):
+    """Collate tokenized examples into padded NumPy batches.
+
+    Args:
+        batch_examples (list): Tokenized examples with ``input_ids``,
+            ``token_type_ids``, ``position_ids``, ``attention_mask`` and
+            ``seq_len`` (plus ``masked_positions`` and ``labels`` in the
+            training modes).
+        pad_val: Padding value passed to ``Pad``.
+        mode (str): ``'train'``/``'pretrain'`` or ``'test'``/``'pretrain_test'``.
+
+    Returns:
+        tuple: ``(input_ids, token_type_ids, position_ids, attention_mask,
+        masked_positions, labels)`` in the training modes, or ``(input_ids,
+        token_type_ids, position_ids, attention_mask, seq_len)`` in the test
+        modes.  Any other ``mode`` falls through and returns None.
+    """
+
+    def pad_mask(batch_attention_mask):
+        # Start from a (batch, max_len, max_len) tensor filled with -1e9 and
+        # copy each example's square mask into the bottom-right corner, so
+        # the padded region sits at the top/left.
+        batch_size = len(batch_attention_mask)
+        max_len = max(map(len, batch_attention_mask))
+        attention_mask = np.ones(
+            (batch_size, max_len, max_len), dtype='float32') * -1e9
+        for i, mask_data in enumerate(attention_mask):
+            seq_len = len(batch_attention_mask[i])
+            mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i],
+                                                       dtype='float32')
+        # In order to ensure the correct broadcasting mechanism, expand one
+        # dimension to the second dimension (n_head of Transformer).
+        attention_mask = np.expand_dims(attention_mask, axis=1)
+        return attention_mask
+
+    # ``pad_right=False``: presumably pads on the left, matching the
+    # bottom-right placement used in ``pad_mask`` — confirm in paddlenlp docs.
+    pad_func = Pad(pad_val=pad_val, pad_right=False, dtype='int64')
+
+    input_ids = pad_func([example['input_ids'] for example in batch_examples])
+    token_type_ids = pad_func(
+        [example['token_type_ids'] for example in batch_examples])
+    position_ids = pad_func(
+        [example['position_ids'] for example in batch_examples])
+
+    attention_mask = pad_mask(
+        [example['attention_mask'] for example in batch_examples])
+
+    seq_len = np.asarray([example['seq_len'] for example in batch_examples],
+                         dtype='int32')
+
+    if mode == 'train' or mode == 'pretrain':
+        max_len = max([example['seq_len'] for example in batch_examples])
+        # Shift each example's positions by its padding amount
+        # (max_len - seq_len) and by i * max_len, which looks like an index
+        # into a flattened (batch * max_len) layout — verify against the
+        # consumer of ``masked_positions``.
+        masked_positions = np.concatenate([
+            np.array(example['masked_positions']) +
+            (max_len - example['seq_len']) + i * max_len
+            for i, example in enumerate(batch_examples)
+        ])
+        # Labels of all examples are concatenated into one flat int64 array.
+        labels = np.concatenate([
+            np.array(example['labels'], dtype='int64')
+            for example in batch_examples
+        ])
+        return input_ids, token_type_ids, position_ids, attention_mask, masked_positions, labels
+    elif mode == 'test' or mode == 'pretrain_test':
+        return input_ids, token_type_ids, position_ids, attention_mask, seq_len
+
+
+def create_data_loader(dataset, tokenizer, args, mode='test'):
+    trans_func = partial(convert_example,
+                         tokenizer=tokenizer,
+                         mode='test',
+                         template=1)
+    dataset = dataset.map(trans_func, lazy=True)
+    if mode == 'pretrain':
+        batch_sampler = DistributedBatchSampler(dataset,
+                                                batch_size=args.batch_size,
+                                                shuffle=True)
+    elif mode == 'train':
+        batch_sampler = DistributedBatchSampler(dataset,
+                                                batch_size=args.batch_size,
+                                                shuffle=True)
+    elif mode == 'test' or mode == 'pretrain_test':
+        batch_sampler = BatchSampler(dataset,
+                                     batch_size=args.batch_size // 2,
+                                     shuffle=False)
+    collate_fn = partial(batchify_fn, pad_val=tokenizer.pad_token_id, mode=mode)
+    data_loader = DataLoader(dataset,
+                             batch_sampler=batch_sampler,
+                             collate_fn=collate_fn,
+                             return_list=True)
+    return dataset, data_loader
+
+
+def post_process_sum(token_ids, tokenizer):
+    """Post-process the decoded sequence. Truncate from the first <eos>."""
+    eos_pos = len(token_ids)
+    for i, tok_id in enumerate(token_ids):
+        if tok_id == tokenizer.mask_token_id:
+            eos_pos = i
+            break
+    token_ids = token_ids[:eos_pos]
+    tokens = tokenizer.convert_ids_to_tokens(token_ids)
+    tokens = tokenizer.merge_subword(tokens)
+    special_tokens = ['[UNK]']
+    tokens = [token for token in tokens if token not in special_tokens]
+    return token_ids, tokens
+
+
+def remove_template(instr):
+    """Remove template prefix of decoded sequence."""
+    outstr = instr.strip('问题：')
+    outstr = instr.strip('在已知答案的前提下，问题：')
+    return outstr
+
+
+def select_sum(ids,
+               scores,
+               tokenizer,
+               max_dec_len=None,
+               num_return_sequences=1):
+    results = []
+    group = []
+    tmp = []
+    if scores is not None:
+        ids = ids.numpy()
+        scores = scores.numpy()
+
+        if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0:
+            raise ValueError(
+                "the length of `ids` is {}, but the `num_return_sequences` is {}"
+                .format(len(ids), num_return_sequences))
+
+        for pred, score in zip(ids, scores):
+            pred_token_ids, pred_tokens = post_process_sum(pred, tokenizer)
+            num_token = len(pred_token_ids)
+
+            target = "".join(pred_tokens)
+            target = remove_template(target)
+
+            # not ending
+            if max_dec_len is not None and num_token >= max_dec_len:
+                score -= 1e3
+
+            tmp.append([target, score])
+            if len(tmp) == num_return_sequences:
+                group.append(tmp)
+                tmp = []
+
+        for preds in group:
+            preds = sorted(preds, key=lambda x: -x[1])
+            results.append(preds[0][0])
+    else:
+        ids = ids.numpy()
+
+        for pred in ids:
+            pred_token_ids, pred_tokens = post_process_sum(pred, tokenizer)
+            num_token = len(pred_token_ids)
+            response = "".join(pred_tokens)
+            response = remove_template(response)
+
+            # TODO: Support return scores in FT.
+            tmp.append([response])
+            if len(tmp) == num_return_sequences:
+                group.append(tmp)
+                tmp = []
+
+        for preds in group:
+            results.append(preds[0][0])
+
+    return results
@@ -0,0 +1,54 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_server.pipeline import PipelineClient
+from numpy import array, float32
+import time
+import numpy as np
+
+
+class Runner(object):
+
+    def __init__(
+        self,
+        server_url: str,
+    ):
+        self.client = PipelineClient()
+        self.client.connect([server_url])
+
+    def Run(self, data):
+        inputs = data
+        start_time = time.time()
+        ret = self.client.predict(feed_dict={"inputs": inputs})
+        end_time = time.time()
+        print("time cost :{} seconds".format(end_time - start_time))
+        if not ret.value:
+            print('Fail to fetch summary.')
+        # ret is special class but a dict
+        for d, s in zip(data, eval(ret.value[0])):
+            print("--------------------")
+            print("input: ", d)
+            print("output: ", s)
+            print("--------------------")
+        return
+
+
+if __name__ == "__main__":
+    # Address of the running pipeline service; adjust to the serving host.
+    server_url = "127.0.0.1:18011"
+    runner = Runner(server_url)
+    # Demo request: one context/answer pair for which a question is generated.
+    requests = [{
+        "context":
+        "奇峰黄山千米以上的山峰有77座，整座黄山就是一座花岗岩的峰林，自古有36大峰，36小峰，最高峰莲花峰、最险峰天都峰和观日出的最佳点光明顶构成黄山的三大主峰。",
+        "answer": "莲花峰"
+    }]
+    runner.Run(requests)
@@ -0,0 +1,82 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle_serving_server.web_service import WebService, Op
+from numpy import array
+import logging
+import numpy as np
+from paddlenlp.transformers import AutoTokenizer
+from paddlenlp.ops.ext_utils import load
+from paddlenlp.transformers import UNIMOTokenizer
+from paddlenlp.data import Pad
+
+from infer_utils import convert_example, batchify_fn, select_sum, postprocess_response
+
+import paddle_serving_server.pipeline.operator
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class UnimoTextOp(Op):
+    """Op for unimo_text."""
+
+    def init_op(self):
+        self.tokenizer = UNIMOTokenizer.from_pretrained('unimo-text-1.0')
+
+    def preprocess(self, input_dicts, data_id, log_id):
+        # Convert input format
+        (_, input_dict), = input_dicts.items()
+        data = input_dict["inputs"]
+        if isinstance(data, str) and "array(" in data:
+            data = eval(data)
+        else:
+            _LOGGER.error("input value  {}is not supported.".format(data))
+        examples = [convert_example(i, self.tokenizer) for i in data]
+        input_ids, token_type_ids, position_ids, attention_mask, seq_len = batchify_fn(
+            examples, self.tokenizer.pad_token_id)
+        new_dict = {}
+        new_dict['input_ids'] = input_ids
+        new_dict['token_type_ids'] = token_type_ids
+        new_dict['attention_mask'] = attention_mask
+        new_dict['seq_len'] = seq_len
+        # the first return must be a dict or a list of dict, the dict corresponding to a batch of model input
+        return new_dict, False, None, ""
+
+    def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
+        # keyname refer to export_checkpoint_client/serving_client_conf.prototxt
+        ids = fetch_dict['transpose_0.tmp_0'][:, 0, :].tolist()
+        scores = fetch_dict['_generated_var_3'][:, 0].tolist()
+
+        results = [
+            "".join(postprocess_response(sample, self.tokenizer))
+            for sample in ids
+        ]
+        new_dict = {}
+        new_dict["outputs"] = str(results)
+        # the first return must be a dict or a list of dict, the dict corresponding to a batch of model output
+        return new_dict, None, ""
+
+
+class UnimoTextService(WebService):
+
+    def get_pipeline_response(self, read_op):
+        return UnimoTextOp(name="question_generation", input_ops=[read_op])
+
+
+if __name__ == "__main__":
+    # Load FasterTransformer lib.
+    load("FasterTransformer", verbose=True)
+    service = UnimoTextService(name="question_generation")
+    service.prepare_pipeline_config("config.yml")
+    service.run_service()