
Commit b1428e6

add Custom NPU support for several models (PaddlePaddle#4956)
* add Custom NPU support for several models
* modify format
* revert pretrain model of ernie-matching
1 parent d2c8174 commit b1428e6

33 files changed: +213 -177 lines


examples/information_extraction/msra_ner/README.md

+1-1
@@ -45,7 +45,7 @@ python -u ./train.py \
 - `logging_steps`: the logging interval, in steps.
 - `save_steps`: the interval, in steps, between model saves and evaluations.
 - `output_dir`: the directory where the model is saved.
-- `device`: the device used for training; 'gpu' uses the GPU, 'xpu' uses a Baidu Kunlun card, 'cpu' uses the CPU.
+- `device`: the device used for training; 'gpu' uses the GPU, 'xpu' uses a Baidu Kunlun card, 'cpu' uses the CPU, 'npu' uses a Huawei Ascend card.
 
 #### Multi-card training
 
 ```shell
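
Across the scripts touched by this commit, the `device` flag documented above follows one pattern: an argparse choice list (now including `npu`) whose value is passed to `paddle.set_device` before any model or data is created. A minimal sketch of that pattern, with illustrative names rather than code copied from the repository:

```python
import argparse

import paddle


def parse_args():
    parser = argparse.ArgumentParser()
    # "npu" is the choice this commit adds; it targets the Huawei Ascend
    # CustomDevice backend when that plugin is installed.
    parser.add_argument(
        "--device",
        default="gpu",
        type=str,
        choices=["cpu", "gpu", "xpu", "npu"],
        help="Device to run on: cpu, gpu, xpu or npu.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    # Pin the global device before building the model, optimizer, or data loaders.
    paddle.set_device(args.device)
    print("running on", paddle.device.get_device())
```

Running the sketch with `--device npu` only works when the Ascend CustomDevice package for Paddle is installed; otherwise `paddle.set_device` is expected to fail.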

examples/information_extraction/msra_ner/predict.py

+3-11
@@ -13,18 +13,11 @@
 # limitations under the License.
 
 import argparse
-import os
-import ast
-import random
-import time
-import math
-from functools import partial
-
-import numpy as np
+
 import paddle
+from datasets import load_dataset
 from paddle.io import DataLoader
 
-from datasets import load_dataset
 from paddlenlp.data import DataCollatorForTokenClassification
 from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
 
@@ -35,7 +28,7 @@
 parser.add_argument("--init_checkpoint_path", default=None, type=str, required=True, help="The model checkpoint path.", )
 parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", )
 parser.add_argument("--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.", )
-parser.add_argument("--device", default="gpu", type=str, choices=["cpu", "gpu", "xpu"] ,help="The device to select to train the model, is must be cpu/gpu/xpu.")
+parser.add_argument("--device", default="gpu", type=str, choices=["cpu", "gpu", "xpu", "npu"] , help="The device to select to train the model, is must be cpu/gpu/xpu/npu.")
 # yapf: enable
 
 
@@ -100,7 +93,6 @@ def tokenize_and_align_labels(examples):
         tokenized_inputs["labels"] = labels
         return tokenized_inputs
 
-    ignore_label = -100
     batchify_fn = DataCollatorForTokenClassification(tokenizer)
 
     id2label = dict(enumerate(label_list))

examples/information_extraction/msra_ner/train.py

+14-14
@@ -14,22 +14,23 @@
 
 import argparse
 import os
-import random
 import time
-import math
-from functools import partial
 
-import numpy as np
 import paddle
+from datasets import load_dataset
 from paddle.io import DataLoader
 
-from paddlenlp.transformers import LinearDecayWithWarmup
-from paddlenlp.metrics import ChunkEvaluator
-from datasets import load_dataset
-from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
-from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer
-from paddlenlp.transformers import ErnieCtmForTokenClassification, ErnieCtmTokenizer
 from paddlenlp.data import DataCollatorForTokenClassification
+from paddlenlp.metrics import ChunkEvaluator
+from paddlenlp.transformers import (
+    BertForTokenClassification,
+    BertTokenizer,
+    ErnieCtmForTokenClassification,
+    ErnieCtmTokenizer,
+    ErnieForTokenClassification,
+    ErnieTokenizer,
+    LinearDecayWithWarmup,
+)
 from paddlenlp.utils.log import logger
 
 MODEL_CLASSES = {
@@ -42,8 +43,8 @@
 
 # yapf: disable
 parser.add_argument("--model_type", default="bert", type=str, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), )
-parser.add_argument("--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join( sum([ list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values() ], [])), )
-parser.add_argument("--dataset", default="msra_ner", type=str, choices=["msra_ner", "peoples_daily_ner"] ,help="The named entity recognition datasets.")
+parser.add_argument("--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(sum([list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], [])), )
+parser.add_argument("--dataset", default="msra_ner", type=str, choices=["msra_ner", "peoples_daily_ner"] , help="The named entity recognition datasets.")
 parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.")
 parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
 parser.add_argument("--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
@@ -57,7 +58,7 @@
 parser.add_argument("--logging_steps", type=int, default=1, help="Log every X updates steps.")
 parser.add_argument("--save_steps", type=int, default=100, help="Save checkpoint every X updates steps.")
 parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-parser.add_argument("--device", default="gpu", type=str, choices=["cpu", "gpu", "xpu"] ,help="The device to select to train the model, is must be cpu/gpu/xpu.")
+parser.add_argument("--device", default="gpu", type=str, choices=["cpu", "gpu", "xpu", "npu"] , help="The device to select to train the model, is must be cpu/gpu/xpu/npu.")
 # yapf: enable
 
 
@@ -179,7 +180,6 @@ def tokenize_and_align_labels(examples):
     metric = ChunkEvaluator(label_list=label_list)
 
     global_step = 0
-    last_step = args.num_train_epochs * len(train_data_loader)
     tic_train = time.time()
     for epoch in range(args.num_train_epochs):
         for step, batch in enumerate(train_data_loader):
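
The regrouped `paddlenlp.transformers` imports above feed the `MODEL_CLASSES` registry whose opening line appears as context: `--model_type` picks a (model class, tokenizer class) pair, and the `--model_name_or_path` help text is assembled from each tokenizer's `pretrained_init_configuration`. A rough sketch of that registry pattern; the entries below are illustrative and may not match the script exactly:

```python
from paddlenlp.transformers import (
    BertForTokenClassification,
    BertTokenizer,
    ErnieForTokenClassification,
    ErnieTokenizer,
)

# --model_type selects one entry; by convention the last element is the tokenizer.
MODEL_CLASSES = {
    "bert": (BertForTokenClassification, BertTokenizer),
    "ernie": (ErnieForTokenClassification, ErnieTokenizer),
}

# Mirrors how the --model_name_or_path help string is built in the parser above:
# every pretrained shortcut name known to each tokenizer class, flattened into one list.
shortcut_names = sum(
    [list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()],
    [],
)
print(", ".join(shortcut_names))
```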

examples/language_model/bigbird/README.md

+1-1
@@ -127,7 +127,7 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \
 - `logging_steps` is the logging interval, in steps.
 - `save_steps` is the interval, in steps, between model saves and evaluations.
 - `output_dir` is the directory where the model is saved.
-- `device` is the device used for training; 'gpu' uses the GPU, 'xpu' uses a Baidu Kunlun card, 'cpu' uses the CPU.
+- `device` is the device used for training; 'gpu' uses the GPU, 'xpu' uses a Baidu Kunlun card, 'cpu' uses the CPU, 'npu' uses a Huawei Ascend card.
 
 After fine-tuning `bigbird-base-uncased` on the GLUE tasks, the results on the validation sets are as follows:
 
examples/language_model/bigbird/args.py

+2-2
@@ -69,8 +69,8 @@ def parse_args():
         "--device",
         type=str,
         default="gpu",
-        choices=["cpu", "gpu"],
-        help="Select cpu, gpu, xpu devices to train model.",
+        choices=["cpu", "gpu", "npu"],
+        help="Select cpu, gpu, xpu, npu devices to train model.",
     )
 
     parser.add_argument("--epochs", type=int, default=10, help="Number of epoches for training.")

examples/language_model/bigbird/run_glue.py

+16-15
@@ -12,32 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import argparse
-import logging
 import os
-import sys
 import random
 import time
-import math
-import distutils.util
 from functools import partial
 
+import args
 import numpy as np
 import paddle
 from paddle.io import DataLoader
-from paddle.metric import Metric, Accuracy, Precision, Recall
+from paddle.metric import Accuracy
 
+from paddlenlp.data import Stack
 from paddlenlp.datasets import load_dataset
-from paddlenlp.data import Stack, Tuple, Pad, Dict
-from paddlenlp.data.sampler import SamplerHelper
-from paddlenlp.transformers import BigBirdModel, BigBirdForSequenceClassification, BigBirdTokenizer
-from paddlenlp.transformers import create_bigbird_rand_mask_idx_list
-from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
+from paddlenlp.transformers import (
+    BigBirdForSequenceClassification,
+    BigBirdTokenizer,
+    LinearDecayWithWarmup,
+    create_bigbird_rand_mask_idx_list,
+)
 from paddlenlp.utils.log import logger
 
-import args
-
 METRIC_CLASSES = {
     "cola": Mcc,
     "sst-2": Accuracy,
@@ -190,7 +186,7 @@ def do_train(args):
     train_ds = load_dataset("glue", args.task_name, splits="train")
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
 
-    num_classes = 1 if train_ds.label_list == None else len(train_ds.label_list)
+    num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list)
     # In finetune task, bigbird performs better when setting dropout to zero.
     model = model_class.from_pretrained(
         args.model_name_or_path, num_classes=num_classes, attn_dropout=0.0, hidden_dropout_prob=0.0
@@ -327,5 +323,10 @@ def print_arguments(args):
 if __name__ == "__main__":
     args = args.parse_args()
     print_arguments(args)
-    assert args.device in ["cpu", "gpu", "xpu"], "Invalid device! Available device should be cpu, gpu, or xpu."
+    assert args.device in [
+        "cpu",
+        "gpu",
+        "xpu",
+        "npu",
+    ], "Invalid device! Available device should be cpu, gpu, xpu or npu."
     do_train(args)

examples/language_model/convbert/README.md

+1-1
@@ -55,7 +55,7 @@ python -u examples/language_model/convbert/run_glue.py \
 - `logging_steps` is the logging interval, in steps.
 - `save_steps` is the interval, in steps, between model saves and evaluations.
 - `output_dir` is the directory where the model is saved.
-- `device` is the device type to use. Defaults to GPU; it can be set to CPU, GPU, or XPU. For multi-GPU training, set it to GPU and list the GPU ids to use in the CUDA_VISIBLE_DEVICES environment variable.
+- `device` is the device type to use. Defaults to GPU; it can be set to CPU, GPU, XPU, or NPU. For multi-GPU training, set it to GPU and list the GPU ids to use in the CUDA_VISIBLE_DEVICES environment variable.
 
 Fine-tuning prints logs in the following format, at the intervals set by `logging_steps` and `save_steps`:
 
examples/language_model/convbert/run_glue.py

+15-9
@@ -24,14 +24,20 @@
 from paddle.io import DataLoader
 from paddle.metric import Accuracy
 
+from paddlenlp.data import Pad, Stack, Tuple
 from paddlenlp.datasets import load_dataset
-from paddlenlp.data import Stack, Tuple, Pad
-from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
-from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
-from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
-from paddlenlp.transformers import ConvBertForSequenceClassification, ConvBertTokenizer
-from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
+from paddlenlp.transformers import (
+    BertForSequenceClassification,
+    BertTokenizer,
+    ConvBertForSequenceClassification,
+    ConvBertTokenizer,
+    ElectraForSequenceClassification,
+    ElectraTokenizer,
+    ErnieForSequenceClassification,
+    ErnieTokenizer,
+    LinearDecayWithWarmup,
+)
 
 FORMAT = "%(asctime)s-%(levelname)s: %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -135,8 +141,8 @@ def parse_args():
         "--device",
         default="gpu",
         type=str,
-        choices=["cpu", "gpu"],
-        help="The device to select to train the model, is must be cpu/gpu.",
+        choices=["cpu", "gpu", "npu"],
+        help="The device to select to train the model, is must be cpu/gpu/npu.",
     )
     args = parser.parse_args()
     return args
@@ -270,7 +276,7 @@ def do_train(args):
         dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True
     )
 
-    num_classes = 1 if train_ds.label_list == None else len(train_ds.label_list)
+    num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list)
     model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_classes)
     if paddle.distributed.get_world_size() > 1:
         model = paddle.DataParallel(model)
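
Alongside the new `npu` choice, the commit replaces `== None` with the idiomatic `is None` in the `num_classes` line above. The expression exists because some GLUE tasks are regression tasks (STS-B, for example) where `label_list` is `None` and the model needs a single output, while classification tasks need one output per label. A standalone illustration of the same expression, using hypothetical values instead of a real dataset:

```python
def num_output_classes(label_list):
    # "is None" is an identity check and the idiomatic way to test for None;
    # "== None" calls __eq__ and can behave oddly for objects that override it.
    return 1 if label_list is None else len(label_list)


print(num_output_classes(None))        # 1 -> single-output regression head
print(num_output_classes(["0", "1"]))  # 2 -> binary classification head
```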

examples/language_model/roformer/README.md

+2-2
@@ -55,7 +55,7 @@ python -m paddle.distributed.launch --gpus "0" examples/language_model/roformer/
 - `logging_steps` is the logging interval, in steps.
 - `save_steps` is the interval, in steps, between model saves and evaluations.
 - `output_dir` is the directory where the model is saved.
-- `device` is the device used for training; 'gpu' uses the GPU, 'xpu' uses a Baidu Kunlun card, 'cpu' uses the CPU.
+- `device` is the device used for training; 'gpu' uses the GPU, 'xpu' uses a Baidu Kunlun card, 'cpu' uses the CPU, 'npu' uses a Huawei Ascend card.
 - `use_amp` indicates whether to enable automatic mixed precision training.
 
 After fine-tuning `roformer-chinese-base` on the THUCNews classification task, the results on the validation set are as follows:
@@ -100,7 +100,7 @@ python -m paddle.distributed.launch --gpus "0" examples/language_model/roformer/
 - `logging_steps` is the logging interval, in steps.
 - `save_steps` is the interval, in steps, between model saves and evaluations.
 - `output_dir` is the directory where the model is saved.
-- `device` is the device used for training; 'gpu' uses the GPU, 'xpu' uses a Baidu Kunlun card, 'cpu' uses the CPU.
+- `device` is the device used for training; 'gpu' uses the GPU, 'xpu' uses a Baidu Kunlun card, 'cpu' uses the CPU, 'npu' uses a Huawei Ascend card.
 - `use_amp` indicates whether to enable automatic mixed precision training.
 
 After fine-tuning `roformer-chinese-base` on the Cail2019_Scm task, the results are as follows:

examples/language_model/roformer/run_cail2019_scm.py

+2-2
@@ -148,8 +148,8 @@ def parse_args():
         "--device",
         default="gpu",
         type=str,
-        choices=["cpu", "gpu", "xpu"],
-        help="The device to select to train the model, is must be cpu/gpu/xpu.",
+        choices=["cpu", "gpu", "xpu", "npu"],
+        help="The device to select to train the model, is must be cpu/gpu/xpu/npu.",
     )
     parser.add_argument(
         "--use_amp",

examples/language_model/roformer/run_thucnews.py

+2-2
@@ -114,8 +114,8 @@ def parse_args():
         "--device",
         default="gpu",
         type=str,
-        choices=["cpu", "gpu", "xpu"],
-        help="The device to select to train the model, is must be cpu/gpu/xpu.",
+        choices=["cpu", "gpu", "xpu", "npu"],
+        help="The device to select to train the model, is must be cpu/gpu/xpu/npu.",
     )
     parser.add_argument("--use_amp", type=strtobool, default=False, help="Enable mixed precision training.")
     parser.add_argument("--scale_loss", type=float, default=2**15, help="The value of scale_loss for fp16.")

examples/language_model/t5/README.md

+3-1
@@ -31,7 +31,8 @@ python run_glue.py \
     --save_steps 100 \
     --seed 42 \
     --scheduler_type linear \
-    --output_dir outputs/rte/
+    --output_dir outputs/rte/ \
+    --device gpu
 ```
 
 The parameters are described as follows:
@@ -49,6 +50,7 @@ python run_glue.py \
 - `seed` is the random seed.
 - `scheduler_type` is the LR scheduler type; linear and cosine are supported, defaulting to linear.
 - `output_dir` is the directory where the model is saved.
+- `device` is the device used for training; cpu, gpu, or npu can be selected.
 
 Fine-tuning with the Trainer:
 
examples/language_model/t5/run_glue.py

+16-4
@@ -12,20 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
 import logging
 import math
 import os
 
 import paddle
+from data import (
+    GLUE_PROCESSED,
+    get_dev_dataloader,
+    get_mnli_dev_dataloader,
+    get_train_dataloader,
+)
 from paddle.amp import GradScaler, auto_cast
 from paddle.optimizer import AdamW
-from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer
 from tqdm import tqdm
-
-from data import get_dev_dataloader, get_train_dataloader, get_mnli_dev_dataloader, GLUE_PROCESSED
 from utils import GLUE_METRICS, get_scheduler, get_writer, set_seed
 
-import argparse
+from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer
 
 
 def parse_args():
@@ -139,6 +143,13 @@ def parse_args():
         help="num_workers.",
     )
     parser.add_argument("--is_test", action="store_true", help="is_test.")
+    parser.add_argument(
+        "--device",
+        default="gpu",
+        type=str,
+        choices=["gpu", "cpu", "npu"],
+        help="The device to select to train the model, is must be cpu/gpu/npu.",
+    )
     args = parser.parse_args()
     args.task_name = args.task_name.lower()
     args.logdir = os.path.join(args.output_dir, "logs")
@@ -197,6 +208,7 @@ def evaluate(model, data_loader, tokenizer, label2id, metric_list, generate_max_
 
 
 def main(args):
+    paddle.set_device(args.device)
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
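
The last hunk is the one functional change in this file: `main` now calls `paddle.set_device(args.device)` before anything else, so the model, optimizer, and data loaders built later inherit the selected device. A minimal sketch of that ordering, with placeholder layers rather than the script's T5 setup:

```python
import paddle


def main(device="gpu"):
    # Select the device first; tensors and layers created afterwards follow it.
    paddle.set_device(device)

    model = paddle.nn.Linear(8, 8)  # placeholder for T5ForConditionalGeneration
    optimizer = paddle.optimizer.AdamW(parameters=model.parameters())

    x = paddle.ones([2, 8])  # allocated on the selected device
    print(paddle.device.get_device(), x.place)
    return model, optimizer


if __name__ == "__main__":
    main(device="cpu")  # pass "gpu" or "npu" when that backend is available
```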

examples/machine_reading_comprehension/DuReader-robust/args.py

+18-1
@@ -1,3 +1,17 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 
 
@@ -47,7 +61,10 @@ def parse_args():
     parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
     parser.add_argument(
-        "--device", choices=["cpu", "gpu"], default="gpu", help="Select which device to train model, defaults to gpu."
+        "--device",
+        choices=["cpu", "gpu", "npu"],
+        default="gpu",
+        help="Select which device to train model, defaults to gpu.",
     )
     parser.add_argument(
         "--doc_stride",
