+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
import os
import time
- import math
- import argparse
- import json

import paddle
import paddle.distributed as dist
- import paddle.nn as nn
import paddle.nn.functional as F
- from paddlenlp.transformers import LinearDecayWithWarmup
+ from gen_utils import create_data_loader, print_args, select_sum, set_seed
from paddle.optimizer import AdamW

from paddlenlp.datasets import load_dataset
- from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOTokenizer, BasicTokenizer
from paddlenlp.metrics import BLEU
-
- from gen_utils import print_args, set_seed, create_data_loader, select_sum
+ from paddlenlp.transformers import (
+     BasicTokenizer,
+     LinearDecayWithWarmup,
+     UNIMOLMHeadModel,
+     UNIMOTokenizer,
+ )


# yapf: disable
@@ -33,7 +47,7 @@ def parse_args():
    parser.add_argument('--learning_rate', type=float, default=5e-5, help='The initial learning rate.')
    parser.add_argument('--weight_decay', type=float, default=0.01, help='The weight decay for optimizer.')
    parser.add_argument('--epochs', type=int, default=3, help='Total number of training epochs to perform.')
-     parser.add_argument('--warmup_propotion', type=float, default=0.02, help='The number of warmup steps.')
+     parser.add_argument('--warmup_proportion', type=float, default=0.02, help='The proportion of training steps used for warmup.')
    parser.add_argument('--max_grad_norm', type=float, default=1.0, help='The max value of grad norm.')
    parser.add_argument('--beta1', type=float, default=0.9, help='beta1')
    parser.add_argument('--beta2', type=float, default=0.98, help='beta2')
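Note on the renamed flag: `LinearDecayWithWarmup` in `paddlenlp.transformers` interprets a float `warmup` argument as a proportion of the total step count (an int would be an absolute number of warmup steps), so the corrected spelling `--warmup_proportion` matches how the default of `0.02` is actually consumed. A minimal sketch of the schedule, assuming standalone use outside this script:

```python
from paddlenlp.transformers import LinearDecayWithWarmup

total_steps = 10000
# A float warmup (0.02) is read as a proportion of total_steps,
# i.e. int(0.02 * 10000) = 200 linear warmup steps, followed by
# linear decay of the learning rate from 5e-5 down to 0.
lr_scheduler = LinearDecayWithWarmup(5e-5, total_steps, 0.02)

for step in range(total_steps):
    # ... forward/backward/optimizer.step() would go here ...
    lr_scheduler.step()  # advance the schedule once per optimizer step
```

Anyone launching the script with the old spelling `--warmup_propotion` will now get an argparse error, so downstream shell scripts need the same rename.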
@@ -112,7 +126,7 @@ def run(args):
    if args.do_train:
        num_training_steps = args.epochs * len(train_data_loader)

-         lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_propotion)
+         lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
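The two trailing comments refer to the usual PaddleNLP pattern of applying weight decay only to non-bias, non-LayerNorm parameters. A hedged sketch of how that setup typically continues, assuming `model`, `args`, and `lr_scheduler` from the surrounding `run()`; the exact lines in this file may differ:

```python
# Collect names of parameters that should receive weight decay;
# bias and LayerNorm ("norm") parameters are excluded.
decay_params = [
    p.name
    for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=args.weight_decay,
    beta1=args.beta1,
    beta2=args.beta2,
    apply_decay_param_fun=lambda x: x in decay_params,  # decay only selected params
    grad_clip=paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm),
)
```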