+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
import os
import time
- import math
- import argparse
- import json

import paddle
import paddle.distributed as dist
- import paddle.nn as nn
import paddle.nn.functional as F
- from paddlenlp.transformers import LinearDecayWithWarmup
+ from gen_utils import create_data_loader, print_args, select_sum, set_seed
from paddle.optimizer import AdamW

from paddlenlp.datasets import load_dataset
- from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOTokenizer, BasicTokenizer
from paddlenlp.metrics import BLEU
-
- from gen_utils import print_args, set_seed, create_data_loader, select_sum
+ from paddlenlp.transformers import (
+     BasicTokenizer,
+     LinearDecayWithWarmup,
+     UNIMOLMHeadModel,
+     UNIMOTokenizer,
+ )


# yapf: disable
@@ -33,7 +47,7 @@ def parse_args():
    parser.add_argument('--learning_rate', type=float, default=5e-5, help='The initial learning rate.')
    parser.add_argument('--weight_decay', type=float, default=0.01, help='The weight decay for optimizer.')
    parser.add_argument('--epochs', type=int, default=3, help='Total number of training epochs to perform.')
-     parser.add_argument('--warmup_propotion', type=float, default=0.02, help='The number of warmup steps.')
+     parser.add_argument('--warmup_proportion', type=float, default=0.02, help='The proportion of training steps used for warmup.')
    parser.add_argument('--max_grad_norm', type=float, default=1.0, help='The max value of grad norm.')
    parser.add_argument('--beta1', type=float, default=0.9, help='beta1')
    parser.add_argument('--beta2', type=float, default=0.98, help='beta2')
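Note on the renamed flag: `LinearDecayWithWarmup` in `paddlenlp.transformers` interprets a float `warmup` argument as a proportion of the total step count (an int would be an absolute number of warmup steps), so the corrected spelling `--warmup_proportion` matches how the default of `0.02` is actually consumed. A minimal sketch of the schedule, assuming standalone use outside this script:

```python
from paddlenlp.transformers import LinearDecayWithWarmup

total_steps = 10000
# A float warmup (0.02) is read as a proportion of total_steps,
# i.e. int(0.02 * 10000) = 200 linear warmup steps, followed by
# linear decay of the learning rate from 5e-5 down to 0.
lr_scheduler = LinearDecayWithWarmup(5e-5, total_steps, 0.02)

for step in range(total_steps):
    # ... forward/backward/optimizer.step() would go here ...
    lr_scheduler.step()  # advance the schedule once per optimizer step
```

Anyone launching the script with the old spelling `--warmup_propotion` will now get an argparse error, so downstream shell scripts need the same rename.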
@@ -112,7 +126,7 @@ def run(args):
    if args.do_train:
        num_training_steps = args.epochs * len(train_data_loader)

-         lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_propotion)
+         lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
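The two trailing comments refer to the usual PaddleNLP pattern of applying weight decay only to non-bias, non-LayerNorm parameters. A hedged sketch of how that setup typically continues, assuming `model`, `args`, and `lr_scheduler` from the surrounding `run()`; the exact lines in this file may differ:

```python
# Collect names of parameters that should receive weight decay;
# bias and LayerNorm ("norm") parameters are excluded.
decay_params = [
    p.name
    for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=args.weight_decay,
    beta1=args.beta1,
    beta2=args.beta2,
    apply_decay_param_fun=lambda x: x in decay_params,  # decay only selected params
    grad_clip=paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm),
)
```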