Skip to content

Commit 94dd90a

Browse files
authored
[Pre-Training] ERNIE-CW pre-training tasks docs. (PaddlePaddle#3111)
* add ernie-large config * update * update clue finetune. * unused delete. * update * support no nsp for enrie. * fix evaluation * fix amp o2 save_dtype bugs. * extand ernie. * fix ernie pretrain with ## vocab. * extend vocab * support custom tokenizer. * add some comments. * fix bugs. * add comments. * fix bug. * fix run_pretrain_static logging. * fix all gather. * fix a100 * fix * fix bugs * fix save * tmp commit for pre-process. * Update README.md * Update README.md * add amp o1 support * ernie cw readme. * fix * throw error when dataset is invalid. * update document. * refine readme. * fix * refactor * refator2 * Add pre-training introduction. * update image width. * refine doc * fit table width. * fix c++ style * fix table * refine docs * refine model_zoo/ernie-1.0/README.md * readfine readme. * fix link * fix bug * fix documents. * add weight. * fix config
1 parent 6b59ba2 commit 94dd90a

28 files changed

+2230
-172
lines changed

.copyright.hook

+134
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import absolute_import
16+
from __future__ import print_function
17+
from __future__ import unicode_literals
18+
19+
import argparse
20+
import io
21+
import re
22+
import sys
23+
import os
24+
import datetime
25+
26+
# License text inserted at the top of source files. The four-digit year on the
# first line is a placeholder; _generate_copyright() substitutes the current
# year when it builds the header.
COPYRIGHT = '''Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.'''


def _generate_copyright(comment_mark):
    """Build the license header as a list of commented-out lines.

    Args:
        comment_mark: Line-comment prefix of the target language,
            for example "#" or "//".

    Returns:
        A list with one string per line of COPYRIGHT. Every entry starts
        with ``comment_mark`` and ends with a single line feed. The first
        four-digit number on the first line is replaced by the current year.
    """
    # The COPYRIGHT literal is always separated by line feeds, so it must not
    # be split on os.linesep: where that is CR LF the split would return the
    # whole text as one element.
    template = COPYRIGHT.splitlines()

    year = str(datetime.datetime.now().year)
    template[0] = re.sub(r"\d{4}", year, template[0], count=1)

    # Strip after prefixing so that blank template lines become a bare comment
    # mark instead of a mark followed by trailing whitespace. Lines end with a
    # plain line feed because the result is written through a text-mode file,
    # which performs the platform newline translation itself.
    return [(comment_mark + " " + text).rstrip() + "\n" for text in template]
54+
55+
def _get_comment_mark(path):
    """Return the line-comment prefix used by the file at ``path``.

    Only the file extension is examined, and the comparison is
    case-sensitive.

    Args:
        path: File name or path to classify.

    Returns:
        "#" for Python and shell files, "//" for C, C++, CUDA, Go and
        protobuf files, or None when the extension is not recognised.
    """
    # str.endswith accepts a tuple, which avoids compiling a regular
    # expression on every call.
    if path.endswith((".py", ".sh")):
        return "#"

    # ".cxx" and ".hxx" are accepted because the hook's `files` pattern in
    # .pre-commit-config.yaml passes them to this script as well.
    # NOTE(review): that pattern also lists ".xpu" and ".kps"; they are left
    # unsupported here until their comment syntax is confirmed.
    slash_suffixes = (
        ".h", ".c", ".hpp", ".hxx", ".cc", ".cpp", ".cxx",
        ".cu", ".go", ".cuh", ".proto",
    )
    if path.endswith(slash_suffixes):
        return "//"

    return None
65+
66+
67+
# Source-encoding declaration such as "# -*- coding: utf-8 -*-".
RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE)
# Any line carrying "Copyright (c) <four digits>", in any letter case.
RE_COPYRIGHT = re.compile(r".*Copyright \(c\) \d{4}", re.IGNORECASE)
# Interpreter line such as "#!/usr/bin/env python".
RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!")


def _check_copyright(path):
    """Report whether one of the first four lines of a file has a copyright.

    Args:
        path: File to inspect.

    Returns:
        True if RE_COPYRIGHT matches any of the first four lines, otherwise
        False. Files shorter than four lines are checked line by line up to
        their end.
    """
    # Read line by line instead of collecting next(f) four times in a list
    # comprehension: there a StopIteration on a short file aborted the whole
    # comprehension, so nothing was inspected and existing headers in short
    # files went undetected.
    # Undecodable bytes are replaced rather than raised, because the pattern
    # being searched for is plain ASCII.
    with io.open(path, encoding="utf-8", errors="replace") as f:
        for _ in range(4):
            line = f.readline()
            if not line:
                break
            if RE_COPYRIGHT.search(line) is not None:
                return True

    return False
84+
85+
def generate_copyright(path, comment_mark):
    """Insert the license header into the file at ``path``, in place.

    The header is placed at the very top of the file, unless one of the
    first four lines is a shebang or an encoding declaration. In that case
    everything up to and including the last such line stays on top, followed
    by a blank line and then the header. A blank line is also added after
    the header when the text that follows does not already start with one.

    Args:
        path: File to rewrite. It is read and written as UTF-8.
        comment_mark: Line-comment prefix passed on to _generate_copyright().
    """
    # Use a context manager so the read handle is closed before the same
    # path is reopened for writing.
    with io.open(path, encoding="utf-8") as source_file:
        original_contents = source_file.readlines()

    insert_line_no = 0
    for i, line in enumerate(original_contents[:4]):
        if RE_ENCODE.search(line) or RE_SHEBANG.search(line):
            insert_line_no = i + 1

    preamble = original_contents[:insert_line_no]
    remainder = original_contents[insert_line_no:]

    new_contents = list(preamble)
    if preamble:
        # A final preamble line without a terminator (file ends right after
        # the shebang) needs one, or the separator below would only finish
        # that line instead of producing a blank line.
        if not preamble[-1].endswith("\n"):
            new_contents.append("\n")
        new_contents.append("\n")
    new_contents.extend(_generate_copyright(comment_mark))
    if remainder and remainder[0].strip():
        new_contents.append("\n")
    new_contents.extend(remainder)

    # Separators are plain line feeds: the text-mode writer translates them
    # to the platform convention, so writing os.linesep here would double the
    # carriage return on platforms where it is CR LF. The encoding matches
    # the one used for reading.
    with io.open(path, "w", encoding="utf-8") as output_file:
        output_file.write("".join(new_contents))
111+
112+
113+
114+
def main(argv=None):
    """Add the license header to every given file that lacks one.

    Files whose extension _get_comment_mark() does not recognise are
    reported on stderr and skipped. Files for which _check_copyright()
    already finds a copyright line are left untouched.

    Args:
        argv: Argument list to parse; None makes argparse read sys.argv.

    Returns:
        0 in all cases. This matches the previous behaviour, where the
        script exited with status 0 whether or not a header was inserted;
        the value is now returned explicitly instead of being held in an
        unused variable.
    """
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    for path in args.filenames:
        comment_mark = _get_comment_mark(path)
        if comment_mark is None:
            print("warning:Unsupported file", path, file=sys.stderr)
            continue

        if not _check_copyright(path):
            generate_copyright(path, comment_mark)

    return 0
131+
132+
133+
if __name__ == '__main__':
    # sys.exit rather than the bare exit() helper: the latter is installed by
    # the site module for interactive use and is missing when the interpreter
    # runs without site (for example with the -S option).
    sys.exit(main())

.pre-commit-config.yaml

+7
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,10 @@ repos:
2626
files: \.md$
2727
- id: remove-tabs
2828
files: \.md$
29+
- repo: local
30+
hooks:
31+
- id: copyright_checker
32+
name: copyright_checker
33+
entry: python .copyright.hook
34+
language: system
35+
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$

docs/FAQ.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ emb.set_state_dict(load_layer_state_dict) # 加载模型参数
182182

183183
**A:** 预训练模型通常会有配套的tokenizer和词典,对于大多数中文预训练模型,如ERNIE-3.0,使用的都是字粒度的输入,tokenizer会将句子转换为字粒度的形式,模型无法收到词粒度的输入。如果希望引入额外的词典,需要修改预训练模型的tokenizer和词典,可以参考这里[blog](https://kexue.fm/archives/7758/comment-page-1#Tokenizer),另外注意embedding矩阵也要加上这些新增词的embedding表示。
184184

185-
另外还有一种方式可以使用这些字典信息,可以将数据中在词典信息中的词进行整体mask进行一个mask language model的二次预训练,这样经过二次训练的模型就包含了对额外字典的表征。可参考 [PaddleNLP 预训练数据流程](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-1.0/data_tools)。
185+
另外还有一种方式可以使用这些字典信息,可以将数据中在词典信息中的词进行整体mask进行一个mask language model的二次预训练,这样经过二次训练的模型就包含了对额外字典的表征。可参考 [PaddleNLP 预训练数据流程](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-1.0/)。
186186

187187

188188
此外还有些词粒度及字词混合粒度的预训练模型,在这些词粒度的模型下引入额外的词表也会容易些,我们也将持续丰富PaddleNLP中的预训练模型。

docs/model_zoo/transformers/ERNIE/contents.rst

+12
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ ERNIE模型汇总
1616
| | | 12-heads, 108M parameters. |
1717
| | | Trained on Chinese text. |
1818
+----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
19+
|``ernie-1.0-base-zh-cw`` | Chinese | 12-layer, 768-hidden, |
20+
| | | 12-heads, 118M parameters. |
21+
| | | Trained on Chinese text. |
22+
+----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
23+
|``ernie-1.0-large-zh-cw`` | Chinese | 24-layer, 1024-hidden, |
24+
| | | 16-heads, 272M parameters. |
25+
| | | Trained on Chinese text. |
26+
+----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
1927
|``ernie-tiny`` | Chinese | 3-layer, 1024-hidden, |
2028
| | | 16-heads, _M parameters. |
2129
| | | Trained on Chinese text. |
@@ -32,6 +40,10 @@ ERNIE模型汇总
3240
| | | 16-heads, 336M parameters. |
3341
| | | Trained on lower-cased English text. |
3442
+----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
43+
|``ernie-3.0-xbase-zh`` | Chinese | 20-layer, 1024-hidden, |
44+
| | | 16-heads, 296M parameters. |
45+
| | | Trained on Chinese text. |
46+
+----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
3547
|``ernie-3.0-base-zh`` | Chinese | 12-layer, 768-hidden, |
3648
| | | 12-heads, 118M parameters. |
3749
| | | Trained on Chinese text. |

examples/benchmark/clue/README.md

+66-28
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,51 @@
6767
<td style="text-align:center;">
6868
<span style="font-size:18px;">C<sup>3</sup></span>
6969
</td>
70+
</tr> <tr>
71+
<td rowspan=3 align=center> 24L1024H </td>
72+
<td style="text-align:center">
73+
<span style="font-size:18px">ERNIE 1.0-Large-zh-CW</span>
74+
</td>
75+
<td style="text-align:center">
76+
<span style="font-size:18px"><b>79.03</b></span>
77+
</td>
78+
<td style="text-align:center">
79+
<span style="font-size:18px">75.97</span>
80+
</td>
81+
<td style="text-align:center">
82+
<span style="font-size:18px">59.65</span>
83+
</td>
84+
<td style="text-align:center">
85+
<span style="font-size:18px"><b>62.91</b></span>
86+
</td>
87+
<td style="text-align:center">
88+
<span style="font-size:18px"><b>85.09</b></span>
89+
</td>
90+
<td style="text-align:center">
91+
<span style="font-size:18px"><b>81.73</b></span>
92+
</td>
93+
<td style="text-align:center">
94+
<span style="font-size:18px"><b>93.09</b></span>
95+
</td>
96+
<td style="text-align:center">
97+
<span style="font-size:18px"><b>84.53</b></span>
98+
</td>
99+
<td style="text-align:center">
100+
<span style="font-size:18px"><b>74.22/91.88</b></span>
101+
</td>
102+
<td style="text-align:center">
103+
<span style="font-size:18px"><b>88.57</b></span>
104+
</td>
105+
<td style="text-align:center">
106+
<span style="font-size:18px"><b>84.54</b></span>
107+
</td>
70108
</tr>
71109
<tr>
72-
<td rowspan=2 align=center> 24L1024H </td>
73110
<td style="text-align:center">
74111
<span style="font-size:18px">ERNIE 2.0-Large-zh</span>
75112
</td>
76113
<td style="text-align:center">
77-
<span style="font-size:18px"><b>77.03</b></span>
114+
<span style="font-size:18px">77.03</span>
78115
</td>
79116
<td style="text-align:center">
80117
<span style="font-size:18px"><b>76.41</b></span>
@@ -89,16 +126,16 @@
89126
<span style="font-size:18px">83.82</span>
90127
</td>
91128
<td style="text-align:center">
92-
<span style="font-size:18px"><b>79.69</b></span>
129+
<span style="font-size:18px">79.69</span>
93130
</td>
94131
<td style="text-align:center">
95132
<span style="font-size:18px">89.14</span>
96133
</td>
97134
<td style="text-align:center">
98-
<span style="font-size:18px"><b>84.10</b></span>
135+
<span style="font-size:18px">84.10</span>
99136
</td>
100137
<td style="text-align:center">
101-
<span style="font-size:18px"><b>71.48/90.35</b></span>
138+
<span style="font-size:18px">71.48/90.35</span>
102139
</td>
103140
<td style="text-align:center">
104141
<span style="font-size:18px">85.52</span>
@@ -124,13 +161,13 @@
124161
<span style="font-size:18px">62.02</span>
125162
</td>
126163
<td style="text-align:center">
127-
<span style="font-size:18px"><b>83.88</b></span>
164+
<span style="font-size:18px">83.88</span>
128165
</td>
129166
<td style="text-align:center">
130167
<span style="font-size:18px">78.81</span>
131168
</td>
132169
<td style="text-align:center">
133-
<span style="font-size:18px"><b>90.79</b></span>
170+
<span style="font-size:18px">90.79</span>
134171
</td>
135172
<td style="text-align:center">
136173
<span style="font-size:18px">83.67</span>
@@ -139,7 +176,7 @@
139176
<span style="font-size:18px">70.58/89.82</span>
140177
</td>
141178
<td style="text-align:center">
142-
<span style="font-size:18px"><b>85.72</b></span>
179+
<span style="font-size:18px">85.72</span>
143180
</td>
144181
<td style="text-align:center">
145182
<span style="font-size:18px">75.26</span>
@@ -151,37 +188,37 @@
151188
<span style="font-size:18px">ERNIE 3.0-Xbase-zh</span>
152189
</td>
153190
<td style="text-align:center">
154-
<span style="font-size:18px"><b>78.71</b></span>
191+
<span style="font-size:18px"><b>78.39</b></span>
155192
</td>
156193
<td style="text-align:center">
157-
<span style="font-size:18px"><b>76.85</b></span>
194+
<span style="font-size:18px"><b>76.16</b></span>
158195
</td>
159196
<td style="text-align:center">
160-
<span style="font-size:18px"><b>59.89</b></span>
197+
<span style="font-size:18px"><b>59.55</b></span>
161198
</td>
162199
<td style="text-align:center">
163-
<span style="font-size:18px"><b>62.41</b></span>
200+
<span style="font-size:18px"><b>61.87</b></span>
164201
</td>
165202
<td style="text-align:center">
166-
<span style="font-size:18px"><b>84.76</b></span>
203+
<span style="font-size:18px"><b>84.40</b></span>
167204
</td>
168205
<td style="text-align:center">
169-
<span style="font-size:18px"><b>82.51</b></span>
206+
<span style="font-size:18px"><b>81.73</b></span>
170207
</td>
171208
<td style="text-align:center">
172-
<span style="font-size:18px"><b>89.80</b></span>
209+
<span style="font-size:18px"><b>88.82</b></span>
173210
</td>
174211
<td style="text-align:center">
175-
<span style="font-size:18px"><b>84.47</b></span>
212+
<span style="font-size:18px"><b>83.60</b></span>
176213
</td>
177214
<td style="text-align:center">
178-
<span style="font-size:18px"><b>75.49/92.67</b></span>
215+
<span style="font-size:18px"><b>75.99/93.00</b></span>
179216
</td>
180217
<td style="text-align:center">
181-
<span style="font-size:18px"><b>86.36</b></span>
218+
<span style="font-size:18px"><b>86.78</b></span>
182219
</td>
183220
<td style="text-align:center">
184-
<span style="font-size:18px"><b>84.59</b></span>
221+
<span style="font-size:18px"><b>84.98</b></span>
185222
</td>
186223
</tr>
187224
<tr>
@@ -270,31 +307,31 @@
270307
<span style="font-size:18px">ERNIE 2.0-Base-zh</span>
271308
</td>
272309
<td style="text-align:center">
273-
<span style="font-size:18px">74.95</span>
310+
<span style="font-size:18px">74.32</span>
274311
</td>
275312
<td style="text-align:center">
276-
<span style="font-size:18px">76.25</span>
313+
<span style="font-size:18px">75.65</span>
277314
</td>
278315
<td style="text-align:center">
279-
<span style="font-size:18px">58.53</span>
316+
<span style="font-size:18px">58.25</span>
280317
</td>
281318
<td style="text-align:center">
282-
<span style="font-size:18px">61.72</span>
319+
<span style="font-size:18px">61.64</span>
283320
</td>
284321
<td style="text-align:center">
285-
<span style="font-size:18px">83.07</span>
322+
<span style="font-size:18px">82.62</span>
286323
</td>
287324
<td style="text-align:center">
288-
<span style="font-size:18px">78.81</span>
325+
<span style="font-size:18px">78.71</span>
289326
</td>
290327
<td style="text-align:center">
291-
<span style="font-size:18px">84.21</span>
328+
<span style="font-size:18px">81.91</span>
292329
</td>
293330
<td style="text-align:center">
294-
<span style="font-size:18px">82.77</span>
331+
<span style="font-size:18px">82.33</span>
295332
</td>
296333
<td style="text-align:center">
297-
<span style="font-size:18px">68.22/88.71</span>
334+
<span style="font-size:18px">66.08/87.46</span>
298335
</td>
299336
<td style="text-align:center">
300337
<span style="font-size:18px">82.78</span>
@@ -1154,6 +1191,7 @@ AFQMC(语义相似度)、TNEWS(文本分类)、IFLYTEK(长文本分类
11541191

11551192
| Model | AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC2018 | CHID | C<sup>3</sup> |
11561193
| -------------------------------- | ------- | ------- | ------- | -------- | -------- | ----------- | ------- | -------- | ------- | ------------- |
1194+
| ERNIE 1.0-Large-zh-CW | 2e-5,64 | 3e-5,32 | 5e-5,16 | 2e-5,16 | 2e-5,32 | 1e-5,32 | 1e-5,16 | 2e-5,24 | 1e-5,24 | 2e-5,32 |
11571195
| ERNIE 3.0-Xbase-zh | 2e-5,16 | 3e-5,32 | 3e-5,32 | 3e-5,64 | 3e-5,64 | 2e-5,32 | 1e-5,16 | 3e-5,24 | 2e-5,24 | 3e-5,24 |
11581196
| ERNIE 2.0-Large-zh | 1e-5,32 | 3e-5,64 | 3e-5,32 | 2e-5,32 | 1e-5,16 | 3e-5,32 | 1e-5,64 | 2e-5,24 | 2e-5,24 | 3e-5,32 |
11591197
| HFL/RoBERTa-wwm-ext-large | 1e-5,32 | 3e-5,32 | 2e-5,32 | 1e-5,16 | 1e-5,16 | 2e-5,16 | 2e-5,16 | 3e-5,32 | 1e-5,24 | 2e-5,24 |

0 commit comments

Comments
 (0)