nohup.out
/opt/conda/envs/beit3/lib/python3.8/site-packages/torch/distributed/launch.py:180: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects `--local_rank` argument to be set, please
change it to read from `os.environ['LOCAL_RANK']` instead. See
https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
warnings.warn(
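
The deprecation notice above is the torchrun migration: with --use_env behavior, the launcher exports LOCAL_RANK instead of passing a --local_rank flag. A minimal sketch of the fallback pattern the warning describes (the argparse setup here is illustrative, not the actual parsing in run_beit3_finetuning.py):

import os
import argparse

parser = argparse.ArgumentParser()
# Keep the legacy flag for torch.distributed.launch while also accepting the
# LOCAL_RANK environment variable that torchrun sets.
parser.add_argument("--local_rank", type=int, default=-1)
args = parser.parse_args()

if args.local_rank == -1:
    # torchrun / --use_env case: no flag is passed, so read the environment instead.
    args.local_rank = int(os.environ.get("LOCAL_RANK", 0))
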
WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
| distributed init (rank 3): env://, gpu 3
| distributed init (rank 2): env://, gpu 2
| distributed init (rank 5): env://, gpu 5
| distributed init (rank 0): env://, gpu 0
| distributed init (rank 4): env://, gpu 4
| distributed init (rank 1): env://, gpu 1
| distributed init (rank 6): env://, gpu 6
| distributed init (rank 7): env://, gpu 7
Namespace(aa='rand-m9-mstd0.5-inc1', auto_resume=True, batch_size=128, captioning_mask_prob=0.6, checkpoint=None, checkpoint_activations=True, clip_grad=None, color_jitter=0.4, crop_pct=None, cutmix=1.0, cutmix_minmax=None, data_path='./data/water', deepscale=False, deepscale_config=None, deepspeed=False, deepspeed_config='./saved_model/deepspeed_config.json', deepspeed_mpi=False, device='cuda', dist_backend='nccl', dist_eval=True, dist_on_itp=False, dist_url='env://', distributed=True, drop_path=0.25, drop_worst_after=12000, drop_worst_ratio=0.2, enable_deepspeed=True, epochs=50, eval=False, eval_batch_size=None, finetune='./pretrained_model/beit3_large_itc_patch16_224.pth', gpu=0, hwj_test=False, initial_scale_power=16, input_size=224, label_smoothing=0.1, layer_decay=0.8, length_penalty=0.6, local_rank=0, log_dir='./logs', lr=0.0002, min_lr=1e-06, mixup=0.8, mixup_mode='batch', mixup_prob=1.0, mixup_switch_prob=0.5, model='beit3_large_patch16_224', model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, model_key='model|module', model_prefix='', momentum=0.9, nb_classes=1000, num_beams=3, num_max_bpe_tokens=64, num_workers=10, opt='adamw', opt_betas=[0.9, 0.999], opt_eps=1e-08, output_dir='./saved_model', pin_mem=True, randaug=False, rank=0, recount=1, remode='pixel', reprob=0.25, resplit=False, resume='', save_ckpt=True, save_ckpt_freq=5, seed=42, sentencepiece_model='./pretrained_model/beit3.spm', smoothing=0.1, start_epoch=0, task='imagenet', task_cache_path='./saved_model', task_head_lr_weight=0, train_interpolation='bicubic', update_freq=1, vocab_size=64010, warmup_epochs=5, warmup_lr=1e-06, warmup_steps=-1, weight_decay=0.05, world_size=8, zero_stage=0)
Traceback (most recent call last):
  File "run_beit3_finetuning.py", line 475, in <module>
    main(opts, ds_init)
  File "run_beit3_finetuning.py", line 247, in main
    data_loader_train, data_loader_val = create_downstream_dataset(args)
  File "/workspace/share/beit3/datasets.py", line 855, in create_downstream_dataset
    create_dataset_by_split(args, split="train", is_train=True), \
  File "/workspace/share/beit3/datasets.py", line 831, in create_dataset_by_split
    dataset = dataset_class(
  File "/workspace/share/beit3/datasets.py", line 40, in __init__
    with open(index_file, mode="r", encoding="utf-8") as reader:
FileNotFoundError: [Errno 2] No such file or directory: './data/water/imagenet.train.index.jsonl'
(the identical traceback is raised by each of the 8 distributed worker processes; their copies are interleaved in the raw log)
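
The root failure is datasets.py line 40 trying to open an index file that does not exist: with data_path='./data/water' and task='imagenet' from the Namespace above, the loader expects ./data/water/imagenet.train.index.jsonl. A minimal pre-flight check along these lines (the train file name comes from the traceback; the val name is an assumed analogue, not taken from the BEiT-3 sources) would fail fast before eight workers are spawned:

import os
import sys

data_path = "./data/water"  # --data_path from the Namespace above
task = "imagenet"           # --task from the Namespace above

# Check both splits up front so the error surfaces once, instead of eight
# interleaved FileNotFoundError tracebacks from the distributed workers.
for split in ("train", "val"):
    index_file = os.path.join(data_path, f"{task}.{split}.index.jsonl")
    if not os.path.isfile(index_file):
        sys.exit(f"missing dataset index: {index_file}")
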
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 153390) of binary: /opt/conda/envs/beit3/bin/python
Traceback (most recent call last):
File "/opt/conda/envs/beit3/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/envs/beit3/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/opt/conda/envs/beit3/lib/python3.8/site-packages/torch/distributed/launch.py", line 195, in <module>
main()
File "/opt/conda/envs/beit3/lib/python3.8/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/opt/conda/envs/beit3/lib/python3.8/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/opt/conda/envs/beit3/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/envs/beit3/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/envs/beit3/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
run_beit3_finetuning.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2023-03-23_01:11:05
host : 90116f175f11
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 153391)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2023-03-23_01:11:05
host : 90116f175f11
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 153392)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2023-03-23_01:11:05
host : 90116f175f11
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 153393)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
time : 2023-03-23_01:11:05
host : 90116f175f11
rank : 4 (local_rank: 4)
exitcode : 1 (pid: 153394)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[5]:
time : 2023-03-23_01:11:05
host : 90116f175f11
rank : 5 (local_rank: 5)
exitcode : 1 (pid: 153395)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[6]:
time : 2023-03-23_01:11:05
host : 90116f175f11
rank : 6 (local_rank: 6)
exitcode : 1 (pid: 153396)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[7]:
time : 2023-03-23_01:11:05
host : 90116f175f11
rank : 7 (local_rank: 7)
exitcode : 1 (pid: 153397)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-03-23_01:11:05
host : 90116f175f11
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 153390)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
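
The summary reports error_file: <N/A> because the worker exceptions were never recorded; the linked elastic errors page documents wrapping the entrypoint with the record decorator for that. A minimal sketch, assuming the decorator is applied directly to the script's main function:

from torch.distributed.elastic.multiprocessing.errors import record

@record
def main():
    # run_beit3_finetuning's main(opts, ds_init) would be called here; with
    # @record applied, each worker's exception and traceback are written to an
    # error file and show up in the failure summary instead of "<N/A>".
    pass

if __name__ == "__main__":
    main()
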