# configs.yml — miniDALL-E training configurations (forked from graphcore/examples)
#----------------------------------------------------------------------------------
# Shared settings merged into every named config below via `<<: *defaults`.
# Keys redefined in a config override these (YAML merge-key semantics).
defaults: &defaults
  dataloader_workers: 4  # host-side DataLoader worker processes
  loss_img_weight: 7  # weight of the image-token loss relative to the text-token loss
  text_seq_len: 80  # maximum caption length in tokens
  truncate_captions: true  # canonical lowercase boolean (was `True`, a YAML 1.1-ism)
  lr_decay: true
  checkpoint_output_dir: "./output/ckpt"
  wandb: false  # Weights & Biases logging off by default
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# 16-layer model trained on the CLIP vocabulary.
L16_CLIP_vocab:
  <<: *defaults
  # Execution
  batch_size: 1
  epochs: 200
  batches_per_step: 1
  replication_factor: 1
  gradient_accumulation: 15
  stochastic_rounding: true
  embedding_serialization_factor: 8
  enable_half_partials: true
  ipus_per_replica: 4
  layers_per_ipu: [0, 7, 7, 2]  # transformer layers placed on each of the 4 IPUs
  matmul_proportion: 0.2
  fp16: true
  # Optimizer
  optimizer: "Adam"
  # Dotted mantissa so YAML 1.1 parsers (e.g. PyYAML) resolve a float;
  # bare `3e-4` would load as the string "3e-4".
  learning_rate: 3.0e-4
  enable_half_first_order_momentum: true
  # Model
  hidden_size: 512
  num_hidden_layers: 16
  num_attention_heads: 16
  dim_head: 64
  ff_dropout: 0.0
  attn_dropout: 0.0
  sandwich_norm: true
  # Per-layer attention pattern, comma-separated, one entry per hidden layer.
  attn_types: "axial_row,axial_row,axial_col,axial_row,axial_row,axial_row,axial_col,axial_row,axial_row,axial_row,axial_col,full,axial_row,axial_row,axial_col,full"
  checkpoint_save_steps: 5000
  loss_scaling: 16384
  # Dataset
  input_folder: "./data/COCO"
  # Misc
  wandb_project_name: "miniDALL-E_CLIP_vocab"
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# 16-layer model trained with a YouTokenToMe BPE vocabulary.
L16:
  <<: *defaults
  # Execution
  batch_size: 3
  epochs: 200
  batches_per_step: 1
  replication_factor: 1
  gradient_accumulation: 8
  stochastic_rounding: true
  embedding_serialization_factor: 4
  enable_half_partials: true
  ipus_per_replica: 4
  layers_per_ipu: [0, 6, 6, 4]  # transformer layers placed on each of the 4 IPUs
  fp16: true
  # Optimizer
  optimizer: "Adam"
  # Dotted mantissa so YAML 1.1 parsers (e.g. PyYAML) resolve a float;
  # bare `3e-4` would load as the string "3e-4".
  learning_rate: 3.0e-4
  enable_half_first_order_momentum: true
  # Model
  hidden_size: 512
  num_hidden_layers: 16
  num_attention_heads: 16
  dim_head: 64
  ff_dropout: 0.0
  attn_dropout: 0.0
  # Per-layer attention pattern, comma-separated, one entry per hidden layer.
  attn_types: "axial_row,axial_row,axial_col,axial_row,axial_row,axial_row,axial_col,axial_row,axial_row,axial_row,axial_col,full,axial_row,axial_row,axial_col,full"
  # Quoted for consistency with the file's other path values.
  bpe_path: "./models/bpe/bpe_yttm_vocab.txt"
  checkpoint_save_steps: 5000
  loss_scaling: 16384
  # Dataset
  input_folder: "./data/COCO"
  # Misc
  wandb_project_name: "miniDALL-E"
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# Minimal single-epoch, single-layer configuration for CI/unit tests.
unit_test:
  <<: *defaults
  # Execution
  batch_size: 1
  epochs: 1
  batches_per_step: 1
  replication_factor: 1
  gradient_accumulation: 2
  # Model
  hidden_size: 64
  num_hidden_layers: 1
  num_attention_heads: 1
  dim_head: 64
  ff_dropout: 0.0
  attn_dropout: 0.0
  enable_half_partials: false
  attn_types: "axial_row"  # single layer, single attention type
  # Quoted for consistency with the file's other path values.
  bpe_path: "./models/bpe/bpe_yttm_vocab.txt"
  # Optimizer
  # Dotted mantissa so YAML 1.1 parsers (e.g. PyYAML) resolve a float;
  # bare `3e-4` would load as the string "3e-4".
  learning_rate: 3.0e-4
  loss_scaling: 1
#----------------------------------------------------------------------------------