Add tensor parallelism for RWKV #1237
base: main
@@ -0,0 +1,103 @@
{
  # Parallelism is not yet supported for rwkv
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,

  "num_layers": 24,
  "hidden_size": 2048,
  "num_attention_heads": 32, # head_size = dim_att / num_attention_heads.
                             # head_size is 64 for all rwkv models
  "seq_length": 4096,
  "max_position_embeddings": 4096,
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-5,
  "train_micro_batch_size_per_gpu": 4,

  "attention_config": [[["rwkv"], 24]],

  "activation": "silu",

  # model settings

  #"pos_emb": "rotary",
  "rotary_pct": 0.25,
  "no_weight_tying": true,
  "gpt_j_residual": true,

  # these should provide some speedup but takes a while to build, set to true if desired
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0008,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  "min_lr": 0.00008,

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": True,
    "allgather_bucket_size": 500000000,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": True,
  },

  # batch / data settings
  "data_impl": "mmap",
  "num_workers": 1,

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.1,
  "hidden_dropout": 0,
  "attention_dropout": 0,

  # precision settings
  "bf16": {
    "bf16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1,
  },

  # misc. training settings
  "train_iters": 320000,
  "lr_decay_iters": 320000,
  "distributed_backend": "nccl",
  "lr_decay_style": "constant",
  "warmup": 0.01,
  "checkpoint_factor": 100,
  "eval_interval": 100000,
  "eval_iters": 10,
  "seed": 1234,

  # logging
  "log_interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,
}
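The head_size comment in this config is easy to sanity-check. The sketch below is illustrative only and is not code from this PR: it assumes dim_att defaults to hidden_size (as in the reference RWKV code), and expand_attention_config is an invented helper that mimics how a [[pattern, repeat]] style attention_config could expand into one block type per layer; the real GPT-NeoX expansion logic may differ in detail.

```python
# Illustrative sanity check for this config (not part of this PR).
hidden_size = 2048
num_attention_heads = 32
dim_att = hidden_size                       # assumption: dim_att == hidden_size
head_size = dim_att // num_attention_heads
assert head_size == 64                      # "head_size is 64 for all rwkv models"

# Hypothetical sketch of how [[["rwkv"], 24]] could expand into a per-layer list.
def expand_attention_config(attention_config, num_layers):
    per_layer = []
    for pattern, repeat in attention_config:
        per_layer.extend(pattern * repeat)
    assert len(per_layer) == num_layers
    return per_layer

print(expand_attention_config([[["rwkv"], 24]], num_layers=24))  # ['rwkv', 'rwkv', ...]
```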
@@ -0,0 +1,103 @@
{
  # Parallelism is not yet supported for rwkv
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,

  "num_layers": 24,
  "hidden_size": 1024,
  "num_attention_heads": 16, # head_size = dim_att / num_attention_heads.
                             # head_size is 64 for all rwkv models

  [review thread on the "num_attention_heads" line]
  Comment: Similar comment here. Calling these attention heads is highly misleading.
  Reply: I kind of disagree, as rwkv code generally references time mixing as …

  "seq_length": 4096,
  "max_position_embeddings": 4096,
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-5,
  "train_micro_batch_size_per_gpu": 1,

  "attention_config": [[["rwkv"], 24]],

  "activation": "silu",

  # model settings

  #"pos_emb": "rotary",
  "rotary_pct": 0.25,
  "no_weight_tying": true,
  "gpt_j_residual": true,

  # these should provide some speedup but takes a while to build, set to true if desired
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0008,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  "min_lr": 0.00008,

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": True,
    "allgather_bucket_size": 500000000,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": True,
  },

  # batch / data settings
  "data_impl": "mmap",
  "num_workers": 1,

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.1,
  "hidden_dropout": 0,
  "attention_dropout": 0,

  # precision settings
  "bf16": {
    "bf16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1,
  },

  # misc. training settings
  "train_iters": 320000,
  "lr_decay_iters": 320000,
  "distributed_backend": "nccl",
  "lr_decay_style": "constant",
  "warmup": 0.01,
  "checkpoint_factor": 100,
  "eval_interval": 100000,
  "eval_iters": 10,
  "seed": 1234,

  # logging
  "log_interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,
}
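Both configs above share the same schedule fields, and the arithmetic they imply is easy to spell out. The sketch below is illustrative only, not GPT-NeoX's scheduler code; it assumes "warmup" is a fraction of the decay horizon, and since train_iters == lr_decay_iters here, either reading gives the same count.

```python
# Back-of-the-envelope for the schedule fields above (not code from this PR).
lr = 0.0008
min_lr = 0.00008
warmup_fraction = 0.01
lr_decay_iters = 320_000

warmup_iters = int(warmup_fraction * lr_decay_iters)   # 3200 warmup steps

def lr_at(step):
    # Linear warmup, then flat ("lr_decay_style": "constant"),
    # so min_lr never actually comes into play for this config.
    if step < warmup_iters:
        return lr * step / warmup_iters
    return lr

print(warmup_iters)    # 3200
print(lr_at(1600))     # 0.0004, halfway through warmup
print(lr_at(100_000))  # 0.0008, constant for the rest of training
```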
@@ -0,0 +1,102 @@
{
  # Parallelism is not yet supported for rwkv
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,

  "num_layers": 32,
  "hidden_size": 4096,
  "num_attention_heads": 64, # head_size = dim_att / num_attention_heads.
                             # head_size is 64 for all rwkv models
  "seq_length": 4096,
  "max_position_embeddings": 4096,
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-5,
  "train_micro_batch_size_per_gpu": 8,

  "attention_config": [[["rwkv"], 32]],

  "activation": "silu",

  # model settings

  #"pos_emb": "rotary",
  "rotary_pct": 0.25,
  "no_weight_tying": true,
  "gpt_j_residual": true,

  # these should provide some speedup but takes a while to build, set to true if desired
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0008,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  "min_lr": 0.00008,

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": True,
    "allgather_bucket_size": 500000000,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": True,
  },

  # batch / data settings
  "data_impl": "mmap",
  "num_workers": 1,

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.1,
  "hidden_dropout": 0,
  "attention_dropout": 0,

  # precision settings
  "bf16": {
    "bf16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1,
  },

  # misc. training settings
  "train_iters": 500,
  "lr_decay_iters": 500,
  "distributed_backend": "nccl",
  "lr_decay_style": "constant",
  "warmup": 0.01,
  "checkpoint_factor": 100,
  "eval_interval": 100000,
  "eval_iters": 10,

  # logging
  "log_interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,
}
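The PR's subject is tensor parallelism for RWKV, even though these example configs still pin pipe_parallel_size and model_parallel_size to 1. As a rough, hypothetical sketch of the bookkeeping that head-wise sharding implies (this is not the PR's implementation, and shard_heads is an invented helper), each model-parallel rank would own a contiguous slice of the heads:

```python
# Hypothetical sketch (not the PR's implementation) of head-wise sharding
# for RWKV time mixing, with head_size fixed at 64 as in these configs.
def shard_heads(hidden_size, num_heads, model_parallel_size, rank):
    assert num_heads % model_parallel_size == 0, "heads must divide evenly across ranks"
    head_size = hidden_size // num_heads                 # 64 for these configs
    heads_per_rank = num_heads // model_parallel_size
    first_head = rank * heads_per_rank
    dim_att_per_rank = heads_per_rank * head_size        # per-rank slice of the time-mixing dim
    return first_head, heads_per_rank, dim_att_per_rank

# Largest config above: hidden_size 4096, 64 heads; hypothetical mp size of 4.
print(shard_heads(4096, 64, model_parallel_size=4, rank=0))  # (0, 16, 1024)
print(shard_heads(4096, 64, model_parallel_size=4, rank=3))  # (48, 16, 1024)
```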
Comment: we should either have unified args (across mamba, rwkv, transformers) for these, or prepend these args with whatever block type they're targeting (e.g. rwkv_dim_att).
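Purely to illustrate the prefixed-args option the reviewer raises (nothing here is decided in the PR; every key below except hidden_size, and the args_for_block helper, are hypothetical):

```python
# Hypothetical naming scheme: block-specific args carry their block type as a prefix,
# so rwkv and mamba settings can coexist alongside shared arguments.
config = {
    "hidden_size": 4096,     # shared argument
    "rwkv_dim_att": 4096,    # hypothetical rwkv-prefixed argument
    "rwkv_head_size": 64,    # hypothetical
    "mamba_d_state": 16,     # hypothetical mamba-prefixed argument
}

def args_for_block(config, block_type):
    """Collect shared args plus the ones prefixed with this block type."""
    prefix = block_type + "_"
    block_args = {k[len(prefix):]: v for k, v in config.items() if k.startswith(prefix)}
    shared = {k: v for k, v in config.items() if not k.startswith(("rwkv_", "mamba_"))}
    return {**shared, **block_args}

print(args_for_block(config, "rwkv"))  # {'hidden_size': 4096, 'dim_att': 4096, 'head_size': 64}
```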