
Commit 569b7c8
add keti7 scripts
1 parent efd7d26 commit 569b7c8

11 files changed: +1050 -0 lines changed

llm/auto_parallel/galvatron/scripts/profile_all2all.sh

Lines changed: 115 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
export NCCL_IB_DISABLE=0
echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 8 --save_file_name /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json "
python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 8 --save_file_name /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
sleep 1
echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 4 --save_file_name /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json "
python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 4 --save_file_name /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
sleep 1
echo "Running python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 2 --save_file_name /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json "
python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 --log_dir output/profile_allreduce /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/paddlenlp/experimental/galvatron/profiler/profile_allreduce.py --output_dir "./output" --profile_time 0 --tp_deg 2 --save_file_name /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
sleep 1
rm -r ./profiler_log
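
The three launches above differ only in --tp_deg (8, 4, 2). A minimal equivalent sketch, assuming the same script and config paths and the same placeholder --ips value, would loop over the degrees instead of repeating the command:

# Sketch only: the same launch as above, looping over the tensor-parallel degrees.
for tp_deg in 8 4 2; do
    echo "Running allreduce profile with tp_deg=${tp_deg}"
    python3 -u -m paddle.distributed.launch --ips xxx --gpus 0,1,2,3,4,5,6,7 \
        --log_dir output/profile_allreduce \
        /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/paddlenlp/experimental/galvatron/profiler/profile_allreduce.py \
        --output_dir "./output" --profile_time 0 --tp_deg "${tp_deg}" \
        --save_file_name /apdcephfs_fsgm/share_303760348/guangming/WorkSpace/paddle3.0/llm/auto_parallel/galvatron-llama-submit/configs/allreduce_bandwidth_1nodes_8gpus_per_node.json
    sleep 1
done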

llm/auto_parallel/galvatron/scripts/profile_allreduce_sp.sh

Lines changed: 116 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -x
unset CUDA_VISIBLE_DEVICES

unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT
unset PADDLE_TRAINERS_NUM
unset PADDLE_TRAINER_ID
unset PADDLE_WORKERS_IP_PORT_LIST
unset PADDLE_TRAINERS
unset PADDLE_NUM_GRADIENT_SERVERS

source <path_to_your_own_python>

task_name="qwen"
dir_name="profile_computation"
rm -rf output/$dir_name/$task_name/
rm -rf "output/$dir_name/$task_name""_log"

export SOT_LOG_LEVEL=4
export PYTHONPATH=../../../:$PYTHONPATH

TRAINER="./train_qwen.py"
LAUNCHER="python -u -m paddle.distributed.launch"
LAUNCHER="${LAUNCHER} --gpus 7" # set the GPUs to use
LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output""

export LAUNCHER=$LAUNCHER

# [max_steps] [logging_steps] [enable_auto_parallel]
TRAIN_ARGS="
    --weight_decay 0.01 \
    --warmup_ratio 0.01 \
    --max_grad_norm 1.0 \
    --learning_rate 3e-05 \
    --min_learning_rate 3e-06 \
    --max_steps 25 \
    --logging_steps 1 \
    --continue_training 0 \
    --do_train true \
    --do_eval false \
    --do_predict false \
    --disable_tqdm true \
    --skip_profile_timer false \
    --skip_memory_metrics 0 \
    --save_total_limit 2 \
    --device gpu \
    --dataloader_num_workers 1 \
    --distributed_dataloader 0 \
    --enable_auto_parallel 1 \
"

# [seq_length] [num_hidden_layers]
MODEL_ARGS="
    --model_name_or_path "llama" \
    --tokenizer_name_or_path "llama" \
    --num_hidden_layers 2 \
    --intermediate_size 25600 \
    --vocab_size 32000 \
    --hidden_size 5120 \
    --seq_length 1024 \
    --num_attention_heads 64 \
    --num_key_value_heads 8 \
"

# [mbsz, accumulation_steps] [recompute] [amp]
CONFIG_ARGS="
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --recompute true \
    --recompute_use_reentrant true \
    --recompute_granularity full \
    --pp_recompute_interval 0 \
    --bf16 true \
    --fp16_opt_level "O2" \
    --amp_master_grad true \
    --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
    --amp_custom_white_list "lookup_table" "lookup_table_v2" \
"

# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs]
PARALLEL_ARGS=(
    --to_static 0
    --sharding_parallel_degree 1
    --sharding "stage2"
    --tensor_parallel_degree 2
    --sequence_parallel true
    --pipeline_parallel_degree 2
    --virtual_pp_degree 1
    --pipeline_schedule_mode "1F1B"
    --sep_parallel_degree 1
    --pipeline_parallel_config "enable_send_recv_overlap"
    --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate"
    --sharding_parallel_config "enable_overlap enable_release_grads"
    --tensor_parallel_config "enable_mp_async_allreduce replace_with_parallel_cross_entropy"
)

# [fused] [flash_attention]
DEFAULT_OPTIMIZER="
    --fuse_attention_ffn true \
    --fuse_attention_qkv true \
    --fused_linear_param_grad_add 1 \
    --fuse_sequence_parallel_allreduce true \
    --use_flash_attention true \
    --use_fused_rope true \
    --use_fused_rms_norm false \
    --enable_linear_fused_grad_add true \
"

# [data]
DATA_ARGS="
    --input_dir ./data \
    --split 949,50,1 \
    --max_seq_length 16384"

# [runtime profiler]
RUNTIME_PROFILE_ARGS="
    --profile_time_flag 1 \
    --profile_forward_only 1 \
    --save_time_flag 1 \
"

# [model profiler] [sequence type]
MODEL_PROFILER_ARGS="
    --profile_type computation \
    --profile_mode sequence \
    --profile_fixed_batch_size 1 \
    --layernum_min 1 \
    --layernum_max 2 \
    --profile_min_seq_length 4096 \
    --profile_max_seq_length 16384 \
    --profile_seq_length_step 4096 \
    --num_layertype 1 \
"

python ./profile.py \
    $MODEL_ARGS \
    $TRAIN_ARGS \
    $CONFIG_ARGS \
    "${PARALLEL_ARGS[@]}" \
    $DEFAULT_OPTIMIZER \
    $DATA_ARGS \
    $RUNTIME_PROFILE_ARGS \
    $MODEL_PROFILER_ARGS
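
With --profile_mode sequence, the arguments above sweep sequence lengths from --profile_min_seq_length to --profile_max_seq_length in steps of --profile_seq_length_step (4096, 8192, 12288, 16384), and --layernum_min 1 / --layernum_max 2 presumably let the profiler difference two runs to isolate per-layer computation time. A small illustrative sketch of the implied sweep (not part of profile.py):

# Illustration (assumption): the profile points implied by MODEL_PROFILER_ARGS.
# Per-layer time at each length would then be estimated roughly as
#   t_layer(seq) ~ (t(layernum_max, seq) - t(layernum_min, seq)) / (layernum_max - layernum_min)
for seq_len in $(seq 4096 4096 16384); do   # 4096 8192 12288 16384
    echo "computation profile point: seq_length=${seq_len}, layernum in {1, 2}"
done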
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

launch="${interpreter} -u -m paddle.distributed.launch"
launch="${launch} --master $master:$port --nnodes $nnodes --rank $rank --gpus 0,1,2,3,4,5,6,7"

export INTERPRETER=${interpreter}
export LAUNCHER=${launch}
export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_4,mlx5_bond_3,mlx5_bond_2,mlx5_bond_7,mlx5_bond_6,mlx5_bond_8,mlx5_bond_5
export NCCL_IB_DISABLE=0

PROFILE_HARDWARE_ARGS=(
    --num_nodes $nnodes
    --num_gpus_per_node 8
    --backend 'paddle'
    --max_pp_deg 8
    --max_tp_deg 8
)

${interpreter} profile_hardware.py \
    "${PROFILE_HARDWARE_ARGS[@]}"
Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -x
unset CUDA_VISIBLE_DEVICES

unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT
unset PADDLE_TRAINERS_NUM
unset PADDLE_TRAINER_ID
unset PADDLE_WORKERS_IP_PORT_LIST
unset PADDLE_TRAINERS
unset PADDLE_NUM_GRADIENT_SERVERS

source <path_to_your_own_python>

task_name="qwen_profile_memory"
dir_name="profile_memory"
rm -rf output/$dir_name/$task_name/
rm -rf "output/$dir_name/$task_name""_log"

export SOT_LOG_LEVEL=4
export PYTHONPATH=../../../:$PYTHONPATH

TRAINER="./train_qwen.py"
LAUNCHER="python -u -m paddle.distributed.launch"
LAUNCHER="${LAUNCHER} --gpus 0,1,2,3,4,5,6,7" # set the GPUs to use
LAUNCHER="${LAUNCHER} --log_dir output/$dir_name/$task_name""_log ${TRAINER} --output_dir "./output""

export LAUNCHER=$LAUNCHER
export PROFILE_WORLD_SIZE=8

# [max_steps] [logging_steps] [enable_auto_parallel]
TRAIN_ARGS="
    --weight_decay 0.01 \
    --warmup_ratio 0.01 \
    --max_grad_norm 1.0 \
    --learning_rate 3e-05 \
    --min_learning_rate 3e-06 \
    --max_steps 10 \
    --logging_steps 1 \
    --continue_training 0 \
    --do_train true \
    --do_eval false \
    --do_predict false \
    --disable_tqdm true \
    --skip_profile_timer false \
    --skip_memory_metrics 0 \
    --save_total_limit 2 \
    --device gpu \
    --dataloader_num_workers 1 \
    --distributed_dataloader 0 \
    --enable_auto_parallel 1 \
"

# [seq_length] [num_hidden_layers]
MODEL_ARGS="
    --model_name_or_path "llama" \
    --tokenizer_name_or_path "llama" \
    --num_hidden_layers 2 \
    --intermediate_size 25600 \
    --vocab_size 32000 \
    --hidden_size 5120 \
    --seq_length 1024 \
    --num_attention_heads 64 \
    --num_key_value_heads 8 \
"

# [mbsz, accumulation_steps] [recompute] [amp]
CONFIG_ARGS="
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 1 \
    --recompute false \
    --recompute_use_reentrant true \
    --recompute_granularity full \
    --pp_recompute_interval 0 \
    --bf16 true \
    --fp16_opt_level "O2" \
    --amp_master_grad true \
    --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
    --amp_custom_white_list "lookup_table" "lookup_table_v2" \
"

# [dp_deg, dp_type] [tp_deg, megatron-sp] [pp_deg, 1F1B] [parallel_configs]
PARALLEL_ARGS=(
    --to_static 0
    --sharding_parallel_degree 1
    --sharding "stage2"
    --tensor_parallel_degree 2
    --sequence_parallel true
    --pipeline_parallel_degree 2
    --virtual_pp_degree 1
    --pipeline_schedule_mode "1F1B"
    --sep_parallel_degree 1
    --pipeline_parallel_config "enable_send_recv_overlap"
    --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate"
    --sharding_parallel_config "enable_overlap"
    --tensor_parallel_config "enable_mp_async_allreduce"
)

# [fused] [flash_attention]
DEFAULT_OPTIMIZER_ARGS="
    --fuse_attention_ffn true \
    --fuse_attention_qkv true \
    --fused_linear_param_grad_add 1 \
    --fuse_sequence_parallel_allreduce true \
    --use_flash_attention true \
    --use_fused_rope true \
    --use_fused_rms_norm true \
    --enable_linear_fused_grad_add true \
"

# [data]
DATA_ARGS="
    --input_dir ./data \
    --split 949,50,1 \
    --max_seq_length 16384"

# [runtime profiler]
RUNTIME_PROFILE_ARGS="
    --profile_memory_flag 1 \
    --save_memory_flag 1 \
"

# [model profiler] [static type]
MODEL_PROFILER_ARGS="
    --profile_type memory \
    --profile_mode static \
    --profile_fixed_batch_size 8 \
    --layernum_min 1 \
    --layernum_max 2 \
    --profile_fixed_seq_length_list 16384 \
    --num_layertype 1 \
    --max_tp_deg 8 \
    --max_per_device_train_batch_size 4 \
"

python ./profile.py \
    $MODEL_ARGS \
    $TRAIN_ARGS \
    $CONFIG_ARGS \
    "${PARALLEL_ARGS[@]}" \
    $DEFAULT_OPTIMIZER_ARGS \
    $DATA_ARGS \
    $RUNTIME_PROFILE_ARGS \
    $MODEL_PROFILER_ARGS
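
Before running, the source <path_to_your_own_python> placeholder has to point at the activation script of a working Python environment, and PROFILE_WORLD_SIZE=8 matches the eight GPUs passed to --gpus. A hedged usage sketch (profile_memory.sh is a stand-in name, since this file's name is not shown in the diff):

# Example invocation (assumptions marked as such).
cd llm/auto_parallel/galvatron
# first edit the placeholder, e.g.: source /path/to/your/env/bin/activate
bash scripts/profile_memory.sh   # stand-in name for this script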
