tsingmicro training code #779

Open · wants to merge 1 commit into base: main
153 changes: 153 additions & 0 deletions training/tsingmicro/Baichuan2-13B/fineturning_xla.sh
@@ -0,0 +1,153 @@
#!/bin/sh
clear
export XLA_THREAD_POOL_SIZE=4
export XLA_IO_THREAD_POOL_SIZE=4
export NPROC=4
export OMP_NUM_THREADS=4
export OMP_WAIT_POLICY=PASSIVE
export TF_CPP_LOG_THREAD_ID=1
export TX8_MAX_CONSTANT_SIZE=1
#export TF_XLA_FLAGS="--xla_min_cluster_size=1 --tf_xla_enable_xla_devices"
export TF_CPP_MIN_LOG_LEVEL=0
export TF_CPP_VMODULE="poplar_compiler=1"
#export TF_CPP_VMODULE="poplar_compiler=1,tx8_executor=3,xla_graph_executor=1,init_python_bindings=1,tfrt_cpu_pjrt_client=1,tx8_threadpool=3,xla_device=1,tfrt_tx8_pjrt_client=3,pjrt_tx8_client=1,tx8_threadpool=3,hlo_pass_pipeline=5,hlo_constant_folding=5,tx8_hlo_constant_folding=5,hlo_evaluator=1,shape_util=1,hlo_evaluator=1"
case $1 in
0)
echo "单机单卡 Baichuan2-13B GPU"
export CUDA_VISIBLE_DEVICES=1 #指定哪些cuda卡
export USE_TORCH_XLA=1 #动态图静态图
export TX8_NUM_DEVICES=1 #全模型大 -》多卡
export PJRT_DEVICE=TX8
export PJRT_LOCAL_WORLD_SIZE=1
# export TSM_DUMP_DATA=1 # dump tensors (overwrites existing dumps)
# export TX8_MODEL_EXPORT_LEVEL=11
# export XLA_HLO_DEBUG=1
# --mstt_config_name_or_path ./config_tensor.json \
python tx8/run_xla_train.py \
--model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
--dataset_name /login_home/zhangna/dataset/samsum \
--dataset_config_name default \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--trust_remote_code True \
--do_train \
--output_dir ./output \
--overwrite_output_dir \
--num_hidden_layers 1 \
--block_size 2048 \
--cache_dir ./output/tmp \
--torch_dtype bfloat16 \
--optim adamw_torch \
--learning_rate 5e-5 \
--save_strategy no \
--logging_dir ./output/tmp/logs \
--logging_strategy steps \
--logging_steps 5
# --use_flash_attn True
;;

1)
echo "单机多卡 Baichuan2-13B GPU"
export CUDA_VISIBLE_DEVICES=0,1,2,3 #指定哪些cuda卡
export USE_TORCH_XLA=1 #动态图静态图
export TX8_NUM_DEVICES=4 #全模型大 -》多卡
export PJRT_DEVICE=TX8
export PJRT_LOCAL_WORLD_SIZE=4
# export TSM_DUMP_DATA=1 # dump tensors (overwrites existing dumps)
# export TX8_MODEL_EXPORT_LEVEL=10
# export XLA_HLO_DEBUG=1
export MASTER_ADDR=localhost
export MASTER_PORT=12355
python tx8/xla_spawn.py tx8/run_xla_train.py \
--model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
--dataset_name /login_home/zhangna/dataset/samsum \
--dataset_config_name default \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--trust_remote_code True \
--do_train \
--output_dir ./output \
--overwrite_output_dir \
--num_hidden_layers 1 \
--block_size 2048 \
--cache_dir ./output/tmp \
--torch_dtype bfloat16 \
--optim adamw_torch \
--learning_rate 5e-5 \
--save_strategy no \
--logging_dir ./output/tmp/logs \
--logging_strategy steps \
--logging_steps 5 \
--fsdp "full_shard" \
--fsdp_config ./tx8/fsdp_xla_config.json
# --use_flash_attn True
;;

2)
echo "单机多卡 Baichuan2-13B GPU"
export CUDA_VISIBLE_DEVICES=0,1,2,3 #指定哪些cuda卡
export USE_TORCH_XLA=1 #动态图静态图
export TX8_NUM_DEVICES=4 #全模型大 -》多卡
export PJRT_DEVICE=TX8
export PJRT_LOCAL_WORLD_SIZE=4
export TSM_DUMP_DATA=1 # dump tensors (overwrites existing dumps)
export TX8_MODEL_EXPORT_LEVEL=10
export MASTER_ADDR=localhost
export MASTER_PORT=12355
python tx8/xla_spawn.py tx8/run_xla_train.py \
--model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
--dataset_name /login_home/zhangna/dataset/samsum \
--dataset_config_name default \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--trust_remote_code True \
--do_train \
--output_dir ./output \
--overwrite_output_dir \
--num_hidden_layers 8 \
--block_size 2048 \
--cache_dir ./output/tmp \
--torch_dtype bfloat16 \
--optim adamw_torch \
--learning_rate 5e-5 \
--save_strategy no \
--logging_dir ./output/tmp/logs \
--logging_strategy steps \
--logging_steps 5 \
--fsdp "full_shard" \
--fsdp_config ./tx8/fsdp_xla_config.json
;;

3)
echo "单机单卡 Baichuan2-13B cuda"
export CUDA_VISIBLE_DEVICES=0 #指定哪些cuda卡
export USE_TORCH_XLA=1 #动态图静态图
export TX8_NUM_DEVICES=1 #全模型大 -》多卡
export PJRT_DEVICE=TX8
export PJRT_LOCAL_WORLD_SIZE=1
# export TSM_DUMP_DATA=1 # dump tensors (overwrites existing dumps)
# export TX8_MODEL_EXPORT_LEVEL=11
torchrun tx8/run_xla_train.py \
--model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
--dataset_name /login_home/zhangna/dataset/samsum \
--dataset_config_name default \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--trust_remote_code True \
--do_train \
--output_dir ./output \
--overwrite_output_dir \
--num_hidden_layers 1 \
--block_size 2048 \
--cache_dir ./output/tmp \
--torch_dtype bfloat16 \
--optim adamw_torch \
--learning_rate 5e-5 \
--save_strategy no \
--logging_dir ./output/tmp/logs \
--logging_strategy steps \
--logging_steps 5 \
--use_cuda
;;

esac
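
The script dispatches on its first positional argument. A minimal invocation sketch (assuming the hard-coded model and dataset paths above exist on the host):

# mode 0: single-device run; mode 1: 4-device XLA FSDP; mode 2: 4-device XLA FSDP
# with tensor dumping (TSM_DUMP_DATA=1); mode 3: CUDA baseline launched via torchrun
bash fineturning_xla.sh 1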
14 changes: 14 additions & 0 deletions training/tsingmicro/Baichuan2-13B/tx8/compare_distributed.py
@@ -0,0 +1,14 @@
import sys

# compare_distributed is provided by the tsprobe precision-comparison toolkit
from tsprobe.pytorch import *

def main():
    # usage: compare_distributed.py <npu_dump_path> <gpu_dump_path> <output_path>
    npu_dump_path = sys.argv[1]
    gpu_dump_path = sys.argv[2]
    output_path = sys.argv[3]
    print('npu_dump_path:', npu_dump_path)
    print('gpu_dump_path:', gpu_dump_path)
    print('dump_compare_output_path:', output_path)
    compare_distributed(npu_dump_path, gpu_dump_path, output_path)

if __name__ == "__main__":
    main()
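
The comparison tool takes three positional arguments: the NPU dump directory, the GPU dump directory, and an output directory. An invocation sketch (the dump paths below are placeholders; the real locations depend on where the TSM_DUMP_DATA runs write their output):

python tx8/compare_distributed.py ./data_dump/npu ./data_dump/gpu ./compare_output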
18 changes: 18 additions & 0 deletions training/tsingmicro/Baichuan2-13B/tx8/config_tensor.json
@@ -0,0 +1,18 @@
{
"task": "tensor",
"dump_path": "/login_home/zhangna/transformers-4.46.1_2048_577/examples/train/Baichuan2-13B/data_dump",
"rank": [],
"step": [],
"level": "L1",
"seed": 1234,
"is_deterministic": false,
"enable_dataloader": false,

"tensor": {
"scope": [],
"list":[],
"data_mode": ["all"],
"backward_input": "",
"file_format": "bin"
}
}
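
This file configures the tsprobe/msprobe tensor dump (task "tensor", all data for every rank and step, binary format). A sketch of enabling it, assuming run_xla_train.py accepts the --mstt_config_name_or_path flag that appears commented out in fineturning_xla.sh, and using the path where this PR adds the config:

python tx8/run_xla_train.py \
    --model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
    --dataset_name /login_home/zhangna/dataset/samsum \
    --mstt_config_name_or_path ./tx8/config_tensor.json \
    --do_train --output_dir ./output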
19 changes: 19 additions & 0 deletions training/tsingmicro/Baichuan2-13B/tx8/fsdp_xla_config.json
@@ -0,0 +1,19 @@
{
"fsdp_transformer_layer_cls_to_wrap": [
"BaichuanLayer"
],
"xla": true,
"xla_fsdp_settings": {
"compute_dtype": "bfloat16",
"shard_param_on_dim_0": false,
"pin_layout_in_collective_ops": false,
"_debug_dummy_reduce_scatter_op": false,
"flatten_parameters": true,
"param_init_cpu_device": true,
"optimization_barrier_in_forward": false,
"optimization_barrier_in_backward": false,
"reshard_after_forward": false,
"_shard_size_multiple": 1
},
"xla_fsdp_grad_ckpt": false
}
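
The Trainer forwards these settings to the torch_xla FSDP wrapper (bfloat16 compute, flattened parameters, CPU-side parameter init, no resharding after forward), wrapping each BaichuanLayer. A hedged sketch for memory-constrained runs, assuming xla_fsdp_grad_ckpt toggles gradient checkpointing in the HF XLA-FSDP integration:

# flip xla_fsdp_grad_ckpt on, then relaunch the 4-device FSDP mode
sed -i 's/"xla_fsdp_grad_ckpt": false/"xla_fsdp_grad_ckpt": true/' tx8/fsdp_xla_config.json
bash fineturning_xla.sh 1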
49 changes: 49 additions & 0 deletions training/tsingmicro/Baichuan2-13B/tx8/mstt_util.py
@@ -0,0 +1,49 @@
try:
    from msprobe.pytorch import PrecisionDebugger
    used_mstt = True
except ImportError:
    used_mstt = False
    print("warning: failed to import msprobe (mstt); precision dumping is disabled")

# The example below dumps forward/backward data for a designated code block.

"""Python
from transformers.utils import (
    PrecisionDebuggerINIT,
    PrecisionDebuggerMarkStep,
    PrecisionDebuggerBGN,
    PrecisionDebuggerEND,
)

# Do not place the PrecisionDebugger initialization inside loop code.
PrecisionDebuggerINIT(config_path="./config.json")

# Define and initialize the model, loss function, etc.

# Dataset iteration generally marks the point where training starts.
for data, label in data_loader:
    PrecisionDebuggerBGN()  # start the data dump

    # Logic executed on every training step:
    output = model(data)
    PrecisionDebuggerEND()  # insert after the start call: only forward/backward data
                            # between start and this call is dumped; data between this
                            # call and stop is not dumped
    # ...
    loss.backward()
    xm.mark_step()
    PrecisionDebuggerMarkStep()  # stop the data dump; must be called after mark_step()
"""
def PrecisionDebuggerBGN():
    if used_mstt:
        PrecisionDebugger.start()

def PrecisionDebuggerEND():
    if used_mstt:
        PrecisionDebugger.forward_backward_dump_end()

# Example config_path: '/workspace/SPMD_TX8_DEVELOP/transformer/config_tensor.json'
def PrecisionDebuggerINIT(config_path, task=None, dump_path=None, level=None, model=None, step=None):
    if used_mstt:
        return PrecisionDebugger(config_path=config_path, task=task, dump_path=dump_path,
                                 level=level, model=model, step=step)
    return None

def PrecisionDebuggerMarkStep():
    if used_mstt:
        PrecisionDebugger.stop()