tsingmicro training code #779

Open · wants to merge 1 commit into base: main
153 changes: 153 additions & 0 deletions training/tsingmicro/Baichuan2-13B/fineturning_xla.sh
@@ -0,0 +1,153 @@
#!/bin/sh
clear
export XLA_THREAD_POOL_SIZE=4
export XLA_IO_THREAD_POOL_SIZE=4
export NPROC=4
export OMP_NUM_THREADS=4
export OMP_WAIT_POLICY=PASSIVE
export TF_CPP_LOG_THREAD_ID=1
export TX8_MAX_CONSTANT_SIZE=1
#export TF_XLA_FLAGS="--xla_min_cluster_size=1 --tf_xla_enable_xla_devices"
export TF_CPP_MIN_LOG_LEVEL=0
export TF_CPP_VMODULE="poplar_compiler=1"
#export TF_CPP_VMODULE="poplar_compiler=1,tx8_executor=3,xla_graph_executor=1,init_python_bindings=1,tfrt_cpu_pjrt_client=1,tx8_threadpool=3,xla_device=1,tfrt_tx8_pjrt_client=3,pjrt_tx8_client=1,tx8_threadpool=3,hlo_pass_pipeline=5,hlo_constant_folding=5,tx8_hlo_constant_folding=5,hlo_evaluator=1,shape_util=1,hlo_evaluator=1"
case $1 in
0)
echo "单机单卡 Baichuan2-13B GPU"
export CUDA_VISIBLE_DEVICES=1 #指定哪些cuda卡
export USE_TORCH_XLA=1 #动态图静态图
export TX8_NUM_DEVICES=1 #全模型大 -》多卡
export PJRT_DEVICE=TX8
export PJRT_LOCAL_WORLD_SIZE=1
# export TSM_DUMP_DATA=1 # dump tensors (overwrites existing dumps)
# export TX8_MODEL_EXPORT_LEVEL=11
# export XLA_HLO_DEBUG=1
# --mstt_config_name_or_path ./config_tensor.json \
python tx8/run_xla_train.py \
--model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
--dataset_name /login_home/zhangna/dataset/samsum \
--dataset_config_name default \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--trust_remote_code True \
--do_train \
--output_dir ./output \
--overwrite_output_dir \
--num_hidden_layers 1 \
--block_size 2048 \
--cache_dir ./output/tmp \
--torch_dtype bfloat16 \
--optim adamw_torch \
--learning_rate 5e-5 \
--save_strategy no \
--logging_dir ./output/tmp/logs \
--logging_strategy steps \
--logging_steps 5
# --use_flash_attn True
;;

1)
echo "单机多卡 Baichuan2-13B GPU"
export CUDA_VISIBLE_DEVICES=0,1,2,3 #指定哪些cuda卡
export USE_TORCH_XLA=1 #动态图静态图
export TX8_NUM_DEVICES=4 #全模型大 -》多卡
export PJRT_DEVICE=TX8
export PJRT_LOCAL_WORLD_SIZE=4
# export TSM_DUMP_DATA=1 # dump tensors (overwrites existing dumps)
# export TX8_MODEL_EXPORT_LEVEL=10
# export XLA_HLO_DEBUG=1
export MASTER_ADDR=localhost
export MASTER_PORT=12355
python tx8/xla_spawn.py tx8/run_xla_train.py \
--model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
--dataset_name /login_home/zhangna/dataset/samsum \
--dataset_config_name default \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--trust_remote_code True \
--do_train \
--output_dir ./output \
--overwrite_output_dir \
--num_hidden_layers 1 \
--block_size 2048 \
--cache_dir ./output/tmp \
--torch_dtype bfloat16 \
--optim adamw_torch \
--learning_rate 5e-5 \
--save_strategy no \
--logging_dir ./output/tmp/logs \
--logging_strategy steps \
--logging_steps 5 \
--fsdp "full_shard" \
--fsdp_config ./tx8/fsdp_xla_config.json
# --use_flash_attn True
;;

2)
echo "单机多卡 Baichuan2-13B GPU"
export CUDA_VISIBLE_DEVICES=0,1,2,3 #指定哪些cuda卡
export USE_TORCH_XLA=1 #动态图静态图
export TX8_NUM_DEVICES=4 #全模型大 -》多卡
export PJRT_DEVICE=TX8
export PJRT_LOCAL_WORLD_SIZE=4
export TSM_DUMP_DATA=1 # dump tensors (overwrites existing dumps)
export TX8_MODEL_EXPORT_LEVEL=10
export MASTER_ADDR=localhost
export MASTER_PORT=12355
python tx8/xla_spawn.py tx8/run_xla_train.py \
--model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
--dataset_name /login_home/zhangna/dataset/samsum \
--dataset_config_name default \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--trust_remote_code True \
--do_train \
--output_dir ./output \
--overwrite_output_dir \
--num_hidden_layers 8 \
--block_size 2048 \
--cache_dir ./output/tmp \
--torch_dtype bfloat16 \
--optim adamw_torch \
--learning_rate 5e-5 \
--save_strategy no \
--logging_dir ./output/tmp/logs \
--logging_strategy steps \
--logging_steps 5 \
--fsdp "full_shard" \
--fsdp_config ./tx8/fsdp_xla_config.json
;;

3)
echo "单机单卡 Baichuan2-13B cuda"
export CUDA_VISIBLE_DEVICES=0 #指定哪些cuda卡
export USE_TORCH_XLA=1 #动态图静态图
export TX8_NUM_DEVICES=1 #全模型大 -》多卡
export PJRT_DEVICE=TX8
export PJRT_LOCAL_WORLD_SIZE=1
# export TSM_DUMP_DATA=1 # dump tensors (overwrites existing dumps)
# export TX8_MODEL_EXPORT_LEVEL=11
torchrun tx8/run_xla_train.py \
--model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
--dataset_name /login_home/zhangna/dataset/samsum \
--dataset_config_name default \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--trust_remote_code True \
--do_train \
--output_dir ./output \
--overwrite_output_dir \
--num_hidden_layers 1 \
--block_size 2048 \
--cache_dir ./output/tmp \
--torch_dtype bfloat16 \
--optim adamw_torch \
--learning_rate 5e-5 \
--save_strategy no \
--logging_dir ./output/tmp/logs \
--logging_strategy steps \
--logging_steps 5 \
--use_cuda
;;

esac
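
The script dispatches on its first positional argument. A minimal invocation sketch (assuming the hard-coded model and dataset paths above exist on the host):

# mode 0: single-device run; mode 1: 4-device XLA FSDP; mode 2: 4-device XLA FSDP
# with tensor dumping (TSM_DUMP_DATA=1); mode 3: CUDA baseline launched via torchrun
bash fineturning_xla.sh 1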
14 changes: 14 additions & 0 deletions training/tsingmicro/Baichuan2-13B/tx8/compare_distributed.py
@@ -0,0 +1,14 @@
import sys

# compare_distributed is provided by the tsprobe precision-comparison toolkit
from tsprobe.pytorch import *

def main():
    # usage: compare_distributed.py <npu_dump_path> <gpu_dump_path> <output_path>
    npu_dump_path = sys.argv[1]
    gpu_dump_path = sys.argv[2]
    output_path = sys.argv[3]
    print('npu_dump_path:', npu_dump_path)
    print('gpu_dump_path:', gpu_dump_path)
    print('dump_compare_output_path:', output_path)
    compare_distributed(npu_dump_path, gpu_dump_path, output_path)

if __name__ == "__main__":
    main()
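
The comparison tool takes three positional arguments: the NPU dump directory, the GPU dump directory, and an output directory. An invocation sketch (the dump paths below are placeholders; the real locations depend on where the TSM_DUMP_DATA runs write their output):

python tx8/compare_distributed.py ./data_dump/npu ./data_dump/gpu ./compare_output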
18 changes: 18 additions & 0 deletions training/tsingmicro/Baichuan2-13B/tx8/config_tensor.json
@@ -0,0 +1,18 @@
{
"task": "tensor",
"dump_path": "/login_home/zhangna/transformers-4.46.1_2048_577/examples/train/Baichuan2-13B/data_dump",
"rank": [],
"step": [],
"level": "L1",
"seed": 1234,
"is_deterministic": false,
"enable_dataloader": false,

"tensor": {
"scope": [],
"list":[],
"data_mode": ["all"],
"backward_input": "",
"file_format": "bin"
}
}
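
This file configures the tsprobe/msprobe tensor dump (task "tensor", all data for every rank and step, binary format). A sketch of enabling it, assuming run_xla_train.py accepts the --mstt_config_name_or_path flag that appears commented out in fineturning_xla.sh, and using the path where this PR adds the config:

python tx8/run_xla_train.py \
    --model_name_or_path /login_home/zhangna/Baichuan2-13B/baichuan-inc--Baichuan2-13B-Base \
    --dataset_name /login_home/zhangna/dataset/samsum \
    --mstt_config_name_or_path ./tx8/config_tensor.json \
    --do_train --output_dir ./output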
19 changes: 19 additions & 0 deletions training/tsingmicro/Baichuan2-13B/tx8/fsdp_xla_config.json
@@ -0,0 +1,19 @@
{
"fsdp_transformer_layer_cls_to_wrap": [
"BaichuanLayer"
],
"xla": true,
"xla_fsdp_settings": {
"compute_dtype": "bfloat16",
"shard_param_on_dim_0": false,
"pin_layout_in_collective_ops": false,
"_debug_dummy_reduce_scatter_op": false,
"flatten_parameters": true,
"param_init_cpu_device": true,
"optimization_barrier_in_forward": false,
"optimization_barrier_in_backward": false,
"reshard_after_forward": false,
"_shard_size_multiple": 1
},
"xla_fsdp_grad_ckpt": false
}
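
The Trainer forwards these settings to the torch_xla FSDP wrapper (bfloat16 compute, flattened parameters, CPU-side parameter init, no resharding after forward), wrapping each BaichuanLayer. A hedged sketch for memory-constrained runs, assuming xla_fsdp_grad_ckpt toggles gradient checkpointing in the HF XLA-FSDP integration:

# flip xla_fsdp_grad_ckpt on, then relaunch the 4-device FSDP mode
sed -i 's/"xla_fsdp_grad_ckpt": false/"xla_fsdp_grad_ckpt": true/' tx8/fsdp_xla_config.json
bash fineturning_xla.sh 1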
49 changes: 49 additions & 0 deletions training/tsingmicro/Baichuan2-13B/tx8/mstt_util.py
@@ -0,0 +1,49 @@
try:
    from msprobe.pytorch import PrecisionDebugger
    used_mstt = True
except ImportError:
    used_mstt = False
    print("warning: failed to import msprobe (mstt); precision dumping is disabled")

# The example below dumps forward/backward data for a designated code block.

"""Python
from transformers.utils import (
    PrecisionDebuggerINIT,
    PrecisionDebuggerMarkStep,
    PrecisionDebuggerBGN,
    PrecisionDebuggerEND,
)

# Do not place the PrecisionDebugger initialization inside loop code.
PrecisionDebuggerINIT(config_path="./config.json")

# Define and initialize the model, loss function, etc.

# Dataset iteration generally marks the point where training starts.
for data, label in data_loader:
    PrecisionDebuggerBGN()  # start the data dump

    # Logic executed on every training step:
    output = model(data)
    PrecisionDebuggerEND()  # insert after the start call: only forward/backward data
                            # between start and this call is dumped; data between this
                            # call and stop is not dumped
    # ...
    loss.backward()
    xm.mark_step()
    PrecisionDebuggerMarkStep()  # stop the data dump; must be called after mark_step()
"""
def PrecisionDebuggerBGN():
    if used_mstt:
        PrecisionDebugger.start()

def PrecisionDebuggerEND():
    if used_mstt:
        PrecisionDebugger.forward_backward_dump_end()

# Example config_path: '/workspace/SPMD_TX8_DEVELOP/transformer/config_tensor.json'
def PrecisionDebuggerINIT(config_path, task=None, dump_path=None, level=None, model=None, step=None):
    if used_mstt:
        return PrecisionDebugger(config_path=config_path, task=task, dump_path=dump_path,
                                 level=level, model=model, step=step)
    return None

def PrecisionDebuggerMarkStep():
    if used_mstt:
        PrecisionDebugger.stop()