-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhiddenSweep256.sh
executable file
·58 lines (47 loc) · 2.22 KB
/
hiddenSweep256.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/bin/bash
#SBATCH --job-name="benchmarks"
#SBATCH --partition=g40
#SBATCH --mem-per-cpu=16GB # Amount of CPU memory
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8 # Crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=6 # Number of cores per tasks
#SBATCH --hint=nomultithread # We get physical cores not logical
#SBATCH --gres=gpu:8 # Number of gpus
#SBATCH --output=%x_%j.out # Set this dir where you want slurm outs to go
#SBATCH --error=%x_%j.out # Set this dir where you want slurm outs to go
#SBATCH --exclusive # Turn off node sharing
#SBATCH --comment=stablegpt
# setup the environment using the script we created before
source /fsx/quentin/jacob/gpt-neox-stuff/setup.sh
export NCCL_DEBUG=WARN
export NCCL_TREE_THRESHOLD=0
export NCCL_PROTO=simple
# Network issues without the following two NCCL vars set; See https://github.com/NVIDIA/nccl/issues/676
export NCCL_IBEXT_DISABLE=1
export NCCL_SOCKET_IFNAME=^docker0,lo
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export PYTHONFAULTHANDLER=1
export OMPI_MCA_mtl_base_verbose=1
export OMPI_MCA_btl="^openib"
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
# Hide duplicated errors using this hack - will be properly fixed in pt-1.12
export TORCHELASTIC_ERROR_FILE=$TRAIN_PATH/tmp/torch-elastic-error.json
# Move to the gpt-neox install
TRAIN_PATH=/fsx/quentin/jacob/gpt-neox-stuff/GEMMs_project/transformer_sizing/experiments/scripts
cd $TRAIN_PATH
# Write the hostfile for this job
#/fsx/shiv/zphang/scripts/write_hostfile.sh
#export DLTS_HOSTFILE=/fsx/shiv/zphang/hostfiles/hosts_$SLURM_JOBID
bash /fsx/quentin/jacob/write_hostfile.sh
export DLTS_HOSTFILE=/fsx/quentin/jacob/hostfiles/hosts_$SLURM_JOBID
#export DLTS_HOSTFILE=/fsx/quentin/hostfiles/hosts_test_2
#sudo mkdir -p /home/quentin/.cache/torch_extensions
#sudo chmod -R 777 /home/quentin
python transformer_flops_256.py >> median_output128heads.txt