diff --git a/.gitignore b/.gitignore index cac3499524d..da681fc81b7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ build *~ slurm* logs +.vscode/ +apex/ diff --git a/examples/evaluate_zeroshot_2_6B_monolingual_eng-bpe_sp_32768_10_rotary.sh b/examples/evaluate_zeroshot_2_6B_monolingual_eng-bpe_sp_32768_10_rotary.sh new file mode 100755 index 00000000000..2d9838da05e --- /dev/null +++ b/examples/evaluate_zeroshot_2_6B_monolingual_eng-bpe_sp_32768_10_rotary.sh @@ -0,0 +1,45 @@ +#!/bin/bash + + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export GLOO_SOCKET_IFNAME=ib0 + +MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)" +MASTER_ADDR="${MASTER_ADDR}i" +MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')" +MASTER_PORT=6000 + + +DISTRIBUTED_ARGS="--nproc_per_node 2 \ + --nnodes 1 \ + --node_rank 0 \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d" + + +python -u -m torch.distributed.run $DISTRIBUTED_ARGS ./tasks/main.py \ + --task "LAMBADA" \ + --valid-data /p/project/opengptx-elm/thellmann1/opengpt_2023/data/bflm/lambada_test.jsonl \ + --strict-lambada \ + --tokenizer-type OpenGPTX-SPTokenizer \ + --tokenizer-model /p/scratch/opengptx-elm/data/datasources_opgptx/data_quality_experiments_datasets/ablations_studies/monolingual_en/70B_10/tokenizer_training/bpe/sp/32768_10/bpe_tokenizer.model \ + --load /p/scratch/opengptx-elm/ali5/opengpt/megatron-lm/2023-07-27_17-52-53/output_dir/2_6B_monolingual_eng-bpe_sp_32768_10_rotary.sh/checkpoints \ + --tensor-model-parallel-size 2 \ + --pipeline-model-parallel-size 1 \ + --no-position-embedding \ + --position-embedding-type rotary \ + --num-layers 32 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --micro-batch-size 5 \ + --global-batch-size 480 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --log-interval 10 \ + --bf16 diff --git a/examples/run_text_generation_server_2_6B.sh b/examples/run_text_generation_server_2_6B.sh new file mode 100755 index 00000000000..16c8ea8ea1d --- /dev/null +++ b/examples/run_text_generation_server_2_6B.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +set -x -e + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export NCCL_DEBUG=INFO +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=60234 + +export CMD=" \ + tools/run_text_generation_server.py \ + --load /p/scratch/opengptx-elm/ali5/opengpt/megatron-lm/2023-07-27_17-52-51/output_dir/2_6B_monolingual_eng-bpe_sp_32768_10_nope.sbatch/checkpoints/53100 \ + --tokenizer-model /p/scratch/opengptx-elm/data/datasources_opgptx/data_quality_experiments_datasets/ablations_studies/monolingual_en/70B_10/tokenizer_training/bpe/sp/32768_10/bpe_tokenizer.model \ + --tokenizer-type OpenGPTX-SPTokenizer \ + --pipeline-model-parallel-size 1 \ + --tensor-model-parallel-size 2 \ + --num-layers 32 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --max-position-embeddings 2048 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 2048 \ + --out-seq-length 2048 \ + --temperature 0.8 \ + --top_p 0.5 \ + --seed 42 \ + --position-embedding-type none \ + --no-position-embedding \ + --use-flash-attn \ + 
--reset-attention-mask \ + --reset-position-ids" + + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node 2 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +bash -c "$LAUNCHER $CMD" diff --git a/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_hf_32768_10_rotary_iter_0053100.sh b/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_hf_32768_10_rotary_iter_0053100.sh new file mode 100755 index 00000000000..ee147c49777 --- /dev/null +++ b/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_hf_32768_10_rotary_iter_0053100.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +set -x -e + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export NCCL_DEBUG=INFO +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=60234 +export MAX_JOBS=$SLURM_JOB_CPUS_PER_NODE + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-${(%):-%x}}" )" &> /dev/null && pwd ) + + +export CMD=" \ + $SCRIPT_DIR/../tools/run_text_generation_server.py \ + --load /beegfs/p_gptx/tokenizer_study/cp_2_6B_iter_0053100/checkpoints/2_6B_monolingual_eng-bpe_hf_32768_10_rotary.sh/checkpoints \ + --tokenizer-model /beegfs/p_gptx/tokenizer_study/2_6B_tokenizer_models/2_6B_monolingual_eng-bpe_hf_32768_10_rotary.sh/tokenizer/iter_0053100/tokenizer.json \ + --tokenizer-type OpenGPTX-HFTokenizer \ + --pipeline-model-parallel-size 1 \ + --tensor-model-parallel-size 2 \ + --num-layers 32 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --max-position-embeddings 2048 \ + --bf16 \ + --micro-batch-size 5 \ + --seq-length 2048 \ + --out-seq-length 2048 \ + --temperature 0.8 \ + --top_p 0.5 \ + --seed 42 \ + --position-embedding-type rotary \ + --no-position-embedding \ + " + +export LAUNCHER="python -u -m torch.distributed.launch \ + --nproc_per_node 2 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + + +bash -c "$LAUNCHER $CMD" + diff --git a/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_sp_100352_10_rotary_iter_0053100.sh b/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_sp_100352_10_rotary_iter_0053100.sh new file mode 100644 index 00000000000..5aa5894b5b7 --- /dev/null +++ b/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_sp_100352_10_rotary_iter_0053100.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -x -e + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export NCCL_DEBUG=INFO +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=60234 +export MAX_JOBS=$SLURM_JOB_CPUS_PER_NODE + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-${(%):-%x}}" )" &> /dev/null && pwd ) + + +export CMD=" \ + $SCRIPT_DIR/../tools/run_text_generation_server.py \ + --load /p/scratch/opengptx-elm/ali5/opengpt/megatron-lm/2023-07-31_16-09-58/output_dir/2_6B_monolingual_eng-bpe_hf_100352_10_rotary.sh/checkpoints \ + --tokenizer-model /p/scratch/opengptx-elm/data/datasources_opgptx/data_quality_experiments_datasets/ablations_studies/monolingual_en/70B_10/tokenizer_training/bpe/sp/100352_10/bpe_tokenizer.model \ + --tokenizer-type OpenGPTX-SPTokenizer \ + --pipeline-model-parallel-size 1 \ + --tensor-model-parallel-size 2 \ + 
--num-layers 32 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --max-position-embeddings 2048 \ + --bf16 \ + --micro-batch-size 5 \ + --seq-length 2048 \ + --out-seq-length 2048 \ + --temperature 0.8 \ + --top_p 0.5 \ + --seed 42 \ + --position-embedding-type rotary \ + --no-position-embedding \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node 2 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + + +bash -c "$LAUNCHER $CMD" diff --git a/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_sp_32768_10_nope_iter_0053100.sh b/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_sp_32768_10_nope_iter_0053100.sh new file mode 100644 index 00000000000..54e08ad475f --- /dev/null +++ b/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_sp_32768_10_nope_iter_0053100.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -x -e + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export NCCL_DEBUG=INFO +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=60234 +export MAX_JOBS=$SLURM_JOB_CPUS_PER_NODE + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-${(%):-%x}}" )" &> /dev/null && pwd ) + + +export CMD=" \ + $SCRIPT_DIR/../tools/run_text_generation_server.py \ + --load /p/scratch/opengptx-elm/ali5/opengpt/megatron-lm/2023-07-27_17-52-51/output_dir/2_6B_monolingual_eng-bpe_sp_32768_10_nope.sbatch/checkpoints \ + --tokenizer-model /p/scratch/opengptx-elm/data/datasources_opgptx/data_quality_experiments_datasets/ablations_studies/monolingual_en/70B_10/tokenizer_training/bpe/sp/32768_10/bpe_tokenizer.model \ + --tokenizer-type OpenGPTX-SPTokenizer \ + --pipeline-model-parallel-size 1 \ + --tensor-model-parallel-size 2 \ + --num-layers 32 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --max-position-embeddings 2048 \ + --bf16 \ + --micro-batch-size 5 \ + --seq-length 2048 \ + --out-seq-length 2048 \ + --temperature 0.8 \ + --top_p 0.5 \ + --seed 42 \ + --position-embedding-type none \ + --no-position-embedding \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node 2 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + + +bash -c "$LAUNCHER $CMD" diff --git a/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_sp_32768_10_rotary_iter_0053100.sh b/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_sp_32768_10_rotary_iter_0053100.sh new file mode 100755 index 00000000000..4738fe287d8 --- /dev/null +++ b/examples/run_text_generation_server_2_6B_monolingual_eng-bpe_sp_32768_10_rotary_iter_0053100.sh @@ -0,0 +1,81 @@ +#!/bin/bash + +set -x -e + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export NCCL_DEBUG=INFO +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=60234 +export MAX_JOBS=$SLURM_JOB_CPUS_PER_NODE + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-${(%):-%x}}" )" &> /dev/null && pwd ) + + +export CMD=" \ + $SCRIPT_DIR/../tools/run_text_generation_server.py \ + --load /p/scratch/opengptx-elm/ali5/opengpt/megatron-lm/2023-07-27_17-52-53/output_dir/2_6B_monolingual_eng-bpe_sp_32768_10_rotary.sh/checkpoints \ + 
--out-seq-length 2048 \ + --temperature 0.8 \ + --top_p 0.5 \ + --tensor-model-parallel-size 2 \ + --pipeline-model-parallel-size 1 \ + --max-tokens-to-oom=300000 \ + --num-layers 32 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --micro-batch-size 5 \ + --global-batch-size 480 \ + --train-samples 25_488_281 \ + --tokenizer-type OpenGPTX-SPTokenizer \ + --tokenizer-model /p/scratch/opengptx-elm/data/datasources_opgptx/data_quality_experiments_datasets/ablations_studies/monolingual_en/70B_10/tokenizer_training/bpe/sp/32768_10/bpe_tokenizer.model \ + --init-method-std 0.02 \ + --bf16 \ + --seed 42 \ + --no-position-embedding \ + --position-embedding-type rotary \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 1.6e-4 \ + --min-lr 1.6e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples 22_089_843 \ + --lr-warmup-samples 31_860 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --use-distributed-optimizer \ + --log-interval 100 \ + --log-memory-to-tensorboard \ + --log-world-size-to-tensorboard \ + --save-interval 3000 \ + --eval-interval 3000 \ + --eval-iters 1 \ + --tensorboard-dir /p/scratch/opengptx-elm/ali5/opengpt/megatron-lm/2023-07-27_17-52-53/output_dir/2_6B_monolingual_eng-bpe_sp_32768_10_rotary.sh/tensorboard \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + --num-workers 11 \ + --data-impl mmap \ + --distributed-backend nccl \ + --load /p/scratch/opengptx-elm/ali5/opengpt/megatron-lm/2023-07-27_17-52-53/output_dir/2_6B_monolingual_eng-bpe_sp_32768_10_rotary.sh/checkpoints \ +" + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node 2 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + + +bash -c "$LAUNCHER $CMD" diff --git a/examples/run_text_generation_server_2_6B_multilingual-bpe_sp_32768_10_rotary_iter_0053100.sh b/examples/run_text_generation_server_2_6B_multilingual-bpe_sp_32768_10_rotary_iter_0053100.sh new file mode 100644 index 00000000000..c7b5932c7a7 --- /dev/null +++ b/examples/run_text_generation_server_2_6B_multilingual-bpe_sp_32768_10_rotary_iter_0053100.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -x -e + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export NCCL_DEBUG=INFO +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=60234 +export MAX_JOBS=$SLURM_JOB_CPUS_PER_NODE + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-${(%):-%x}}" )" &> /dev/null && pwd ) + + +export CMD=" \ + $SCRIPT_DIR/../tools/run_text_generation_server.py \ + --load /p/scratch/opengptx-elm/ali5/opengpt/megatron-lm/2023-07-27_17-52-53/output_dir/2_6B_multilingual-bpe_sp_32768_10_rotary.sh/checkpoints \ + --tokenizer-model /p/scratch/opengptx-elm/data/datasources_opgptx/data_quality_experiments_datasets/ablations_studies/multilingual/70B_EQW_10/tokenizer_training/bpe/sp/32768_10/bpe_tokenizer.model \ + --tokenizer-type OpenGPTX-SPTokenizer \ + --pipeline-model-parallel-size 1 \ + --tensor-model-parallel-size 2 \ + --num-layers 32 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --max-position-embeddings 2048 \ + --bf16 \ + --micro-batch-size 5 \ + --seq-length 2048 \ + --out-seq-length 2048 \ + --temperature 0.8 \ + --top_p 0.5 \ + 
--seed 42 \ + --position-embedding-type rotary \ + --no-position-embedding \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node 2 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + + +bash -c "$LAUNCHER $CMD" diff --git a/examples/run_text_generation_server_345M.sh b/examples/run_text_generation_server_345M.sh index a151b984676..91ab0a1dccc 100755 --- a/examples/run_text_generation_server_345M.sh +++ b/examples/run_text_generation_server_345M.sh @@ -6,29 +6,105 @@ DISTRIBUTED_ARGS="--nproc_per_node 1 \ --master_addr localhost \ --master_port 6000" -CHECKPOINT= -VOCAB_FILE= -MERGE_FILE= export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export NCCL_DEBUG=INFO +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=60234 -pip install flask-restful +# pip install flask-restful -torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ - --tensor-model-parallel-size 1 \ +python tools/run_text_generation_server.py \ + --load /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-27_18-00-00/output_dir/340M_meglm_8105626.sbatch/checkpoints \ + --tokenizer-model /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-27_18-00-00/output_dir/340M_meglm_8105626.sbatch/converted_checkpoints/iter_0015000/tokenizer.model \ + --tokenizer-type OpenGPTX-SPTokenizer \ --pipeline-model-parallel-size 1 \ + --tensor-model-parallel-size 1 \ --num-layers 24 \ --hidden-size 1024 \ - --load ${CHECKPOINT} \ --num-attention-heads 16 \ - --max-position-embeddings 1024 \ - --tokenizer-type GPT2BPETokenizer \ - --fp16 \ + --max-position-embeddings 2048 \ + --bf16 \ --micro-batch-size 1 \ - --seq-length 1024 \ - --out-seq-length 1024 \ - --temperature 1.0 \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --top_p 0.9 \ - --seed 42 + --seq-length 2048 \ + --out-seq-length 2048 \ + --temperature 0.8 \ + --top_p 0.5 \ + --seed 42 \ + --no-position-embedding \ + --position-embedding-type rotary \ + --use-flash-attn \ + --reset-attention-mask \ + --reset-position-ids + + + +# --load /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-17_16-38-00/output_dir/340M_monolingual_en_sp_bpe_32768_10.sbatch/checkpoints \ +# --tokenizer-model /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-17_16-38-00/output_dir/340M_monolingual_en_sp_bpe_32768_10.sbatch/converted_checkpoints/iter_0001525/tokenizer.model \ +# --tokenizer-type SentencePieceTokenizer \ +# --pipeline-model-parallel-size 1 \ +# --tensor-model-parallel-size 1 \ +# --num-layers 24 \ +# --hidden-size 1024 \ +# --num-attention-heads 16 \ +# --max-position-embeddings 2048 \ +# --position-embedding-type alibi \ +# --no-position-embedding \ +# --bf16 \ +# --micro-batch-size 1 \ +# --seq-length 2048 \ +# --out-seq-length 2048 \ +# --temperature 0.8 \ +# --top_p 0.5 \ +# --seed 42 + +# --load /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-13_13-00-00/output_dir/2_6B_multilingual-unigram_sp_32768_10.sbatch/checkpoints \ +# 
--vocab-file='/p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-13_13-00-00/output_dir/2_6B_multilingual-unigram_sp_32768_10.sbatch/converted_checkpoints/iter_0009537/vocab.json' \ +# --merge-file='/p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-13_13-00-00/output_dir/2_6B_multilingual-unigram_sp_32768_10.sbatch/converted_checkpoints/iter_0009537/merges.txt' \ +# --tokenizer-type GPT2BPETokenizer \ +# --pipeline-model-parallel-size 2 \ +# --tensor-model-parallel-size 2 \ +# --num-layers 12 \ +# --hidden-size 768 \ +# --num-attention-heads 12 \ +# --max-position-embeddings 2048 \ +# --position-embedding-type alibi \ +# --bf16 \ +# --micro-batch-size 32 \ +# --seq-length 2048 \ +# --out-seq-length 2048 \ +# --temperature 0.8 \ +# --top_p 0.5 \ +# --seed 42 + + + +# --load /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-27_18-00-00/output_dir/340M_meglm_8105626.sbatch/checkpoints/iter_0015000 \ +# --tokenizer-model /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-27_18-00-00/output_dir/340M_meglm_8105626.sbatch/converted_checkpoints/iter_0015000/tokenizer.mode +# --tokenizer-type OpenGPTX-SPTokenizer \ +# --pipeline-model-parallel-size 1 \ +# --tensor-model-parallel-size 1 \ +# --num-layers 24 \ +# --hidden-size 1024 \ +# --num-attention-heads 16 \ +# --max-position-embeddings 2048 \ +# --bf16 \ +# --micro-batch-size 1 \ +# --seq-length 2048 \ +# --out-seq-length 2048 \ +# --temperature 0.8 \ +# --top_p 0.5 \ +# --seed 42 \ +# --distributed-backend nccl +# --position-embedding-type rotary \ +# --use-flash-attn \ +# --reset-attention-mask \ +# --reset-position-ids \ +# --no-position-embedding \ + diff --git a/examples/run_text_generation_server_345M_2.sh b/examples/run_text_generation_server_345M_2.sh new file mode 100644 index 00000000000..2d893376b4a --- /dev/null +++ b/examples/run_text_generation_server_345M_2.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# This example will start serving the 345M model. 
+DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export NCCL_DEBUG=INFO +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=60234 + +# pip install flask-restful + +python tools/run_text_generation_server.py \ + --load /p/scratch/opengptx-elm/ali5/opengpt/megatron-lm/output_dir/8181788/checkpoints \ + --tokenizer-model /p/scratch/opengptx-elm/data/datasources_opgptx/data_quality_experiments_datasets/ablations_studies/monolingual_en/70B_10/tokenizer_training/bpe/sp/32768_10/bpe_tokenizer.model \ + --tokenizer-type OpenGPTX-SPTokenizer \ + --pipeline-model-parallel-size 1 \ + --tensor-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --max-position-embeddings 2048 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 2048 \ + --out-seq-length 2048 \ + --temperature 0.8 \ + --top_p 0.5 \ + --seed 42 \ + --no-position-embedding \ + --position-embedding-type rotary \ + --reset-attention-mask \ + --reset-position-ids + + + +# --load /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-17_16-38-00/output_dir/340M_monolingual_en_sp_bpe_32768_10.sbatch/checkpoints \ +# --tokenizer-model /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-17_16-38-00/output_dir/340M_monolingual_en_sp_bpe_32768_10.sbatch/converted_checkpoints/iter_0001525/tokenizer.model \ +# --tokenizer-type SentencePieceTokenizer \ +# --pipeline-model-parallel-size 1 \ +# --tensor-model-parallel-size 1 \ +# --num-layers 24 \ +# --hidden-size 1024 \ +# --num-attention-heads 16 \ +# --max-position-embeddings 2048 \ +# --position-embedding-type alibi \ +# --no-position-embedding \ +# --bf16 \ +# --micro-batch-size 1 \ +# --seq-length 2048 \ +# --out-seq-length 2048 \ +# --temperature 0.8 \ +# --top_p 0.5 \ +# --seed 42 + +# --load /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-13_13-00-00/output_dir/2_6B_multilingual-unigram_sp_32768_10.sbatch/checkpoints \ +# --vocab-file='/p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-13_13-00-00/output_dir/2_6B_multilingual-unigram_sp_32768_10.sbatch/converted_checkpoints/iter_0009537/vocab.json' \ +# --merge-file='/p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-13_13-00-00/output_dir/2_6B_multilingual-unigram_sp_32768_10.sbatch/converted_checkpoints/iter_0009537/merges.txt' \ +# --tokenizer-type GPT2BPETokenizer \ +# --pipeline-model-parallel-size 2 \ +# --tensor-model-parallel-size 2 \ +# --num-layers 12 \ +# --hidden-size 768 \ +# --num-attention-heads 12 \ +# --max-position-embeddings 2048 \ +# --position-embedding-type alibi \ +# --bf16 \ +# --micro-batch-size 32 \ +# --seq-length 2048 \ +# --out-seq-length 2048 \ +# --temperature 0.8 \ +# --top_p 0.5 \ +# --seed 42 + + + +# --load /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-27_18-00-00/output_dir/340M_meglm_8105626.sbatch/checkpoints/iter_0015000 \ +# --tokenizer-model 
/p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-27_18-00-00/output_dir/340M_meglm_8105626.sbatch/converted_checkpoints/iter_0015000/tokenizer.mode +# --tokenizer-type OpenGPTX-SPTokenizer \ +# --pipeline-model-parallel-size 1 \ +# --tensor-model-parallel-size 1 \ +# --num-layers 24 \ +# --hidden-size 1024 \ +# --num-attention-heads 16 \ +# --max-position-embeddings 2048 \ +# --bf16 \ +# --micro-batch-size 1 \ +# --seq-length 2048 \ +# --out-seq-length 2048 \ +# --temperature 0.8 \ +# --top_p 0.5 \ +# --seed 42 \ +# --distributed-backend nccl +# --position-embedding-type rotary \ +# --use-flash-attn \ +# --reset-attention-mask \ +# --reset-position-ids \ +# --no-position-embedding \ + diff --git a/examples/run_text_generation_server_gpt2_345M.sh b/examples/run_text_generation_server_gpt2_345M.sh new file mode 100755 index 00000000000..eed65edf612 --- /dev/null +++ b/examples/run_text_generation_server_gpt2_345M.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -x -e + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export UCX_RC_TIMEOUT=4s +export NCCL_DEBUG=INFO +export MAX_JOBS=$SLURM_JOB_CPUS_PER_NODE + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-${(%):-%x}}" )" &> /dev/null && pwd ) + + +export CMD=" \ + $SCRIPT_DIR/../tools/run_text_generation_server.py \ + --load /p/project/opengptx-elm/thellmann1/opengpt_2023/gpt345M \ + --vocab-file /p/project/opengptx-elm/thellmann1/opengpt_2023/gpt345M/gpt2-vocab.json \ + --merge-file /p/project/opengptx-elm/thellmann1/opengpt_2023/gpt345M/gpt2-merges.txt \ + --tokenizer-type GPT2BPETokenizer \ + --tensor-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --max-position-embeddings 1024 \ + --fp16 \ + --micro-batch-size 8 \ + --seq-length 1024 \ + --max-tokens-to-oom 300000 \ + --out-seq-length 1024 \ + --temperature 0.8 \ + --top_p 0.5 + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + + +bash -c "$LAUNCHER $CMD" diff --git a/examples/run_text_generation_server_iter_0001525.sh b/examples/run_text_generation_server_iter_0001525.sh new file mode 100755 index 00000000000..0acc7ca05ed --- /dev/null +++ b/examples/run_text_generation_server_iter_0001525.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +set -x -e + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json +export NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_IB_TIMEOUT=50 +export UCX_RC_TIMEOUT=4s +export NCCL_IB_RETRY_CNT=10 +export NCCL_SOCKET_IFNAME=ib0 +export NCCL_DEBUG=INFO +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=60234 + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-${(%):-%x}}" )" &> /dev/null && pwd ) + + +export CMD=" \ + $SCRIPT_DIR/../tools/run_text_generation_server.py \ + --load /p/project/opengptx-elm/thellmann1/workdir/checkpoint_conversion_meglm_test/meglm/2023-07-17_16-38-00/output_dir/340M_monolingual_en_sp_bpe_32768_10.sbatch/checkpoints \ + --tokenizer-model /p/scratch/opengptx-elm/data/datasources_opgptx/data_quality_experiments_datasets/ablations_studies/monolingual_en/70B_10/tokenizer_training/bpe/sp/32768_10/bpe_tokenizer.model \ + --tokenizer-type SentencePieceTokenizer \ + --pipeline-model-parallel-size 1 \ + --tensor-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + 
--max-position-embeddings 2048 \ + --bf16 \ + --micro-batch-size 5 \ + --seq-length 2048 \ + --out-seq-length 2048 \ + --temperature 0.8 \ + --top_p 0.5 \ + --seed 42 \ + --position-embedding-type alibi \ + --no-position-embedding \ + " + + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr localhost \ + --master_port 6000" + +bash -c "$LAUNCHER $CMD" diff --git a/megatron/arguments.py b/megatron/arguments.py index 51cc8f83b2a..969c4d273ee 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -509,7 +509,10 @@ def _add_inference_args(parser): choices=["megatron", "huggingface"], help='Select either Megatron or Huggingface as the ' 'Bert embedder.') - + group.add_argument('--pad-to-seq-length', action='store_true', default=False, + help='Always add to max seq length supported by model and' + 'not max seq length found in a batch.') + return parser diff --git a/megatron/text_generation/api.py b/megatron/text_generation/api.py index 090b630a5f3..492e3f804c4 100644 --- a/megatron/text_generation/api.py +++ b/megatron/text_generation/api.py @@ -6,35 +6,39 @@ import torch from megatron.core import mpu + from .communication import broadcast_float_list from .generation import ( - generate_tokens_probs_and_return_on_first_stage, - score_and_return_on_first_stage, - beam_search_and_return_on_first_stage) -from .tokenization import ( - tokenize_prompts, - detokenize_generations) - -def generate_and_post_process(model, - prompts=None, - tokens_to_generate=0, - return_output_log_probs=False, - top_k_sampling=0, - top_p_sampling=0.0, - top_p_decay=0.0, - top_p_bound=0.0, - temperature=1.0, - add_BOS=False, - use_eod_token_for_early_termination=True, - stop_on_double_eol=False, - stop_on_eol=False, - prevent_newline_after_colon=False, - random_seed=-1): + beam_search_and_return_on_first_stage, + generate_tokens_probs_and_return_on_first_stage, + score_and_return_on_first_stage, +) +from .tokenization import detokenize_generations, tokenize_prompts + + +def generate_and_post_process( + model, + prompts=None, + tokens_to_generate=0, + return_output_log_probs=False, + top_k_sampling=0, + top_p_sampling=0.0, + top_p_decay=0.0, + top_p_bound=0.0, + temperature=1.0, + add_BOS=False, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + prevent_newline_after_colon=False, + random_seed=-1, + return_is_max_logprobs=False, +): """Run inference and post-process outputs, i.e., detokenize, move to cpu and convert to list.""" # Main inference. - tokens, lengths, output_log_probs = generate( + generate_result = generate( model, prompts=prompts, tokens_to_generate=tokens_to_generate, @@ -49,55 +53,93 @@ def generate_and_post_process(model, stop_on_double_eol=stop_on_double_eol, stop_on_eol=stop_on_eol, prevent_newline_after_colon=prevent_newline_after_colon, - random_seed=random_seed) + random_seed=random_seed, + return_is_max_logprobs=return_is_max_logprobs, + ) + + if return_is_max_logprobs: + tokens, lengths, output_log_probs, is_max_logprobs = generate_result + else: + tokens, lengths, output_log_probs = generate_result # Only post-process on first stage. 
if mpu.is_pipeline_first_stage(): - tokens, prompts_plus_generations, prompts_plus_generations_segments = \ - detokenize_generations(tokens, lengths, True) + ( + tokens, + prompts_plus_generations, + prompts_plus_generations_segments, + ) = detokenize_generations(tokens, lengths, True) if return_output_log_probs: output_log_probs = output_log_probs.cpu().numpy().tolist() - for i, (prob, seg) in enumerate(zip(output_log_probs, prompts_plus_generations_segments)): - output_log_probs[i] = prob[:len(seg)-1] + for i, (prob, seg) in enumerate( + zip(output_log_probs, prompts_plus_generations_segments) + ): + output_log_probs[i] = prob[: len(seg) - 1] + + # append is_max_logprobs to the result if return_is_max_logprobs is True + result = ( + prompts_plus_generations, + prompts_plus_generations_segments, + output_log_probs, + tokens, + ) + if return_is_max_logprobs: + is_max_logprobs = is_max_logprobs.tolist() + lengths = lengths.tolist() + + for i, length in enumerate(lengths): + is_max_logprobs[i] = is_max_logprobs[i][: length - 1] - return prompts_plus_generations, prompts_plus_generations_segments, \ - output_log_probs, tokens + result = result + (is_max_logprobs,) + return result return None -def generate(model, - prompts=None, - tokens_to_generate=0, - return_output_log_probs=False, - top_k_sampling=0, - top_p_sampling=0.0, - top_p_decay=0.0, - top_p_bound=0.0, - temperature=1.0, - add_BOS=False, - use_eod_token_for_early_termination=True, - stop_on_double_eol=False, - stop_on_eol=False, - prevent_newline_after_colon=False, - random_seed=-1): + +def generate( + model, + prompts=None, + tokens_to_generate=0, + return_output_log_probs=False, + top_k_sampling=0, + top_p_sampling=0.0, + top_p_decay=0.0, + top_p_bound=0.0, + temperature=1.0, + add_BOS=False, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + prevent_newline_after_colon=False, + random_seed=-1, + return_is_max_logprobs=False, +): """Given prompts and input parameters, run inference and return: - tokens: prompts plus the generated tokens. - lengths: length of the prompt + generations. Note that we can - discard tokens in the tokens tensor that are after the - corresponding length. - output_log_probs: log probs of the tokens. + tokens: prompts plus the generated tokens. + lengths: length of the prompt + generations. Note that we can + discard tokens in the tokens tensor that are after the + corresponding length. + output_log_probs: log probs of the tokens. """ # Make sure input params are avaialble to all ranks.
- values = [tokens_to_generate, - return_output_log_probs, - top_k_sampling, top_p_sampling, top_p_decay, top_p_bound, - temperature, add_BOS, use_eod_token_for_early_termination, - stop_on_double_eol, - stop_on_eol, - prevent_newline_after_colon, - random_seed] + values = [ + tokens_to_generate, + return_output_log_probs, + top_k_sampling, + top_p_sampling, + top_p_decay, + top_p_bound, + temperature, + add_BOS, + use_eod_token_for_early_termination, + stop_on_double_eol, + stop_on_eol, + prevent_newline_after_colon, + random_seed, + return_is_max_logprobs, + ] values_float_tensor = broadcast_float_list(len(values), float_list=values) tokens_to_generate = int(values_float_tensor[0].item()) return_output_log_probs = bool(values_float_tensor[1].item()) @@ -112,6 +154,7 @@ def generate(model, stop_on_eol = bool(values_float_tensor[10].item()) prevent_newline_after_colon = bool(values_float_tensor[11].item()) random_seed = int(values_float_tensor[12].item()) + return_is_max_logprobs = bool(values_float_tensor[13].item()) if random_seed != -1: torch.random.manual_seed(random_seed) @@ -120,18 +163,32 @@ def generate(model, # Note that these tensors are broadcaseted to all ranks. if torch.distributed.get_rank() == 0: assert prompts is not None - + context_tokens_tensor, context_length_tensor = tokenize_prompts( - prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) + prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS + ) + # If nothing is generated, i.e. for log-likelihood requests (only the logprobs are of interest there) if tokens_to_generate == 0: return score_and_return_on_first_stage( - model, context_tokens_tensor, context_length_tensor) - + model, + context_tokens_tensor, + context_length_tensor, + return_is_max_logprobs=return_is_max_logprobs, + ) + + if return_is_max_logprobs: + raise NotImplementedError( + "return_is_max_logprobs only implemented for tokens_to_generate == 0" + ) + # Main inference function. # Note that the outputs are available on the first stage. + # Besides the logprobs, this also returns what was generated return generate_tokens_probs_and_return_on_first_stage( - model, context_tokens_tensor, context_length_tensor, + model, + context_tokens_tensor, + context_length_tensor, return_output_log_probs=return_output_log_probs, top_k=top_k_sampling, top_p=top_p_sampling, @@ -141,48 +198,78 @@ def generate(model, use_eod_token_for_early_termination=use_eod_token_for_early_termination, stop_on_double_eol=stop_on_double_eol, stop_on_eol=stop_on_eol, - prevent_newline_after_colon=prevent_newline_after_colon) - -def beam_search_and_post_process(model, - prompts=None, - tokens_to_generate=0, - beam_size=0, - add_BOS=False, - stop_token=50256, - num_return_gen=1, - length_penalty=1, - prevent_newline_after_colon=False): + prevent_newline_after_colon=prevent_newline_after_colon, + ) + + +def beam_search_and_post_process( + model, + prompts=None, + tokens_to_generate=0, + beam_size=0, + add_BOS=False, + stop_token=50256, + num_return_gen=1, + length_penalty=1, + prevent_newline_after_colon=False, +): """Run beam search and post-process outputs, i.e., detokenize, move to cpu and convert to list.""" # Main inference.
- tokens, scores = beam_search(model, - prompts=prompts, - tokens_to_generate=tokens_to_generate, - beam_size=beam_size, - add_BOS=add_BOS, - stop_token=stop_token, - num_return_gen=num_return_gen, - length_penalty=length_penalty, - prevent_newline_after_colon=prevent_newline_after_colon) + tokens, scores = beam_search( + model, + prompts=prompts, + tokens_to_generate=tokens_to_generate, + beam_size=beam_size, + add_BOS=add_BOS, + stop_token=stop_token, + num_return_gen=num_return_gen, + length_penalty=length_penalty, + prevent_newline_after_colon=prevent_newline_after_colon, + ) # Only post-process on first stage. if mpu.is_pipeline_first_stage(): - lengths = tokens.size(1)*torch.ones(beam_size, dtype=torch.int64, device=torch.cuda.current_device()) - tokens, prompts_plus_generations, prompts_plus_generations_segments = detokenize_generations(tokens, lengths, True) + lengths = tokens.size(1) * torch.ones( + beam_size, dtype=torch.int64, device=torch.cuda.current_device() + ) + ( + tokens, + prompts_plus_generations, + prompts_plus_generations_segments, + ) = detokenize_generations(tokens, lengths, True) scores = scores.cpu().numpy().tolist() - return prompts_plus_generations, prompts_plus_generations_segments, scores + return ( + prompts_plus_generations, + prompts_plus_generations_segments, + scores, + tokens, + ) return None -def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS=False, stop_token=50256, num_return_gen=1, length_penalty=1, prevent_newline_after_colon=False): + +def beam_search( + model, + prompts=None, + tokens_to_generate=0, + beam_size=0, + add_BOS=False, + stop_token=50256, + num_return_gen=1, + length_penalty=1, + prevent_newline_after_colon=False, +): # Make sure input params are avaialble to all ranks. 
- values = [tokens_to_generate, - beam_size, - add_BOS, - stop_token, - num_return_gen, - length_penalty, - prevent_newline_after_colon] + values = [ + tokens_to_generate, + beam_size, + add_BOS, + stop_token, + num_return_gen, + length_penalty, + prevent_newline_after_colon, + ] values_float_tensor = broadcast_float_list(len(values), float_list=values) tokens_to_generate = int(values_float_tensor[0].item()) beam_size = int(values_float_tensor[1].item()) @@ -193,8 +280,16 @@ def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS= prevent_newline_after_colon = values_float_tensor[6].item() context_tokens_tensor, context_length_tensor = tokenize_prompts( - prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) - - return beam_search_and_return_on_first_stage(model, context_tokens_tensor, context_length_tensor, - beam_size, stop_token=stop_token, num_return_gen=num_return_gen, length_penalty=length_penalty, - prevent_newline_after_colon=prevent_newline_after_colon) + prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS + ) + + return beam_search_and_return_on_first_stage( + model, + context_tokens_tensor, + context_length_tensor, + beam_size, + stop_token=stop_token, + num_return_gen=num_return_gen, + length_penalty=length_penalty, + prevent_newline_after_colon=prevent_newline_after_colon, + ) diff --git a/megatron/text_generation/generation.py b/megatron/text_generation/generation.py index 098706ee6d1..69bc7da279e 100644 --- a/megatron/text_generation/generation.py +++ b/megatron/text_generation/generation.py @@ -2,21 +2,28 @@ """Generation utilities.""" +import os + import torch import torch.nn.functional as F -from megatron import get_args, get_tokenizer +from megatron import get_args, get_tokenizer, print_rank_0 from megatron.core import mpu from megatron.utils import get_ltor_masks_and_position_ids from .communication import ( copy_from_last_to_first_pipeline_stage, broadcast_from_last_pipeline_stage, - broadcast_from_last_to_first_pipeline_stage) + broadcast_from_last_to_first_pipeline_stage, +) from .forward_step import ForwardStep from .sampling import sample from .beam_utils import BeamHypotheses -def score_and_return_on_first_stage(model, tokens, lengths): +output_done = False + +def score_and_return_on_first_stage( + model, tokens, lengths, return_is_max_logprobs=False +): """Function for just scoring. Arguments: model: no interleaving is supported. @@ -24,24 +31,33 @@ def score_and_return_on_first_stage(model, tokens, lengths): lengths: original prompt length, size: [b] Note: Outside of model, other parameters only need to be available on rank 0. - Outputs: + Outputs: output_log_probs: log probability of the selected tokens. size: [b, s] """ args = get_args() batch_size = tokens.size(0) + seq_length = tokens.size(1) max_prompt_length = lengths.max().item() - assert max_prompt_length == tokens.size(1) - - if max_prompt_length > args.max_position_embeddings: + if args.pad_to_seq_length: + assert seq_length == args.seq_length + else: + assert seq_length == max_prompt_length + + if seq_length > args.max_position_embeddings: raise ValueError("Length of prompt + tokens_to_generate longer than allowed") - - if max_prompt_length * batch_size > args.max_tokens_to_oom: - raise ValueError("Too many tokens. " + str(max_prompt_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom)) + + if seq_length * batch_size > args.max_tokens_to_oom: + raise ValueError( + "Too many tokens. 
" + + str(seq_length * batch_size) + + " is greater than " + + str(args.max_tokens_to_oom) + ) # forward step. - forward_step = ForwardStep(model, batch_size, max_prompt_length) + forward_step = ForwardStep(model, batch_size, seq_length) # =================== # Pre-allocate memory @@ -49,52 +65,90 @@ def score_and_return_on_first_stage(model, tokens, lengths): # Log probability of the sequence (prompt + generated tokens). output_log_probs = None - output_log_probs_size = (batch_size, max_prompt_length - 1) - + is_max_logprobs = None + + output_log_probs_size = (batch_size, seq_length - 1) + is_max_logprobs_size = (batch_size, seq_length - 1) + if mpu.is_pipeline_last_stage(): - output_log_probs = torch.empty(output_log_probs_size, - dtype=torch.float32, - device=torch.cuda.current_device()) - + output_log_probs = torch.empty( + output_log_probs_size, + dtype=torch.float32, + device=torch.cuda.current_device(), + ) + + is_max_logprobs = torch.empty( + is_max_logprobs_size, + dtype=torch.bool, + device=torch.cuda.current_device() + ) + # ============= # Run infernece # ============= with torch.no_grad(): attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens) - + # logits will be meanigful only in the last pipeline stage. logits = forward_step(tokens, position_ids, attention_mask) if mpu.is_pipeline_last_stage(): # Always the last stage should have an output. assert logits is not None + + global output_done + if os.getenv("MEGATRON_OUTPUT_FIRST_LOGITS") and not output_done: + print_rank_0("TOKENS:") + print_rank_0(tokens.tolist()[0]) + print_rank_0("-------") + print_rank_0("LOGITS:") + print_rank_0(logits.tolist()[0]) + print_rank_0("#######") + output_done = True + log_probs = F.log_softmax(logits, dim=2) - + # Pick the tokens that we need to get the log # probabilities for. Note that next input token is # the token which we selected in the current logits, # so shift by 1. indices = torch.unsqueeze(tokens[:, 1:], 2) output_log_probs = torch.gather(log_probs, 2, indices).squeeze(2) - + max_logprobs = torch.max(log_probs, 2).values + is_max_logprobs = torch.eq(output_log_probs, max_logprobs[:, :-1]) + # ====================================== # Broadcast to the first pipeline stage. # ====================================== output_log_probs = broadcast_from_last_to_first_pipeline_stage( - output_log_probs_size, torch.float32, output_log_probs) + output_log_probs_size, torch.float32, output_log_probs + ) + + if return_is_max_logprobs: + is_max_logprobs = broadcast_from_last_to_first_pipeline_stage( + is_max_logprobs_size, torch.bool, is_max_logprobs + ) + return tokens, lengths, output_log_probs, is_max_logprobs return tokens, lengths, output_log_probs + def generate_tokens_probs_and_return_on_first_stage( - model, tokens, lengths, - return_output_log_probs=False, - top_k=0, top_p=0.0, top_p_decay=0.0, top_p_bound=0.0, - temperature=1.0, - use_eod_token_for_early_termination=True, - stop_on_double_eol=False, - stop_on_eol=False, - prevent_newline_after_colon=True - ): + model, + tokens, + lengths, + return_output_log_probs=False, + top_k=0, + top_p=0.0, + top_p_decay=0.0, + top_p_bound=0.0, + temperature=1.0, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + prevent_newline_after_colon=True, + return_is_max_logprob=False, +): """Main token generation function. Arguments: model: no interleaving is supported. 
@@ -131,16 +185,21 @@ def generate_tokens_probs_and_return_on_first_stage( if max_sequence_length > args.max_position_embeddings: raise ValueError("Length of prompt + tokens_to_generate longer than allowed") - + if max_sequence_length * batch_size > args.max_tokens_to_oom: - raise ValueError("Too many tokens. " + str(max_sequence_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom)) + raise ValueError( + "Too many tokens. " + + str(max_sequence_length * batch_size) + + " is greater than " + + str(args.max_tokens_to_oom) + ) # forward step. forward_step = ForwardStep(model, batch_size, max_sequence_length) # Added termination_id to support the case that we want to terminate the # generation once that id is generated. - if hasattr(args, 'eos_id'): + if hasattr(args, "eos_id"): termination_id = args.eos_id else: termination_id = tokenizer.eod @@ -156,49 +215,70 @@ generated_sequence_lengths = None if mpu.is_pipeline_last_stage(): if return_output_log_probs: - output_log_probs = torch.empty(output_log_probs_size, - dtype=torch.float32, - device=torch.cuda.current_device()) - generated_sequence_lengths = torch.ones( - batch_size, dtype=torch.int64, - device=torch.cuda.current_device()) * max_sequence_length - + output_log_probs = torch.empty( + output_log_probs_size, + dtype=torch.float32, + device=torch.cuda.current_device(), + ) + generated_sequence_lengths = ( + torch.ones( + batch_size, dtype=torch.int64, device=torch.cuda.current_device() + ) + * max_sequence_length + ) + # Whether we have reached a termination id. - is_generation_done = torch.zeros(batch_size, dtype=torch.uint8, - device=torch.cuda.current_device()) + is_generation_done = torch.zeros( + batch_size, dtype=torch.uint8, device=torch.cuda.current_device() + ) # ============= # Run infernece # ============= with torch.no_grad(): - attention_mask, position_ids = _build_attention_mask_and_position_ids( - tokens) + attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens) prev_context_length = 0 for context_length in range(min_prompt_length, max_sequence_length): - # Pick the slice that we need to pass through the network. tokens2use = tokens[:, prev_context_length:context_length] positions2use = position_ids[:, prev_context_length:context_length] attention_mask2use = attention_mask[ - ..., prev_context_length:context_length, :context_length] + ..., prev_context_length:context_length, :context_length + ] # logits will be meanigful only in the last pipeline stage. logits = forward_step(tokens2use, positions2use, attention_mask2use) if mpu.is_pipeline_last_stage(): + global output_done + if os.getenv("MEGATRON_OUTPUT_FIRST_LOGITS") and not output_done: + print_rank_0("TOKENS:") + print_rank_0(tokens2use.tolist()[0]) + print_rank_0("-------") + print_rank_0("LOGITS:") + print_rank_0(logits.tolist()[0]) + print_rank_0("#######") + output_done = True + if prevent_newline_after_colon: - logits[tokens2use[:, -1] == tokenizer.tokenize(':')[0], -1, tokenizer.tokenize('\n')[0]] = -1e10 # disable "\n" after ":" + logits[ + tokens2use[:, -1] == tokenizer.tokenize(":")[0], + -1, + tokenizer.tokenize("\n")[0], + ] = -1e10 # disable "\n" after ":" # Always the last stage should have an output. assert logits is not None # Sample.
last_token_logits = logits[:, -1, :] - new_sample = sample(last_token_logits, - top_k=top_k, - top_p=top_p, - temperature=temperature, - vocab_size=tokenizer.vocab_size) + new_sample = sample( + last_token_logits, + top_k=top_k, + top_p=top_p, + temperature=temperature, + vocab_size=tokenizer.vocab_size, + ) if top_p > 0.0 and top_p_decay > 0.0: top_p = top_p * top_p_decay if top_p_bound > 0.0: @@ -219,18 +299,18 @@ def generate_tokens_probs_and_return_on_first_stage( # the token which we selected in the current logits, # so shift by 1. indices = torch.unsqueeze( - tokens[ - :, - (prev_context_length + 1):(context_length + 1)], - 2) - output_log_probs[:, - prev_context_length:context_length] = \ - torch.gather(log_probs, 2, indices).squeeze(2) + tokens[:, (prev_context_length + 1) : (context_length + 1)], + 2, + ) + output_log_probs[ + :, prev_context_length:context_length + ] = torch.gather(log_probs, 2, indices).squeeze(2) # Update the tokens on the first stage so the next input to # the network is correct. - copy_from_last_to_first_pipeline_stage(batch_size, torch.int64, - tokens[:, context_length]) + copy_from_last_to_first_pipeline_stage( + batch_size, torch.int64, tokens[:, context_length] + ) # Update the context length for the next token generation. prev_context_length = context_length @@ -242,31 +322,32 @@ def generate_tokens_probs_and_return_on_first_stage( # instead tokenization should be in the inference loop so stop sequences can be used if stop_on_double_eol: hit_double_eol = (new_sample == 628).byte() & started.byte() - hit_two_eols = (new_sample == 198).byte() & (tokens[:, context_length-1] == 198).byte() & started.byte() + hit_two_eols = ( + (new_sample == 198).byte() + & (tokens[:, context_length - 1] == 198).byte() + & started.byte() + ) done_token = hit_double_eol | hit_two_eols elif stop_on_eol: hit_double_eol = (new_sample == 628).byte() & started.byte() hit_eol = (new_sample == 198).byte() & started.byte() done_token = hit_double_eol | hit_eol - else: - done_token = (new_sample == termination_id).byte() & \ - started.byte() - + else: + done_token = (new_sample == termination_id).byte() & started.byte() + just_finished = (done_token & ~is_generation_done).bool() - generated_sequence_lengths[just_finished.view(-1)] = \ - context_length + 1 + generated_sequence_lengths[just_finished.view(-1)] = context_length + 1 is_generation_done = is_generation_done | done_token done = torch.all(is_generation_done) - done = broadcast_from_last_pipeline_stage(1, torch.uint8, - tensor=done) + done = broadcast_from_last_pipeline_stage(1, torch.uint8, tensor=done) if use_eod_token_for_early_termination and done: break - + # =================================================== # Update the length of based on max generated length. 
# =================================================== - tokens = tokens[:, :(context_length + 1)] + tokens = tokens[:, : (context_length + 1)] if mpu.is_pipeline_last_stage(): if return_output_log_probs: output_log_probs = output_log_probs[:, :context_length] @@ -276,24 +357,36 @@ def generate_tokens_probs_and_return_on_first_stage( # ====================================== generated_sequence_lengths = broadcast_from_last_to_first_pipeline_stage( - batch_size, torch.int64, generated_sequence_lengths) + batch_size, torch.int64, generated_sequence_lengths + ) if return_output_log_probs: output_log_probs_size = (batch_size, context_length) output_log_probs = broadcast_from_last_to_first_pipeline_stage( - output_log_probs_size, torch.float32, output_log_probs) + output_log_probs_size, torch.float32, output_log_probs + ) return tokens, generated_sequence_lengths, output_log_probs -def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, stop_token, num_return_gen, length_penalty, prevent_newline_after_colon=True): + +def beam_search_and_return_on_first_stage( + model, + tokens, + lengths, + beam_size, + stop_token, + num_return_gen, + length_penalty, + prevent_newline_after_colon=True, +): args = get_args() tokenizer = get_tokenizer() batch_size = tokens.size(0) - assert(batch_size == 1) + assert batch_size == 1 prompt_length = lengths.item() final_sequence_length = tokens.size(1) final_sequence_length = min(final_sequence_length, args.max_position_embeddings) - + # If the context is too big, this happens if prompt_length >= final_sequence_length: raise ValueError("context length + tokens_to_generate too large") @@ -304,9 +397,9 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto beam_hyp = BeamHypotheses(beam_size, length_penalty) best_batches = None done = torch.zeros(1, dtype=torch.uint8, device=torch.cuda.current_device()) - scores = torch.zeros(beam_size, - dtype=torch.float32, - device=torch.cuda.current_device()).unsqueeze(1) + scores = torch.zeros( + beam_size, dtype=torch.float32, device=torch.cuda.current_device() + ).unsqueeze(1) scores_size_tensor, tokens_size_tensor = None, None # ============= # Run infernece @@ -316,30 +409,40 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens) prev_context_length = 0 for context_length in range(prompt_length, final_sequence_length): - # Pick the slice that we need to pass through the network. tokens2use = tokens[:, prev_context_length:context_length] positions2use = position_ids[:, prev_context_length:context_length] attention_mask2use = attention_mask[ - ..., prev_context_length:context_length, :context_length] + ..., prev_context_length:context_length, :context_length + ] # logits will be meanigful only in the last pipeline stage. 
logits = forward_step(tokens2use, positions2use, attention_mask2use) if mpu.is_pipeline_last_stage(): if prevent_newline_after_colon: - logits[tokens2use[:, -1] == tokenizer.tokenize(':')[0], -1, tokenizer.tokenize('\n')[0]] = -1e10 # disable "\n" after ":" + logits[ + tokens2use[:, -1] == tokenizer.tokenize(":")[0], + -1, + tokenizer.tokenize("\n")[0], + ] = -1e10 # disable "\n" after ":" vocab_size = logits.size(2) log_probs = F.log_softmax(logits, dim=2) new_scores = log_probs[:, -1, :] + scores if context_length == prompt_length: # if this is the first one - sorted_scores, indices = torch.sort(new_scores[0,:], descending=True) + sorted_scores, indices = torch.sort( + new_scores[0, :], descending=True + ) else: - sorted_scores, indices = torch.sort(new_scores.view(-1), descending=True) - - best_beam_ids = torch.div(indices[: 2 * beam_size], vocab_size).trunc().long() - best_words = indices[:2 * beam_size] % vocab_size + sorted_scores, indices = torch.sort( + new_scores.view(-1), descending=True + ) + + best_beam_ids = ( + torch.div(indices[: 2 * beam_size], vocab_size).trunc().long() + ) + best_words = indices[: 2 * beam_size] % vocab_size best_scores = sorted_scores[: 2 * beam_size] next_beams = [] @@ -348,13 +451,15 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto ): if token_id.item() == stop_token: # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= beam_size + is_beam_token_worse_than_top_num_beams = ( + beam_token_rank >= beam_size + ) if is_beam_token_worse_than_top_num_beams: continue beam_hyp.add( tokens[beam_id].clone(), beam_score, - context_length + 1 - prompt_length + context_length + 1 - prompt_length, ) else: # add next predicted token since it is not eos_token @@ -363,14 +468,18 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto if len(next_beams) == beam_size: break - if beam_hyp.is_done(best_scores.max().item(), context_length + 1 - prompt_length): - done = torch.ones(1, dtype=torch.uint8, device=torch.cuda.current_device()) - + if beam_hyp.is_done( + best_scores.max().item(), context_length + 1 - prompt_length + ): + done = torch.ones( + 1, dtype=torch.uint8, device=torch.cuda.current_device() + ) + best_batches = tokens.new([item[2] for item in next_beams]) - tokens = tokens[best_batches,:] + tokens = tokens[best_batches, :] tokens[:, context_length] = tokens.new([item[0] for item in next_beams]) scores = scores.new([item[1] for item in next_beams]).unsqueeze(1) - + # torch.distributed.barrier() done = broadcast_from_last_pipeline_stage(1, torch.uint8, done) if done: @@ -378,11 +487,12 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto # Update the tokens on the first stage so the next input to # the network is correct. - copy_from_last_to_first_pipeline_stage(tokens.size(), torch.int64, - tokens) + copy_from_last_to_first_pipeline_stage(tokens.size(), torch.int64, tokens) # set inference key values to make it consistent with best beam index - best_batches = broadcast_from_last_pipeline_stage(beam_size, torch.int64, best_batches) + best_batches = broadcast_from_last_pipeline_stage( + beam_size, torch.int64, best_batches + ) forward_step.inference_params.swap_key_value_dict(best_batches) # Update the context length for the next token generation. 
@@ -392,7 +502,11 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto # if cannot find stop token, add open beams to hyps if not done: for beam_id in range(beam_size): - beam_hyp.add(tokens[beam_id].clone(), scores[beam_id].squeeze(), context_length + 1 - prompt_length) + beam_hyp.add( + tokens[beam_id].clone(), + scores[beam_id].squeeze(), + context_length + 1 - prompt_length, + ) # rank based on scores sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0], reverse=True) @@ -401,14 +515,26 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto tokens = [sorted_hyps[i][1] for i in range(num_return_gen)] scores = torch.stack(scores, dim=0) tokens = torch.stack(tokens, dim=0) - scores_size_tensor = torch.tensor(scores.shape, dtype=torch.int64, device=torch.cuda.current_device()) - tokens_size_tensor = torch.tensor(tokens.shape, dtype=torch.int64, device=torch.cuda.current_device()) - - scores_size_tensor = broadcast_from_last_pipeline_stage(1, torch.int64, scores_size_tensor) - tokens_size_tensor = broadcast_from_last_pipeline_stage(2, torch.int64, tokens_size_tensor) - - scores = broadcast_from_last_to_first_pipeline_stage(tuple(scores_size_tensor), torch.float32, scores) - tokens = broadcast_from_last_to_first_pipeline_stage(tuple(tokens_size_tensor), torch.int64, tokens) + scores_size_tensor = torch.tensor( + scores.shape, dtype=torch.int64, device=torch.cuda.current_device() + ) + tokens_size_tensor = torch.tensor( + tokens.shape, dtype=torch.int64, device=torch.cuda.current_device() + ) + + scores_size_tensor = broadcast_from_last_pipeline_stage( + 1, torch.int64, scores_size_tensor + ) + tokens_size_tensor = broadcast_from_last_pipeline_stage( + 2, torch.int64, tokens_size_tensor + ) + + scores = broadcast_from_last_to_first_pipeline_stage( + tuple(scores_size_tensor), torch.float32, scores + ) + tokens = broadcast_from_last_to_first_pipeline_stage( + tuple(tokens_size_tensor), torch.int64, tokens + ) return tokens, scores @@ -423,6 +549,7 @@ def _build_attention_mask_and_position_ids(tokens): eod_token=None, reset_position_ids=False, reset_attention_mask=False, - eod_mask_loss=False) + eod_mask_loss=False, + ) return attention_mask, position_ids diff --git a/megatron/text_generation/tokenization.py b/megatron/text_generation/tokenization.py index accead319a1..0e99a32c317 100644 --- a/megatron/text_generation/tokenization.py +++ b/megatron/text_generation/tokenization.py @@ -17,6 +17,7 @@ def detokenize_generations(tokens_gpu_tensor, tokenizer = get_tokenizer() args = get_args() + token_lists = [] prompts_plus_generations = [] if return_segments: prompts_plus_generations_segments = [] @@ -25,14 +26,15 @@ def detokenize_generations(tokens_gpu_tensor, lengths = lengths_gpu_tensor.cpu().numpy().tolist() for sequence_tokens, length in zip(tokens, lengths): sequence_tokens = sequence_tokens[:length] + token_lists.append(sequence_tokens) prompts_plus_generations.append( tokenizer.detokenize(sequence_tokens)) if return_segments: words = [] for token in sequence_tokens: if args.tokenizer_type in ['SentencePieceTokenizer', - 'GPTSentencePieceTokenizer']: - word = tokenizer.decoder[token] + 'GPTSentencePieceTokenizer', 'OpenGPTX-SPTokenizer', 'OpenGPTX-HFTokenizer', "OpenGPTX-PretrainedHFTokenizer"]: + word = tokenizer.decoder[token] elif args.tokenizer_type == 'NullTokenizer': word = str(token) else: @@ -44,10 +46,10 @@ def detokenize_generations(tokens_gpu_tensor, prompts_plus_generations_segments.append(words) if return_segments: 
- return tokens, prompts_plus_generations, \ + return token_lists, prompts_plus_generations, \ prompts_plus_generations_segments - return tokens, prompts_plus_generations + return token_lists, prompts_plus_generations def tokenize_prompts(prompts=None, tokens_to_generate=None, @@ -94,13 +96,21 @@ def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS): into a 2D tensor. """ - # Tokenize all the prompts. tokenizer = get_tokenizer() - if add_BOS: - prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt) - for prompt in prompts] - else: - prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts] + args = get_args() + + if isinstance(prompts[0], str): + # Tokenize all the prompts. + if add_BOS: + prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt) for prompt in prompts] + else: + prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts] + elif isinstance(prompts[0], list): + # Prompts are already tokenized + if add_BOS: + prompts_tokens = [[tokenizer.eod] + prompt for prompt in prompts] + else: + prompts_tokens = prompts # Now we have a list of list of tokens which each list has a different # size. We want to extend this list to: @@ -112,9 +122,12 @@ def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS): max_prompt_len = max(prompts_length) # Number of tokens in the each sample of the batch. samples_length = max_prompt_len + tokens_to_generate + + padded_length = args.seq_length if args.pad_to_seq_length else samples_length + # Now update the list of list to be of the same size: samples_length. for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length): - padding_size = samples_length - prompt_length + padding_size = padded_length - prompt_length prompt_tokens.extend([tokenizer.eod] * padding_size) # Now we are in a structured format, we can convert to tensors. diff --git a/megatron/text_generation_server.py b/megatron/text_generation_server.py index 58550f2e631..c97bf20d0d2 100644 --- a/megatron/text_generation_server.py +++ b/megatron/text_generation_server.py @@ -1,19 +1,23 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. import datetime -import torch -import json import threading -from flask import Flask, request, jsonify, current_app -from flask_restful import Resource, Api -from megatron import get_args -from megatron.text_generation import generate_and_post_process -from megatron.text_generation import beam_search_and_post_process +import traceback + +import torch +from flask import Flask, jsonify, request +from flask_restful import Api, Resource +from megatron import get_args, get_tokenizer +from megatron.text_generation import ( + beam_search_and_post_process, + generate_and_post_process, +) GENERATE_NUM = 0 BEAM_NUM = 1 lock = threading.Lock() + class MegatronGenerate(Resource): def __init__(self, model): self.model = model @@ -22,21 +26,21 @@ def __init__(self, model): def send_do_generate(): choice = torch.cuda.LongTensor([GENERATE_NUM]) torch.distributed.broadcast(choice, 0) - + @staticmethod def send_do_beam_search(): choice = torch.cuda.LongTensor([BEAM_NUM]) torch.distributed.broadcast(choice, 0) - + def put(self): args = get_args() - - if not "prompts" in request.get_json(): + + if "prompts" not in request.get_json(): return "prompts argument required", 400 - + if "max_len" in request.get_json(): return "max_len is no longer used. Replace with tokens_to_generate", 400 - + if "sentences" in request.get_json(): return "sentences is no longer used. 
Replace with prompts", 400 @@ -46,79 +50,103 @@ def put(self): if len(prompts) == 0: return "prompts is empty", 400 - + if len(prompts) > 128: return "Maximum number of prompts is 128", 400 - - tokens_to_generate = 64 # Choosing hopefully sane default. Full sequence is slow + + tokens_to_generate = ( + 64 # Choosing hopefully sane default. Full sequence is slow + ) + if "tokens_to_generate" in request.get_json(): tokens_to_generate = request.get_json()["tokens_to_generate"] if not isinstance(tokens_to_generate, int): - return "tokens_to_generate must be an integer greater than 0" + return "tokens_to_generate must be an integer greater than 0", 400 if tokens_to_generate < 0: - return "tokens_to_generate must be an integer greater than or equal to 0" + return ( + "tokens_to_generate must be an integer greater than or equal to 0", + 400, + ) logprobs = False if "logprobs" in request.get_json(): logprobs = request.get_json()["logprobs"] if not isinstance(logprobs, bool): - return "logprobs must be a boolean value" - + return "logprobs must be a boolean value", 400 + if tokens_to_generate == 0 and not logprobs: - return "tokens_to_generate=0 implies logprobs should be True" - + return "tokens_to_generate=0 implies logprobs should be True", 400 + temperature = 1.0 if "temperature" in request.get_json(): temperature = request.get_json()["temperature"] if not (type(temperature) == int or type(temperature) == float): - return "temperature must be a positive number less than or equal to 100.0" + return ( + "temperature must be a positive number less than or equal to 100.0", + 400, + ) if not (0.0 < temperature <= 100.0): - return "temperature must be a positive number less than or equal to 100.0" - - top_k = 0.0 + return ( + "temperature must be a positive number less than or equal to 100.0", + 400, + ) + + top_k = 0 if "top_k" in request.get_json(): top_k = request.get_json()["top_k"] if not (type(top_k) == int): - return "top_k must be an integer equal to or greater than 0 and less than or equal to 1000" + return ( + "top_k must be an integer equal to or greater than 0 and less than or equal to 1000", + 400, + ) if not (0 <= top_k <= 1000): - return "top_k must be equal to or greater than 0 and less than or equal to 1000" - + return ( + "top_k must be equal to or greater than 0 and less than or equal to 1000", + 400, + ) + top_p = 0.0 if "top_p" in request.get_json(): top_p = request.get_json()["top_p"] if not (type(top_p) == float): - return "top_p must be a positive float less than or equal to 1.0" + return "top_p must be a positive float less than or equal to 1.0", 400 if top_p > 0.0 and top_k > 0.0: - return "cannot set both top-k and top-p samplings." 
+ return "cannot set both top-k and top-p samplings.", 400 if not (0 <= top_p <= 1.0): - return "top_p must be less than or equal to 1.0" - + return "top_p must be less than or equal to 1.0", 400 + top_p_decay = 0.0 if "top_p_decay" in request.get_json(): top_p_decay = request.get_json()["top_p_decay"] if not (type(top_p_decay) == float): - return "top_p_decay must be a positive float less than or equal to 1.0" + return ( + "top_p_decay must be a positive float less than or equal to 1.0", + 400, + ) if top_p == 0.0: - return "top_p_decay cannot be set without top_p" + return "top_p_decay cannot be set without top_p", 400 if not (0 <= top_p_decay <= 1.0): - return "top_p_decay must be less than or equal to 1.0" - + return "top_p_decay must be less than or equal to 1.0", 400 + top_p_bound = 0.0 if "top_p_bound" in request.get_json(): top_p_bound = request.get_json()["top_p_bound"] if not (type(top_p_bound) == float): - return "top_p_bound must be a positive float less than or equal to top_p" + return ( + "top_p_bound must be a positive float less than or equal to top_p", + 400, + ) if top_p == 0.0: - return "top_p_bound cannot be set without top_p" + return "top_p_bound cannot be set without top_p", 400 if not (0.0 < top_p_bound <= top_p): - return "top_p_bound must be greater than 0 and less than top_p" - + return "top_p_bound must be greater than 0 and less than top_p", 400 + add_BOS = False if "add_BOS" in request.get_json(): add_BOS = request.get_json()["add_BOS"] if not isinstance(add_BOS, bool): - return "add_BOS must be a boolean value" - + return "add_BOS must be a boolean value", 400 + if any([len(prompt) == 0 for prompt in prompts]) and not add_BOS: return "Empty prompts require add_BOS=true" @@ -126,86 +154,97 @@ def put(self): if "stop_on_double_eol" in request.get_json(): stop_on_double_eol = request.get_json()["stop_on_double_eol"] if not isinstance(stop_on_double_eol, bool): - return "stop_on_double_eol must be a boolean value" - + return "stop_on_double_eol must be a boolean value", 400 + stop_on_eol = False if "stop_on_eol" in request.get_json(): stop_on_eol = request.get_json()["stop_on_eol"] if not isinstance(stop_on_eol, bool): - return "stop_on_eol must be a boolean value" + return "stop_on_eol must be a boolean value", 400 prevent_newline_after_colon = False if "prevent_newline_after_colon" in request.get_json(): - prevent_newline_after_colon = request.get_json()["prevent_newline_after_colon"] + prevent_newline_after_colon = request.get_json()[ + "prevent_newline_after_colon" + ] if not isinstance(prevent_newline_after_colon, bool): - return "prevent_newline_after_colon must be a boolean value" + return "prevent_newline_after_colon must be a boolean value", 400 random_seed = -1 if "random_seed" in request.get_json(): random_seed = request.get_json()["random_seed"] if not isinstance(random_seed, int): - return "random_seed must be integer" - if random_seed < 0: - return "random_seed must be a positive integer" + return "random_seed must be integer", 400 + if random_seed < 0: + return "random_seed must be a positive integer", 400 no_log = False if "no_log" in request.get_json(): no_log = request.get_json()["no_log"] if not isinstance(no_log, bool): - return "no_log must be a boolean value" - + return "no_log must be a boolean value", 400 + beam_width = None if "beam_width" in request.get_json(): beam_width = request.get_json()["beam_width"] if not isinstance(beam_width, int): - return "beam_width must be integer" + return "beam_width must be integer", 400 if beam_width < 
1: - return "beam_width must be an integer > 1" + return "beam_width must be an integer > 1", 400 if len(prompts) > 1: - return "When doing beam_search, batch size must be 1" + return "When doing beam_search, batch size must be 1", 400 - stop_token=50256 + stop_token = get_tokenizer().eod if "stop_token" in request.get_json(): stop_token = request.get_json()["stop_token"] if not isinstance(stop_token, int): - return "stop_token must be an integer" - - length_penalty = 1 + return "stop_token must be an integer", 400 + + length_penalty = 1 if "length_penalty" in request.get_json(): length_penalty = request.get_json()["length_penalty"] if not isinstance(length_penalty, float): - return "length_penalty must be a float" - + return "length_penalty must be a float", 400 + with lock: # Need to get lock to keep multiple threads from hitting code - if not no_log: print("request IP: " + str(request.remote_addr)) - print(json.dumps(request.get_json()),flush=True) + # print(json.dumps(request.get_json()), flush=True) print("start time: ", datetime.datetime.now()) - + try: if beam_width is not None: MegatronGenerate.send_do_beam_search() # Tell other ranks we're doing beam_search - response, response_seg, response_scores = \ - beam_search_and_post_process( + ( + response, + response_seg, + response_scores, + response_tokens, + ) = beam_search_and_post_process( self.model, prompts=prompts, tokens_to_generate=tokens_to_generate, - beam_size = beam_width, + beam_size=beam_width, add_BOS=add_BOS, stop_token=stop_token, num_return_gen=beam_width, # Returning whole beam length_penalty=length_penalty, - prevent_newline_after_colon=prevent_newline_after_colon - ) - - return jsonify({"text": response, + prevent_newline_after_colon=prevent_newline_after_colon, + ) + + result = { + "text": response, "segments": response_seg, - "scores": response_scores}) + "scores": response_scores, + "tokens": response_tokens, + } + else: MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate - response, response_seg, response_logprobs, _ = \ - generate_and_post_process( + + return_is_max_logprobs = tokens_to_generate == 0 + + generate_result = generate_and_post_process( self.model, prompts=prompts, tokens_to_generate=tokens_to_generate, @@ -220,22 +259,137 @@ def put(self): stop_on_double_eol=stop_on_double_eol, stop_on_eol=stop_on_eol, prevent_newline_after_colon=prevent_newline_after_colon, - random_seed=random_seed) + random_seed=random_seed, + return_is_max_logprobs=return_is_max_logprobs, + ) + + if return_is_max_logprobs: + ( + response, + response_seg, + response_logprobs, + response_tokens, + response_is_max_logprobs, + ) = generate_result + else: + ( + response, + response_seg, + response_logprobs, + response_tokens, + ) = generate_result + response_is_max_logprobs = None - return jsonify({"text": response, + result = { + "text": response, "segments": response_seg, - "logprobs": response_logprobs}) + "logprobs": response_logprobs, + "tokens": response_tokens, + "is_max_logprobs": response_is_max_logprobs, + } + return jsonify(result) except ValueError as ve: + traceback.print_exc() return ve.args[0] print("end time: ", datetime.datetime.now()) - + + +class MegatronTokenizer(Resource): + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def put(self): + req_json = request.get_json() + if "prompts" not in req_json: + return "prompts argument required", 400 + + prompts = req_json["prompts"] + if not isinstance(prompts, list): + return "prompts is not a list of strings", 400 + + if 
len(prompts) == 0: + return "prompts is empty", 400 + + is_continuation = ( + req_json["is_continuation"] if "is_continuation" in req_json else False + ) + + return jsonify( + { + "tokens": [ + self.tokenizer.tokenize(prompt, is_continuation=is_continuation) + for prompt in prompts + ] + } + ) + + +class MegatronDetokenizer(Resource): + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def put(self): + if "seqs" not in request.get_json(): + return "seqs argument required", 400 + + seqs = request.get_json()["seqs"] + if not isinstance(seqs, list): + return "seqs is not a list of strings", 400 + + if len(seqs) == 0: + return "seqs is empty", 400 + + return jsonify({"text": [self.tokenizer.detokenize(seq) for seq in seqs]}) + + def options(self): + return jsonify({}) + + +class MegatronMetadata(Resource): + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def put(self): + args = get_args() + + try: + if hasattr(args, "eos_id"): + eod_id = args.eos_id + else: + try: + eod_id = self.tokenizer.eod + except NotImplementedError: + eod_id = self.tokenizer.eos_token_id + max_length = args.max_position_embeddings + if max_length is None: + max_length = args.seq_length + + return jsonify( + { + "vocab_size": self.tokenizer.vocab_size, + "eod_token_id": eod_id, + "max_length": max_length, + } + ) + except ValueError as ve: + return ve.args[0], 300 + class MegatronServer(object): def __init__(self, model): - self.app = Flask(__name__, static_url_path='') + self.app = Flask(__name__, static_url_path="") api = Api(self.app) - api.add_resource(MegatronGenerate, '/api', resource_class_args=[model]) - - def run(self, url): - self.app.run(url, threaded=True, debug=False) + api.add_resource(MegatronGenerate, "/api", resource_class_args=[model]) + api.add_resource( + MegatronTokenizer, "/tokenize", resource_class_args=[get_tokenizer()] + ) + api.add_resource( + MegatronDetokenizer, "/detokenize", resource_class_args=[get_tokenizer()] + ) + api.add_resource( + MegatronMetadata, "/metadata", resource_class_args=[get_tokenizer()] + ) + + def run(self, url, port): + self.app.run(url, port, threaded=True, debug=False) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 97cbd9750c2..b79d1ec412b 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -4,29 +4,10 @@ from abc import ABC from abc import abstractmethod - -try: - from gptxdata.tokenization import ( - HFTokenizer, - PretrainedHFTokenizer, - SPTokenizer, - ) -except ImportError: - HFTokenizer = None - PretrainedHFTokenizer = None - SPTokenizer = None - +from gptxdata.tokenization import HFTokenizer, SPTokenizer, PretrainedHFTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer - -def _assert_gptx_tokenizer_available(tokenizer_name, tokenizer_cls): - assert tokenizer_cls is not None, ( - f'Please install `gptxdata` to use {tokenizer_name}, e.g., with ' - f'`pip install git+https://github.com/OpenGPTX/opengptx_data.git`' - ) - - def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: @@ -58,15 +39,11 @@ def build_tokenizer(args): assert args.vocab_size is not None tokenizer = _NullTokenizer(args.vocab_size) elif args.tokenizer_type == "OpenGPTX-HFTokenizer": - _assert_gptx_tokenizer_available(args.tokenizer_type, HFTokenizer) - tokenizer = HFTokenizer.instantiate_from_file_or_name(model_file_or_name=args.tokenizer_model) + tokenizer = _HFTokenizer(model_file=args.tokenizer_model) elif
args.tokenizer_type == "OpenGPTX-PretrainedHFTokenizer": - _assert_gptx_tokenizer_available( - args.tokenizer_type, PretrainedHFTokenizer) - tokenizer = PretrainedHFTokenizer.instantiate_from_file_or_name(model_file_or_name=args.tokenizer_model) + tokenizer = _PretrainedHFTokenizer(model_file=args.tokenizer_model) elif args.tokenizer_type == "OpenGPTX-SPTokenizer": - _assert_gptx_tokenizer_available(args.tokenizer_type, SPTokenizer) - tokenizer = SPTokenizer.instantiate_from_file_or_name(model_file_or_name=args.tokenizer_model) + tokenizer = _SPTokenizer(model_file=args.tokenizer_model) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) @@ -119,7 +96,7 @@ def inv_vocab(self): pass @abstractmethod - def tokenize(self, text): + def tokenize(self, text, is_continuation: bool = False): pass def detokenize(self, token_ids): @@ -210,7 +187,7 @@ def vocab(self): def inv_vocab(self): return self.tokenizer.inv_vocab - def tokenize(self, text): + def tokenize(self, text, is_continuation: bool = False): text_tokens = self.tokenizer.tokenize(text) return self.tokenizer.convert_tokens_to_ids(text_tokens) @@ -306,7 +283,7 @@ def vocab(self): def inv_vocab(self): return self.tokenizer.decoder - def tokenize(self, text): + def tokenize(self, text, is_continuation: bool = False): return self.tokenizer.encode(text) def detokenize(self, token_ids): @@ -412,7 +389,7 @@ def encoder(self): # From: # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L89 - def tokenize(self, text): + def tokenize(self, text, is_continuation: bool = False): ids = [] idx = 0 @@ -504,7 +481,7 @@ def _initalize(self, vocab_extra_ids): self._bos_id = self.tokenizer.bos_id() self._eos_id = self.tokenizer.eos_id() - def tokenize(self, text): + def tokenize(self, text, is_continuation: bool = False): return self.tokenizer.encode_as_ids(text) def detokenize(self, ids): @@ -536,8 +513,8 @@ def __init__(self, vocab_size): self._eos_id = vocab_size self.vocab_size = vocab_size+1 - def tokenize(self, text): - return [int(x) for x in text.split(' ')] + def tokenize(self, text, is_continuation: bool = False): + return [int(x) for x in text.split(" ")] def detokenize(self, ids): text = [str(x) for x in ids] @@ -562,3 +539,219 @@ def eod(self): @property def additional_special_tokens_ids(self): return None + + +class _HFTokenizer(AbstractTokenizer): + def __init__(self, model_file): + name = 'OpenGPTX-HFTokenizer' + super().__init__(name) + self.tokenizer = HFTokenizer.instantiate_from_file_or_name(model_file_or_name=model_file) + + @property + def vocab_size(self): + return self.tokenizer.vocab_size + + @property + def vocab(self): + return self.tokenizer.vocab + + @property + def inv_vocab(self): + return self.tokenizer.inv_vocab + + @property + def decoder(self): + return self.inv_vocab + + @property + def encoder(self): + return self.tokenizer.vocab + + def tokenize(self, text, is_continuation: bool = False): + return self.tokenizer.encode(text) + + def detokenize(self, ids): + return self.tokenizer.decode(ids) + + @property + def pad(self): + return self.tokenizer.tokenizer.get_vocab()[self.tokenizer.pad_token] + + @property + def pad_token_id(self): + return self.pad + + @property + def bos(self): + return self.tokenizer.tokenizer.get_vocab()[self.tokenizer.bos_token] + + @property + def bos_token_id(self): + return self.bos + + @property + def eod(self): + return self.tokenizer.eod + + @property + def 
eod_token_id(self): + return self.eod + + @property + def eos(self): + #TODO: make sure this makes sense + #return self.tokenizer.tokenizer.get_vocab()[self.tokenizer.eos_token] + return self.eod + @property + def eos_token_id(self): + #TODO: make sure this makes sense + return self.eos + + @property + def mask(self): + raise NotImplementedError + + +class _SPTokenizer(AbstractTokenizer): + def __init__(self, model_file): + name = 'OpenGPTX-SPTokenizer' + super().__init__(name) + self.tokenizer = SPTokenizer.instantiate_from_file_or_name(model_file_or_name=model_file) + + @property + def vocab_size(self): + return self.tokenizer.vocab_size + + @property + def vocab(self): + return self.tokenizer.vocab + + @property + def inv_vocab(self): + return self.tokenizer.inv_vocab + + @property + def decoder(self): + return self.inv_vocab + + @property + def encoder(self): + return self.tokenizer.vocab + + def tokenize(self, text, is_continuation: bool = False): + return self.tokenizer.encode(text, is_continuation=is_continuation) + + def detokenize(self, ids): + return self.tokenizer.decode(ids) + + @property + def pad(self): + return self.tokenizer.tokenizer.PieceToId(self.tokenizer.pad_token) + + + @property + def pad_token_id(self): + return self.tokenizer.pad + + @property + def bos(self): + return self.tokenizer.tokenizer.PieceToId(self.tokenizer.bos_token) + + @property + def bos_token_id(self): + return self.bos + + @property + def eod(self): + return self.tokenizer.eod + + @property + def eod_token_id(self): + return self.eod + + @property + def eos(self): + #TODO: make sure this makes sense + #return self.tokenizer.tokenizer.get_vocab()[self.tokenizer.eos_token] + return self.eod + @property + def eos_token_id(self): + #TODO: make sure this makes sense + return self.eos + + @property + def mask(self): + raise NotImplementedError + + +class _PretrainedHFTokenizer(AbstractTokenizer): + def __init__(self, model_file): + name = 'OpenGPTX-PretrainedHFTokenizer' + super().__init__(name) + self.tokenizer = PretrainedHFTokenizer.instantiate_from_file_or_name(model_file_or_name=model_file) + + @property + def vocab_size(self): + return self.tokenizer.vocab_size + + @property + def vocab(self): + return self.tokenizer.vocab + + @property + def inv_vocab(self): + return self.tokenizer.inv_vocab + + @property + def decoder(self): + return self.inv_vocab + + @property + def encoder(self): + return self.tokenizer.vocab + + def tokenize(self, text, is_continuation: bool = False): + return self.tokenizer.encode(text) + + def detokenize(self, ids): + return self.tokenizer.decode(ids) + + @property + def pad(self): + return self.tokenizer.tokenizer.get_vocab()[self.tokenizer.pad_token] + + @property + def pad_token_id(self): + return self.pad + + @property + def bos(self): + return self.tokenizer.tokenizer.get_vocab()[self.tokenizer.bos_token] + + @property + def bos_token_id(self): + return self.bos + + @property + def eod(self): + return self.tokenizer.eod + + @property + def eod_token_id(self): + return self.eod + + @property + def eos(self): + #TODO: make sure this makes sense + #return self.tokenizer.tokenizer.get_vocab()[self.tokenizer.eos_token] + return self.eod + @property + def eos_token_id(self): + #TODO: make sure this makes sense + return self.eos + + @property + def mask(self): + raise NotImplementedError + + diff --git a/megatron/training.py b/megatron/training.py index 74bc5fe333c..a4a192c0390 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -40,7 +40,6 @@ from megatron.utils 
import calc_params_l2_norm from megatron.core.pipeline_parallel import get_forward_backward_func from megatron.utils import report_memory -from megatron.model.vision.knn_monitor import compute_feature_bank def print_datetime(string): @@ -455,6 +454,7 @@ def train_step(forward_step_func, data_iterator, # Vision gradients. if args.vision_pretraining and args.vision_pretraining_type == "dino": + from megatron.model.vision.knn_monitor import compute_feature_bank unwrapped_model = unwrap_model(model[0], (torchDDP, LocalDDP, Float16Module)) unwrapped_model.cancel_gradients_last_layer(args.curr_iteration) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 66977f2850d..7190bdceff6 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -194,7 +194,7 @@ def get_args(): group.add_argument('--tokenizer-type', type=str, required=True, choices=['BertWordPieceLowerCase','BertWordPieceCase', 'GPT2BPETokenizer', 'SentencePieceTokenizer', - 'GPTSentencePieceTokenizer', 'NullTokenizer'], + 'GPTSentencePieceTokenizer', 'NullTokenizer','OpenGPTX-SPTokenizer', "OpenGPTX-PretrainedHFTokenizer", "OpenGPTX-HFTokenizer"], help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, help='YTTM tokenizer model.') diff --git a/tools/preprocess_data_nmt.py b/tools/preprocess_data_nmt.py index 2505c1e16d6..542830d8793 100644 --- a/tools/preprocess_data_nmt.py +++ b/tools/preprocess_data_nmt.py @@ -39,7 +39,7 @@ def get_args(): group = parser.add_argument_group(title='tokenizer') group.add_argument('--tokenizer-type', type=str, default='YTTMTokenizer', choices=['BertWordPieceLowerCase','BertWordPieceCase', - 'GPT2BPETokenizer', 'SentencePieceTokenizer'], + 'GPT2BPETokenizer', 'SentencePieceTokenizer', 'OpenGPTX-SPTokenizer'], help='What type of tokenizer to use.') group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file') diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py index 52789155b13..0bf93220c87 100644 --- a/tools/run_text_generation_server.py +++ b/tools/run_text_generation_server.py @@ -3,9 +3,11 @@ """Sample Generate GPT""" import os import sys +import socket +import torch + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -import socket from megatron import get_args from megatron import print_rank_0 from megatron.core import mpu @@ -17,7 +19,9 @@ from megatron.text_generation_server import MegatronServer from megatron.text_generation import generate_and_post_process from megatron.text_generation import beam_search_and_post_process -import torch + +from megatron.arguments import core_transformer_config_from_args + def model_provider(pre_process=True, post_process=True): """Build the model.""" @@ -29,6 +33,7 @@ def model_provider(pre_process=True, post_process=True): return model + def add_text_generate_args(parser): group = parser.add_argument_group(title='text generation') @@ -40,6 +45,9 @@ def add_text_generate_args(parser): help='Top k sampling.') group.add_argument("--out-seq-length", type=int, default=1024, help='Size of the output generated text.') + group.add_argument("--server-port", type=int, default=5000, + help='Server port.') + return parser @@ -66,7 +74,7 @@ def add_text_generate_args(parser): model = model[0] if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: server = MegatronServer(model) - server.run("0.0.0.0") + server.run("0.0.0.0", port=args.server_port) while True: choice =
torch.cuda.LongTensor(1)
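
Not part of the patch: a minimal client-side sketch of how the HTTP endpoints registered above (/api, /tokenize, /detokenize, /metadata) might be exercised once the server is up. It assumes the server was started on the same node with the default --server-port 5000 and that the requests library is available; the payload keys mirror the handlers above, everything else (host, prompt text, sampling values) is illustrative only.

# Hypothetical client sketch; all handlers above are PUT methods.
import requests

BASE = "http://localhost:5000"  # assumes default --server-port 5000

# /metadata: vocab size, EOD token id and maximum sequence length.
meta = requests.put(f"{BASE}/metadata", json={}).json()

# /tokenize: raw prompts -> token id lists.
tokens = requests.put(f"{BASE}/tokenize", json={"prompts": ["Hello world"]}).json()["tokens"]

# /api: generation; the patched response also carries "tokens".
gen = requests.put(
    f"{BASE}/api",
    json={"prompts": ["Hello world"], "tokens_to_generate": 32, "top_k": 1},
).json()

# /detokenize: token id sequences back to text.
text = requests.put(f"{BASE}/detokenize", json={"seqs": [gen["tokens"][0]]}).json()["text"]
print(meta, text)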