From d7b34607b5bce00e198f1e0a8c68767967ea0c1f Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Tue, 21 Jan 2025 20:59:22 +0000 Subject: [PATCH 1/5] Clean up README examples --- examples/contrastive-image-text/README.md | 57 +-- examples/language-modeling/README.md | 468 +--------------------- examples/question-answering/README.md | 231 ----------- examples/summarization/README.md | 81 +--- 4 files changed, 8 insertions(+), 829 deletions(-) diff --git a/examples/contrastive-image-text/README.md b/examples/contrastive-image-text/README.md index c0aa57ac41..d21eece8bf 100644 --- a/examples/contrastive-image-text/README.md +++ b/examples/contrastive-image-text/README.md @@ -163,61 +163,8 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_clip.py \ ### DeepSpeed -Run the following command for training with DeepSpeed: - -```bash -PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 \ -python3 ../gaudi_spawn.py --world_size 8 --use_deepspeed run_clip.py \ - --output_dir=/tmp/clip_roberta \ - --model_name_or_path=./clip-roberta \ - --data_dir $PWD/data \ - --dataset_name ydshieh/coco_dataset_script \ - --dataset_config_name 2017 \ - --image_column image_path \ - --caption_column caption \ - --remove_unused_columns=False \ - --do_train --do_eval \ - --mediapipe_dataloader \ - --per_device_train_batch_size="64" \ - --per_device_eval_batch_size="64" \ - --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \ - --overwrite_output_dir \ - --use_habana \ - --use_lazy_mode=False \ - --gaudi_config_name="Habana/clip" \ - --throughput_warmup_steps=30 \ - --save_strategy="no" \ - --dataloader_num_workers=2 \ - --use_hpu_graphs \ - --max_steps=100 \ - --torch_compile_backend=hpu_backend \ - --torch_compile \ - --logging_nan_inf_filter \ - --trust_remote_code \ - --deepspeed - -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` +You can check the [DeepSpeed](https://github.com/huggingface/optimum-habana/tree/main/examples#deepspeed) section in Optimum Habana examples for how to run DeepSpeed. +You can also look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. ## BridgeTower diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index 9ef27f9e73..d0aa805b02 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -29,163 +29,12 @@ First, you should install the requirements: pip install -r requirements.txt ``` -## GPT2/GPT-J/GPT-NeoX and causal language modeling +## GPT-NeoX and causal language modeling -The following examples fine-tune GPT-2, GPT-J-6B and GPT-NeoX-20B on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before the tokenization). The loss here is the one of causal language modeling. +The following examples fine-tune GPT-NeoX-20B on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before the tokenization). 
The loss here is the one of causal language modeling. -### Single-card Training (GPT2) - -```bash -python run_clm.py \ - --model_name_or_path gpt2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 -``` - -This takes about 13 minutes to train on a single HPU. It reaches -a perplexity of about 20.9963 once fine-tuned on the dataset. - -To run on your own training and validation files, use the following command: - -```bash -python run_clm.py \ - --model_name_or_path gpt2 \ - --train_file path_to_train_file \ - --validation_file path_to_validation_file \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 -``` - - -### Multi-card Training (GPT2) - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_clm.py \ - --model_name_or_path gpt2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 -``` - -This takes about 4 minutes to train on 8 HPUs. It reaches -a perplexity of 21.7968 once fine-tuned on the dataset. - - -### Multi-card Training with Deepspeed (GPT-J) - -The following command triggers the fine-tuning of [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b) on WikiText-2 with DeepSpeed ZeRO-2. -Fine tuning on 8 HPU cards takes around 6 minutes with a batch size of 32 (4 per device). -It reaches a perplexity of 14.011. 
- -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path EleutherAI/gpt-j-6b \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm-xl-1 \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --gradient_checkpointing \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --deepspeed path_for_deepspeed_config -``` - -This example has been validated with the following DeepSpeed ZeRO-2 config: https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_2.json - - -### Multi-card Training with Deepspeed (chatglm3-6b) -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --config_name THUDM/chatglm3-6b \ - --tokenizer_name THUDM/chatglm3-6b \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 6 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --deepspeed llama2_ds_zero3_config.json \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --bf16 \ - --block_size 1024 \ - --use_cache False \ - --overwrite_output_dir \ - --logging_first_step True \ - --logging_steps 20 -``` - -### Multi-card Training with Deepspeed (Baichuan2-13B-Chat) -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --config_name baichuan-inc/Baichuan2-13B-Chat \ - --tokenizer_name baichuan-inc/Baichuan2-13B-Chat \ - --dataset_name wikitext \ - --num_train_epochs 30 \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 2 \ - --do_train \ - --do_eval \ - --deepspeed llama2_ds_zero3_config.json \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --bf16 \ - --block_size 1024 \ - --use_cache False \ - --overwrite_output_dir \ - --logging_first_step True \ - --logging_steps 20 -``` - - -## Multi-Node Training with Deepspeed (GPT-NeoX) +### Multi-Node Training with Deepspeed (GPT-NeoX) The following command triggers the fine-tuning of [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) on WikiText-2 with Deepspeed ZeRO-2. Fine-tuning on 16 HPU cards (2 Gaudi2 nodes) takes around 9 minutes with a batch size of 32 (2 per device). @@ -226,52 +75,6 @@ Following the RoBERTa paper, we use dynamic masking rather than static masking. converge slightly slower (over-fitting takes more epochs). 
-### Single-card Training - -```bash -python run_mlm.py \ - --model_name_or_path roberta-base \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-mlm \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/roberta-base \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -To run on your own training and validation files, use the following command: - -```bash -python run_mlm.py \ - --model_name_or_path roberta-base \ - --train_file path_to_train_file \ - --validation_file path_to_validation_file \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-mlm \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/roberta-base \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script -concatenates all texts and then splits them into blocks of the same length). - -**Note:** On HPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make sure all your batches have the same length. - - ### Multi-card Training ```bash @@ -324,78 +127,6 @@ python run_clm.py \ --bf16 ``` - -## Using DeepSpeed - -Multi-card examples can be simply adapted to be run with DeepSpeed. Here is the CLM example with GPT2-XL: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path gpt2-xl \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --learning_rate 4e-4 \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. 
-Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - -Here is another example with Bloom-7B1: - -```bash -DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 PT_HPU_MAX_COMPOUND_OP_SYNC=1 PT_HPU_MAX_COMPOUND_OP_SIZE=1 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path bigscience/bloom-7b1 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 8 \ - --do_train \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/roberta-base \ - --use_habana \ - --use_lazy_mode \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 \ - --save_strategy "no" \ - --learning_rate 1e-04 \ - --deepspeed path_to_my_deepspeed_config -``` -[This](https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_3_gaudi1.json) is a DeepSpeed configuration you can use to train this model on Gaudi1. - - ## Inference To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... @@ -456,141 +187,6 @@ python3 run_lora_clm.py \ --validation_split_percentage 4 \ --adam_epsilon 1e-08 ``` -- Single-card finetuning of Falcon-40B: -```bash -PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python3 run_lora_clm.py \ - --model_name_or_path tiiuae/falcon-40b \ - --dataset_name timdettmers/openassistant-guanaco \ - --bf16 True \ - --output_dir ./model_lora_falcon \ - --num_train_epochs 3 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --max_grad_norm 0.3 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank=64 \ - --lora_alpha=16 \ - --lora_dropout=0.1 \ - --lora_target_modules "query_key_value" "dense" "dense_h_to_4h" "dense_4h_to_h" \ - --dataset_concatenation \ - --max_seq_length 256 \ - --low_cpu_mem_usage True \ - --adam_epsilon 1e-08 \ - --do_eval \ - --validation_split_percentage 5 -``` - -- Multi-card finetuning of Llama1-7B: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path huggyllama/llama-7b \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./model_lora_llama_ddp \ - --num_train_epochs 3 \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 2 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --max_grad_norm 0.3 \ - --logging_steps 1 \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --lora_rank=8 \ - --lora_alpha=16 \ - --lora_dropout=0.05 \ - --lora_target_modules "q_proj" "v_proj" \ - --dataset_concatenation \ - --max_seq_length 512 \ - --ddp_bucket_cap_mb 50 \ - --adam_epsilon 1e-08 \ - --validation_split_percentage 4 \ - --low_cpu_mem_usage True -``` - 
-- Multi-card finetuning of Llama2-7B with FP8: -```bash -PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./model_lora_llama \ - --num_train_epochs 3 \ - --per_device_train_batch_size 16 \ - --gradient_accumulation_steps 1 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --max_grad_norm 0.3 \ - --logging_steps 20 \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 18 \ - --lora_rank=8 \ - --lora_alpha=16 \ - --lora_dropout=0.05 \ - --lora_target_modules "q_proj" "v_proj" \ - --dataset_concatenation \ - --max_seq_length 512 \ - --ddp_bucket_cap_mb 50 \ - --adam_epsilon 1e-08 \ - --validation_split_percentage 10 \ - --low_cpu_mem_usage True \ - --pipelining_fwd_bwd \ - --fp8 True -``` - -- Multi-card finetuning of codegen-16B-mono: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path Salesforce/codegen-16B-mono \ - --dataset_name b-mc2/sql-create-context \ - --sql_prompt \ - --bf16 True \ - --output_dir ./finetuned-models/codegen-finetune-on-sql-create-context-hpu8-lora8-bs4 \ - --num_train_epochs 5 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 1e-4 \ - --logging_steps 1 \ - --dataset_concatenation \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs_for_inference \ - --lora_target_modules "qkv_proj" \ - --lora_rank 8 \ - --do_eval \ - --validation_split_percentage 10 \ - --use_cache False -``` - Multi-card finetuning of gemma2 using chat template: ```bash @@ -740,43 +336,6 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \ --flash_attention_causal_mask True ``` -- Multi-card finetuning of Falcon-180B: - - Falcon-180B example command saves only the LoRA parameters at end - - For inference we need to merge the pretrained model and LoRA weights -```bash -PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_lora_clm.py \ - --model_name_or_path tiiuae/falcon-180B \ - --dataset_name timdettmers/openassistant-guanaco \ - --bf16 True \ - --output_dir ./model_lora_falcon_ddp \ - --num_train_epochs 3 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 4e-4 \ - --max_grad_norm 0.3 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank=64 \ - --lora_alpha=16 \ - --lora_dropout=0.1 \ - --lora_target_modules "query_key_value" "dense" "dense_h_to_4h" "dense_4h_to_h" \ - --dataset_concatenation \ - --max_seq_length 256 \ - --adam_epsilon 1e-08 \ - --do_eval \ - --validation_split_percentage 5 \ - --deepspeed ds_falcon_180b_z3.json -``` Default `peft_type` is `lora`, you could enable adalora or ia3 using `--peft_type adalora` or `--peft_type ia3`, or enable llama-adapter for llama model using `--peft_type llama-adapter`, or enable ln-tuning using `--peft_type ln_tuning`, or enable vera using `--peft_type vera`. 
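For illustration, here is a minimal, hedged sketch of switching the PEFT method: it reuses placeholder arguments copied from the Llama LoRA examples above and only changes the method flag. Some methods may need their own method-specific options (for example, target modules), so check the script's arguments before running.

```bash
# Sketch only: take any full LoRA command above and change just the PEFT method.
# The model, dataset, and output directory below are placeholders from the examples above;
# method-specific options (e.g. target modules) may also need to be adjusted.
python3 run_lora_clm.py \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset_name tatsu-lab/alpaca \
    --bf16 True \
    --output_dir ./model_ia3_llama \
    --num_train_epochs 3 \
    --per_device_train_batch_size 16 \
    --do_train \
    --use_habana \
    --use_lazy_mode \
    --peft_type ia3
```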
#### Custom Files @@ -824,7 +383,7 @@ The format of the text files (with extensions .text or .txt) is expected to be ### Prompt/Prefix/P-tuning To run prompt tuning finetuning, you can use `run_prompt_tuning_clm.py`. -Here are single-/multi-device command examples for Llama2-7B: +Here are single-card command examples for Llama2-7B: - single-card finetuning of meta-llama/Llama-2-7b-hf with dataset "ought/raft" and config "twitter_complaints": ```bash python3 run_prompt_tuning_clm.py \ @@ -844,25 +403,6 @@ python3 run_prompt_tuning_clm.py \ --use_lazy_mode ``` -- multi-card finetuning of meta-llama/Llama-2-7b-hf with dataset "ought/raft" and config "twitter_complaints": -```bash -python3 ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_prompt_tuning_clm.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --output_dir prompt_tuning_out \ - --bf16 True \ - --report_to=none \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 1 \ - --low_cpu_mem_usage True \ - --logging_steps 1 \ - --do_train \ - --num_train_epochs 50 \ - --do_eval \ - --use_habana \ - --use_lazy_mode -``` Default `peft_type` is `prompt_tuning`, you could enable prefix-tuning or p-tuning using `--peft_type prefix_tuning` or `--peft_type p_tuning`. Use the prompt finetuned model for text-generation: diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md index c7414c777d..9cc9fb66b1 100755 --- a/examples/question-answering/README.md +++ b/examples/question-answering/README.md @@ -33,163 +33,6 @@ First, you should install the requirements: pip install -r requirements.txt ``` -## Fine-tuning BERT on SQuAD1.1 - -For the following cases, an example of a Gaudi configuration file is given -[here](https://github.com/huggingface/optimum-habana#how-to-use-it). - - -### Single-card Training - -This example code fine-tunes BERT on the SQuAD1.1 dataset. 
- -```bash -python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - -For torch.compile mode, -```bash -PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --torch_compile_backend hpu_backend \ - --torch_compile \ - --use_lazy_mode false \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - -### Multi-card Training - -Here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using the `run_qa` script, with 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - -For torch.compile mode, -```bash -PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --torch_compile_backend hpu_backend \ - --torch_compile \ - --use_lazy_mode false \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - - -### Using DeepSpeed - -Similarly to multi-card training, here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using DeepSpeed with 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config \ - --sdp_on_bf16 -``` - -You can look at the 
[documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - ## Fine-tuning Llama on SQuAD1.1 > [!NOTE] @@ -224,77 +67,3 @@ python ../gaudi_spawn.py \ ## Inference To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... - -For instance, you can run inference with BERT on SQuAD on 1 Gaudi card with the following command: -```bash -python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_eval \ - --per_device_eval_batch_size 8 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --bf16 \ - --sdp_on_bf16 -``` - - -## Recommended Hyperparameters for Mixed Precision - -| | learning_rate | num_train_epochs | per_device_train_batch_size | per_device_eval_batch_size | -|----------------------------|:----:|:--:|:-:|:-:| -| BERT base | 3e-5 | 2 | 24 | 8 | -| BERT large | 3e-5 | 2 | 24 | 8 | -| RoBERTa base | 3e-5 | 2 | 12 | 8 | -| RoBERTa large | 3e-5 | 2 | 12 | 8 | -| ALBERT large (single-card) | 5e-5 | 2 | 32 | 4 | -| ALBERT large (multi-card) | 6e-5 | 2 | 32 | 4 | -| ALBERT XXL (single-card) | 5e-6 | 2 | 16 | 2 | -| ALBERT XXL (multi-card) | 5e-5 | 2 | 16 | 2 | -| DistilBERT | 5e-5 | 3 | 8 | 8 | -| meta-llama/Llama-2-13b-chat-hf (multi-card) | 3e-5 | 2 | 8 | 8 | -| FlagAlpha/Llama2-Chinese-13b-Chat (multi-card) | 3e-5 | 2 | 8 | 8 | - - -## Fine-tuning T5 on SQuAD2.0 - -The [`run_seq2seq_qa.py`](https://github.com/huggingface/optimum-habana/blob/main/examples/question-answering/run_seq2seq_qa.py) script is meant for encoder-decoder (also called seq2seq) Transformer models, such as T5 or BART. These models are generative, rather than discriminative. This means that they learn to generate the correct answer, rather than predicting the start and end position of the tokens of the answer. 
- -The following command fine-tunes T5 on the SQuAD2.0 dataset: - -```bash -python run_seq2seq_qa.py \ - --model_name_or_path t5-small \ - --gaudi_config_name Habana/t5 \ - --dataset_name squad_v2 \ - --version_2_with_negative \ - --context_column context \ - --question_column question \ - --answer_column answers \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 33 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/seq2seq_squad/ \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --sdp_on_bf16 \ - --bf16 -``` - -For multi-card and DeepSpeed runs, you can use `python ../gaudi_spawn.py --world_size 8 --use_mpi` and `python ../gaudi_spawn.py --world_size 8 --use_deepspeed` as shown in the previous sections. diff --git a/examples/summarization/README.md b/examples/summarization/README.md index 86ab88b790..bdaef78edf 100644 --- a/examples/summarization/README.md +++ b/examples/summarization/README.md @@ -179,65 +179,8 @@ python ../gaudi_spawn.py \ ## Using DeepSpeed -Here is an example on 8 HPUs on Gaudi2/Gaudi3 with DeepSpeed-ZeRO3 to fine-tune [FLAN-T5 XXL](https://huggingface.co/google/flan-t5-xxl): -```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=512 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_summarization.py \ - --model_name_or_path google/flan-t5-xxl \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config '"3.0.0"' \ - --source_prefix '"summarize: "' \ - --output_dir ./tst-summarization \ - --per_device_train_batch_size 22 \ - --per_device_eval_batch_size 22 \ - --learning_rate 1e-4 \ - --num_train_epochs 3 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --generation_max_length 129 \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --gradient_checkpointing \ - --adam_epsilon 1e-08 --logging_steps 1 \ - --deepspeed ds_flan_t5_z3_config_bf16.json -``` - -Here is an example on 8 HPUs on Gaudi2 with DeepSpeed-ZeRO2 to fine-tune t5-large: -```bash -PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ - --world_size 8 \ - --use_deepspeed run_summarization.py \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json \ - --do_train \ - --do_eval \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy no \ - --throughput_warmup_steps 15 \ - --model_name_or_path t5-large \ - --source_prefix '"summarize:"' \ - --dataset_name cnn_dailymail \ - --dataset_config '"3.0.0"' \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 20 \ - --per_device_eval_batch_size 20 \ - --max_train_samples 2000 \ - --torch_compile_backend hpu_backend \ - --torch_compile -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. +You can check the [DeepSpeed](https://github.com/huggingface/optimum-habana/tree/main/examples#deepspeed) section in Optimum Habana examples for how to run DeepSpeed. 
+You also can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. ## Inference @@ -267,23 +210,3 @@ python run_summarization.py \ --bf16_full_eval ``` -You can run inference with BART on the CNN-DailyMail dataset on 1 Gaudi card with the following command: -```bash -python run_summarization.py \ - --model_name_or_path facebook/bart-large-cnn \ - --do_predict \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_eval_batch_size 2 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/bart \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --throughput_warmup_steps 3 \ - --num_beams 1 -``` From 1b922233df7eda1c0398529b54e85701dcdd90c4 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Tue, 21 Jan 2025 21:14:16 +0000 Subject: [PATCH 2/5] Revert note --- examples/language-modeling/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index d0aa805b02..6de7af3936 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -96,6 +96,11 @@ python ../gaudi_spawn.py \ --bf16 ``` +If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script +concatenates all texts and then splits them into blocks of the same length). + +**Note:** On HPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make sure all your batches have the same length. + ### Training in torch.compile mode RoBERTa-Large model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, From 1c3809a34f37c5e6c0f2d908057b0dfa032383ab Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Fri, 24 Jan 2025 01:53:11 +0000 Subject: [PATCH 3/5] Change llama model name in question-answering README --- examples/question-answering/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md index 9cc9fb66b1..d7a83ea5c8 100755 --- a/examples/question-answering/README.md +++ b/examples/question-answering/README.md @@ -42,7 +42,7 @@ Here is a command you can run to train a Llama model for question answering: ```bash python ../gaudi_spawn.py \ --world_size 8 --use_deepspeed run_qa.py \ - --model_name_or_path FlagAlpha/Llama2-Chinese-13b-Chat \ + --model_name_or_path meta-llama/Llama-2-7b-chat-hf \ --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ --dataset_name squad \ --do_train \ From 1abd691853fbc270e8ab068648c46d278c38e666 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 5 Feb 2025 00:48:54 +0000 Subject: [PATCH 4/5] Revert GPT2/GPT-J examples in language modeling README --- examples/language-modeling/README.md | 103 +++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index 6de7af3936..23e1e5ea26 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -29,6 +29,109 @@ First, you should install the requirements: pip install -r requirements.txt ``` +## GPT2/GPT-J/GPT-NeoX and causal language modeling + 
+The following examples fine-tune GPT-2, GPT-J-6B and GPT-NeoX-20B on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before the tokenization). The loss here is the one of causal language modeling. + + +### Single-card Training (GPT2) + +```bash +python run_clm.py \ + --model_name_or_path gpt2 \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --do_train \ + --do_eval \ + --output_dir /tmp/test-clm \ + --gaudi_config_name Habana/gpt2 \ + --use_habana \ + --use_lazy_mode \ + --use_hpu_graphs_for_inference \ + --throughput_warmup_steps 3 +``` + +This takes about 13 minutes to train on a single HPU. It reaches +a perplexity of about 20.9963 once fine-tuned on the dataset. + +To run on your own training and validation files, use the following command: + +```bash +python run_clm.py \ + --model_name_or_path gpt2 \ + --train_file path_to_train_file \ + --validation_file path_to_validation_file \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ + --do_train \ + --do_eval \ + --output_dir /tmp/test-clm \ + --gaudi_config_name Habana/gpt2 \ + --use_habana \ + --use_lazy_mode \ + --use_hpu_graphs_for_inference \ + --throughput_warmup_steps 3 +``` + + +### Multi-card Training (GPT2) + +```bash +python ../gaudi_spawn.py \ + --world_size 8 --use_mpi run_clm.py \ + --model_name_or_path gpt2 \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --do_train \ + --do_eval \ + --output_dir /tmp/test-clm \ + --gaudi_config_name Habana/gpt2 \ + --use_habana \ + --use_lazy_mode \ + --use_hpu_graphs_for_inference \ + --gradient_checkpointing \ + --use_cache False \ + --throughput_warmup_steps 3 +``` + +This takes about 4 minutes to train on 8 HPUs. It reaches +a perplexity of 21.7968 once fine-tuned on the dataset. + + +### Multi-card Training with Deepspeed (GPT-J) + +The following command triggers the fine-tuning of [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b) on WikiText-2 with DeepSpeed ZeRO-2. +Fine tuning on 8 HPU cards takes around 6 minutes with a batch size of 32 (4 per device). +It reaches a perplexity of 14.011. + +```bash +python ../gaudi_spawn.py \ + --world_size 8 --use_deepspeed run_clm.py \ + --model_name_or_path EleutherAI/gpt-j-6b \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --do_train \ + --do_eval \ + --output_dir /tmp/test-clm-xl-1 \ + --gaudi_config_name Habana/gpt2 \ + --use_habana \ + --use_lazy_mode \ + --gradient_checkpointing \ + --use_hpu_graphs_for_inference \ + --throughput_warmup_steps 3 \ + --deepspeed path_for_deepspeed_config +``` + +This example has been validated with the following DeepSpeed ZeRO-2 config: https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_2.json + + + ## GPT-NeoX and causal language modeling The following examples fine-tune GPT-NeoX-20B on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before the tokenization). The loss here is the one of causal language modeling. 
From ad88caa2bc385d211b8ebf203f41b4a5a1b03c88 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 5 Feb 2025 00:57:14 +0000 Subject: [PATCH 5/5] Revert README --- examples/language-modeling/README.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index 23e1e5ea26..5cce1528dc 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -131,13 +131,7 @@ python ../gaudi_spawn.py \ This example has been validated with the following DeepSpeed ZeRO-2 config: https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_2.json - -## GPT-NeoX and causal language modeling - -The following examples fine-tune GPT-NeoX-20B on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before the tokenization). The loss here is the one of causal language modeling. - - -### Multi-Node Training with Deepspeed (GPT-NeoX) +## Multi-Node Training with Deepspeed (GPT-NeoX) The following command triggers the fine-tuning of [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) on WikiText-2 with Deepspeed ZeRO-2. Fine-tuning on 16 HPU cards (2 Gaudi2 nodes) takes around 9 minutes with a batch size of 32 (2 per device).