5 changes: 3 additions & 2 deletions examples/dora_finetuning/QDoRA_finetuning.ipynb
@@ -6,7 +6,7 @@
"id": "CV_gQs58bsvM"
},
"source": [
"# Fine-tuning [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) Dataset using QDora (quantized Lora w/ use_dora=True) on T4 Free Colab GPU."
"# Fine-tuning [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) Dataset using QDora (quantized Lora w/ use_dora=True)."
]
},
{
@@ -1010,6 +1010,7 @@
"top_p = 0.9\n",
"temperature = 0.7\n",
"user_question = \"What is the purpose of quantization in LLMs?\"\n",
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
"\n",
"\n",
"prompt = (\n",
@@ -1021,7 +1022,7 @@
"\n",
"\n",
"def generate(model, user_question, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature):\n",
" inputs = tokenizer(prompt.format(user_question=user_question), return_tensors=\"pt\").to(\"cuda\")\n",
" inputs = tokenizer(prompt.format(user_question=user_question), return_tensors=\"pt\").to(device)\n",
"\n",
" outputs = model.generate(\n",
" **inputs,\n",
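The notebook change above swaps the hard-coded `"cuda"` target for a detected device. A rough standalone sketch of that pattern (assuming a PyTorch build that ships `torch.accelerator`; older builds take the fallback branch, and the extra `None` check and CPU fallback are safety additions not in the notebook itself):

```python
import torch

# Resolve the best available accelerator; torch.accelerator only exists in
# recent PyTorch releases, so older builds fall through to the CUDA/CPU check.
if hasattr(torch, "accelerator") and torch.accelerator.current_accelerator() is not None:
    device = torch.accelerator.current_accelerator().type  # e.g. "cuda", "xpu", "mps"
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")
x = torch.randn(2, 3).to(device)  # tokenized inputs move to the device the same way
```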
3 changes: 1 addition & 2 deletions examples/dora_finetuning/README.md
@@ -13,7 +13,7 @@ from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer
from datasets import load_dataset

-model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", device_map="cuda")
+model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
lora_config = LoraConfig(
@@ -70,7 +70,6 @@ python dora_finetuning.py \
--quantize \
--eval_step 10 \
--save_step 100 \
--device "cuda:0" \
--lora_r 16 \
--lora_alpha 32 \
--lora_dropout 0.05 \
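With the `--device "cuda:0"` flag dropped from the README command, placement at load time rests on `device_map="auto"`. A minimal sketch of what that means in isolation (the `hf_device_map` attribute is populated by transformers/accelerate whenever a device map is used):

```python
from transformers import AutoModelForCausalLM

# device_map="auto" lets accelerate choose placement (CUDA, XPU, MPS, or CPU)
# rather than pinning the whole model to a hard-coded "cuda" device.
model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", device_map="auto")
print(model.hf_device_map)  # module-to-device mapping chosen by accelerate
```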
22 changes: 15 additions & 7 deletions examples/dora_finetuning/dora_finetuning.py
@@ -39,22 +39,27 @@ def train_model(
hf_token = os.getenv("HF_TOKEN")

# Setup device
-    device = torch.device(device)
+    if device == "auto":
+        device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+    else:
+        device = torch.device(device)
print(f"Using device: {device}")

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token)

# QDoRA (quantized dora): IF YOU WANNA QUANTIZE THE MODEL
if quantize:
+        if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) or torch.xpu.is_available():
+            bnb_4bit_compute_dtype = torch.bfloat16
+        else:
+            bnb_4bit_compute_dtype = torch.float16
model = AutoModelForCausalLM.from_pretrained(
base_model,
token=hf_token,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
-                bnb_4bit_compute_dtype=(
-                    torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
-                ),
+                bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
),
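The hunk above also hoists the 4-bit compute dtype into its own branch so XPU devices get bfloat16 as well. A hedged sketch of that decision on its own (the `hasattr(torch, "xpu")` guard is an extra safety check for older PyTorch builds, not part of this diff):

```python
import torch
from transformers import BitsAndBytesConfig

# Prefer bfloat16 when the backend supports it (CUDA with bf16, or XPU);
# otherwise fall back to float16 as the 4-bit compute dtype.
if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) or (
    hasattr(torch, "xpu") and torch.xpu.is_available()
):
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
```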
@@ -117,8 +122,11 @@ def tokenize_function(examples):
hub_token=hf_token,
)

-    # Clear CUDA cache to free memory
-    torch.cuda.empty_cache()
+    # Clear device cache to free memory
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    elif torch.xpu.is_available():
+        torch.xpu.empty_cache()

# Initialize the Trainer
trainer = Trainer(
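Cache clearing now branches on the backend instead of assuming CUDA. Wrapped as a small helper for illustration (the function name and the `hasattr` guard are ours, not part of the PR):

```python
import torch

def empty_device_cache() -> None:
    """Release cached allocator memory on whichever accelerator is present."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()
    # Nothing to clear on CPU-only machines.

empty_device_cache()
```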
@@ -162,7 +170,7 @@ def tokenize_function(examples):
parser.add_argument("--quantize", action="store_true", help="Use quantization")
parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval")
parser.add_argument("--save_step", type=int, default=100, help="Save step interval")
parser.add_argument("--device", type=str, default="cuda:0", help="Device to use for training")
parser.add_argument("--device", type=str, default="auto", help="Device to use for training")
parser.add_argument("--lora_r", type=int, default=8, help="LoRA rank")
parser.add_argument("--lora_alpha", type=int, default=16, help="LoRA alpha")
parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout rate")