Commit e9669a4

feat(server): do not use device_map auto on single GPU (#362)
1 parent: cfaa858

File tree

2 files changed: 8 additions & 2 deletions

server/text_generation_server/models/causal_lm.py

Lines changed: 4 additions & 1 deletion
@@ -468,9 +468,12 @@ def __init__(
             model_id,
             revision=revision,
             torch_dtype=dtype,
-            device_map="auto" if torch.cuda.is_available() else None,
+            device_map="auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
             load_in_8bit=quantize == "bitsandbytes",
         )
+        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
+            model = model.cuda()
+
         tokenizer.pad_token_id = (
             model.config.pad_token_id
             if model.config.pad_token_id is not None
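
Taken together, the hunk swaps accelerate's multi-device dispatch for a plain .cuda() move whenever exactly one GPU is present. Below is a minimal, self-contained sketch of the resulting loading logic, assuming transformers' AutoModelForCausalLM and a hypothetical model_id; the real constructor also forwards revision and load_in_8bit:

import torch
from transformers import AutoModelForCausalLM

model_id = "gpt2"  # hypothetical stand-in for the server's model_id argument
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Use accelerate's device_map="auto" sharding only when more than one GPU
# is visible; a single GPU gains nothing from the dispatch machinery.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=dtype,
    device_map="auto"
    if torch.cuda.is_available() and torch.cuda.device_count() > 1
    else None,
)

# With exactly one GPU, place the whole model on it explicitly instead.
if torch.cuda.is_available() and torch.cuda.device_count() == 1:
    model = model.cuda()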

server/text_generation_server/models/seq2seq_lm.py

Lines changed: 4 additions & 1 deletion
@@ -518,9 +518,12 @@ def __init__(
             model_id,
             revision=revision,
             torch_dtype=dtype,
-            device_map="auto" if torch.cuda.is_available() else None,
+            device_map="auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
             load_in_8bit=quantize == "bitsandbytes",
         )
+        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
+            model = model.cuda()
+
         tokenizer = AutoTokenizer.from_pretrained(
             model_id, revision=revision, padding_side="left", truncation_side="left"
         )
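
The seq2seq constructor receives the identical change, differing only in the model class it loads. As a quick sanity check (a sketch, assuming the model variable from the loading sketch above), the parameter devices can be inspected after loading:

# Expect {device(type='cuda', index=0)} on a single-GPU machine after the
# explicit .cuda() call, and {device(type='cpu')} when no GPU is present.
print({p.device for p in model.parameters()})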
