9191
9292
def get_model(
    model_id: str,
    revision: Optional[str],
    sharded: bool,
    quantize: Optional[str],
    trust_remote_code: bool,
) -> Model:
    """Instantiate the appropriate ``Model`` subclass for *model_id*.

    Dispatch order:
      1. Hard-coded overrides keyed on the model id ("facebook/galactica",
         "bigcode/" prefix).
      2. The ``model_type`` reported by the model's ``AutoConfig``.
      3. Generic AutoModel fallbacks, including ``auto_map`` remote-code
         models when ``trust_remote_code`` is set.

    Args:
        model_id: Hub repo id or local path of the model to load.
        revision: Optional git revision (branch/tag/sha) of the weights.
        sharded: Shard the model across all available devices.
        quantize: Optional quantization scheme, forwarded to the model class.
        trust_remote_code: Allow executing modeling code shipped inside the
            model repository (required for ``auto_map`` models).

    Raises:
        NotImplementedError: Sharding was requested for a model that needs
            flash attention, but flash attention is unavailable.
        ValueError: Sharding was requested for an AutoModel fallback, or the
            model type is not supported at all.
    """
    if "facebook/galactica" in model_id:
        galactica_cls = GalacticaSharded if sharded else Galactica
        return galactica_cls(
            model_id,
            revision,
            quantize=quantize,
            trust_remote_code=trust_remote_code,
        )

    if model_id.startswith("bigcode/"):
        if sharded:
            # Sharded Santacoder exists only as a flash-attention model.
            if not FLASH_ATTENTION:
                raise NotImplementedError(
                    FLASH_ATT_ERROR_MESSAGE.format("Sharded Santacoder")
                )
            return FlashSantacoderSharded(
                model_id,
                revision,
                quantize=quantize,
                trust_remote_code=trust_remote_code,
            )
        santacoder_cls = FlashSantacoder if FLASH_ATTENTION else SantaCoder
        return santacoder_cls(
            model_id,
            revision,
            quantize=quantize,
            trust_remote_code=trust_remote_code,
        )

    config = AutoConfig.from_pretrained(
        model_id, revision=revision, trust_remote_code=trust_remote_code
    )
    model_type = config.model_type

    if model_type == "gpt_bigcode":
        # Same architecture as the "bigcode/" prefix branch above.
        if sharded:
            if not FLASH_ATTENTION:
                raise NotImplementedError(
                    FLASH_ATT_ERROR_MESSAGE.format("Sharded Santacoder")
                )
            return FlashSantacoderSharded(
                model_id,
                revision,
                quantize=quantize,
                trust_remote_code=trust_remote_code,
            )
        santacoder_cls = FlashSantacoder if FLASH_ATTENTION else SantaCoder
        return santacoder_cls(
            model_id,
            revision,
            quantize=quantize,
            trust_remote_code=trust_remote_code,
        )

    if model_type == "bloom":
        bloom_cls = BLOOMSharded if sharded else BLOOM
        return bloom_cls(
            model_id,
            revision,
            quantize=quantize,
            trust_remote_code=trust_remote_code,
        )

    if model_type == "gpt_neox":
        if sharded:
            neox_cls = FlashNeoXSharded if FLASH_ATTENTION else GPTNeoxSharded
        else:
            neox_cls = FlashNeoX if FLASH_ATTENTION else CausalLM
        return neox_cls(
            model_id,
            revision,
            quantize=quantize,
            trust_remote_code=trust_remote_code,
        )

    if model_type == "llama":
        if sharded:
            # Sharded Llama exists only as a flash-attention model.
            if not FLASH_ATTENTION:
                raise NotImplementedError(
                    FLASH_ATT_ERROR_MESSAGE.format("Sharded Llama")
                )
            return FlashLlamaSharded(
                model_id,
                revision,
                quantize=quantize,
                trust_remote_code=trust_remote_code,
            )
        llama_cls = FlashLlama if FLASH_ATTENTION else CausalLM
        return llama_cls(
            model_id,
            revision,
            quantize=quantize,
            trust_remote_code=trust_remote_code,
        )

    # NOTE: was `config.model_type` in the original — same value, made
    # consistent with the other branches.
    if model_type == "opt":
        opt_cls = OPTSharded if sharded else OPT
        return opt_cls(
            model_id,
            revision,
            quantize=quantize,
            trust_remote_code=trust_remote_code,
        )

    if model_type == "t5":
        t5_cls = T5Sharded if sharded else Seq2SeqLM
        return t5_cls(
            model_id,
            revision,
            quantize=quantize,
            trust_remote_code=trust_remote_code,
        )

    if sharded:
        raise ValueError("sharded is not supported for AutoModel")

    if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
        return CausalLM(
            model_id, revision, quantize=quantize, trust_remote_code=trust_remote_code
        )
    if model_type in modeling_auto.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
        return Seq2SeqLM(
            model_id, revision, quantize=quantize, trust_remote_code=trust_remote_code
        )

    # Last resort: configs that declare custom modeling classes via
    # `auto_map`; only honored when the caller opted into remote code.
    auto_map = getattr(config, "auto_map", None)
    if trust_remote_code and auto_map is not None:
        # BUG FIX: the original tested `in auto_map.keys` (missing call
        # parentheses) for the Seq2SeqLM case, which raises TypeError at
        # runtime. Plain dict membership is equivalent and idiomatic.
        if "AutoModelForCausalLM" in auto_map:
            return CausalLM(
                model_id,
                revision,
                quantize=quantize,
                trust_remote_code=trust_remote_code,
            )
        if "AutoModelForSeq2SeqLM" in auto_map:
            return Seq2SeqLM(
                model_id,
                revision,
                quantize=quantize,
                trust_remote_code=trust_remote_code,
            )

    raise ValueError(f"Unsupported model type {model_type}")
0 commit comments