
Commit 04e1af9

drbh and datavistics authored
Enable multiple LoRa adapters (#2010)
* feat: first draft load multiple lora
* feat: load weights within layer and refactor lora pass
* fix: refactor and reduce lora math
* feat: baseline impl single request multi lora support
* feat: prefer lorax implementation and port loading logic
* fix: prefer adapter_data and refactors
* feat: perfer loraxs custom punica kernels and add mlp loras
* fix: adjust batch for bgmv
* fix: adjust adapter_segments logic when in batch
* fix: refactor and move changes to v3 proto
* fix: pass model_id for all flash causal lms
* fix: pass model_id for all causal and seq2seq lms
* fix: add model_id to model test
* feat: add lora support to mistral and refactors
* feat: prefer model id in request
* fix: include rust code for adapter id
* feat: bump launcher and add new lora docs
* feat: support base model generation and refactors
* fix: rename doc to retry ci build
* feat: support if vlm models
* fix: add adapter_data param and avoid missing layers
* fix: add adapter_data param to phi and neox
* fix: update all models forwards to include adapter_data
* fix: add model_id to IdeficsCausalLM
* Update lora.md — Fixed a typo
* Update lora.md — Fixing spam image
* fix: add lora kernel to dockerfile, support running without kernels and refactors
* fix: avoid dockerfile conflict
* fix: refactors and adjust flash llama lora logic
* fix: skip llama test due to CI issue (temp)
* fix: skip llama test CI (temp) 2
* fix: revert skips and prefer updated ci token for tests
* fix: refactors and helpful comments
* fix: add noop in TensorParallelAdapterRowLinear too
* fix: refactor and move shard_lora_weights logic
* fix: exit early if no adapter_data

---------

Co-authored-by: Derek <[email protected]>
1 parent a2a97b0 commit 04e1af9


79 files changed: +2785 −76 lines

Dockerfile

Lines changed: 9 additions & 1 deletion
```diff
@@ -145,6 +145,13 @@ COPY server/marlin/ .
 # Build specific version of transformers
 RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 
+# Build Lorax Punica kernels
+FROM kernel-builder as lorax-punica-builder
+WORKDIR /usr/src
+COPY server/Makefile-lorax-punica Makefile
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
+
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
 WORKDIR /usr/src
@@ -215,6 +222,7 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
 COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Copy build artifacts from marlin kernels builder
 COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
@@ -266,4 +274,4 @@ COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
 RUN chmod +x /tgi-entrypoint.sh
 
 ENTRYPOINT ["/tgi-entrypoint.sh"]
-CMD ["--json-output"]
+# CMD ["--json-output"]
```
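
For a quick check outside the Docker build, the same kernels can likely be compiled by invoking the new Makefile directly. This is only a minimal sketch, assuming a working CUDA toolchain and reusing the arch list and `build-lorax-punica` target from the stage above:

```bash
# Hypothetical local build mirroring the lorax-punica-builder stage
# (assumes CUDA and the Python build dependencies are already installed)
cd server
TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make -f Makefile-lorax-punica build-lorax-punica
```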

benchmark/src/generation.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -157,6 +157,7 @@ async fn prefill(
                 top_n_tokens: top_n_tokens.unwrap_or(0),
                 blocks: vec![],
                 slots: vec![],
+                adapter_id: None,
             })
             .collect();
```

docs/source/_toctree.yml

Lines changed: 4 additions & 1 deletion
```diff
@@ -60,6 +60,9 @@
   - local: conceptual/speculation
     title: Speculation (Medusa, ngram)
   - local: conceptual/guidance
-    title: How Guidance Works (via outlines)
+    title: How Guidance Works (via outlines
+  - local: conceptual/lora
+    title: LoRA (Low-Rank Adaptation)
+
 
   title: Conceptual Guides
```

docs/source/basic_tutorials/launcher.md

Lines changed: 8 additions & 0 deletions
````diff
@@ -416,6 +416,14 @@ Options:
           [env: MAX_CLIENT_BATCH_SIZE=]
           [default: 4]
 
+```
+## LORA_ADAPTERS
+```shell
+      --lora-adapters <LORA_ADAPTERS>
+          Lora Adapters a list of adapter ids i.e. `repo/adapter1,repo/adapter2` to load during startup that will be available to callers via the `adapter_id` field in a request
+
+          [env: LORA_ADAPTERS=]
+
 ```
 ## HELP
 ```shell
````
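
To illustrate the new option, a launch command might look like the following sketch; the base model id is only a placeholder, and `--model-id` is the launcher's existing flag for selecting it:

```bash
# Sketch: preload two LoRA adapters at startup (base model id is a placeholder)
text-generation-launcher \
    --model-id mistralai/Mistral-7B-v0.1 \
    --lora-adapters predibase/customer_support,predibase/dbpedia

# Equivalent form using the environment variable documented above
LORA_ADAPTERS=predibase/customer_support,predibase/dbpedia \
    text-generation-launcher --model-id mistralai/Mistral-7B-v0.1
```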

docs/source/conceptual/lora.md

Lines changed: 65 additions & 0 deletions
````md
# LoRA (Low-Rank Adaptation)

## What is LoRA?

LoRA is a technique that allows for efficient fine-tuning of a model while only updating a small portion of the model's weights. This is useful when you have a large model that has been pre-trained on a large dataset, but you want to fine-tune it on a smaller dataset or for a specific task.

LoRA works by adding a small number of additional weights to the model, which are used to adapt the model to the new dataset or task. These additional weights are learned during the fine-tuning process, while the rest of the model's weights are kept fixed.

## How is it used?

LoRA can be used in many ways, and the community is always finding new ways to use it. Technically, LoRA fine-tunes a large language model on a small dataset, but the use cases span a wide range of applications, such as:

- fine-tuning a language model on a small dataset
- fine-tuning a language model on a domain-specific dataset
- fine-tuning a language model on a dataset with limited labels

## Optimizing Inference with LoRA

LoRAs can be used during inference by multiplying the adapter weights with the model weights at each specified layer. This process can be computationally expensive, but thanks to work by [punica-ai](https://github.com/punica-ai/punica) and the [lorax](https://github.com/predibase/lorax) team, optimized kernels and frameworks have been developed to make this process more efficient. TGI leverages these optimizations in order to provide fast and efficient inference with multiple LoRA models.

## Serving multiple LoRA adapters with TGI

Once a LoRA model has been trained, it can be used to generate text or perform other tasks just like a regular language model. However, because the model has been fine-tuned on a specific dataset, it may perform better on that dataset than a model that has not been fine-tuned.

In practice, it's often useful to have multiple LoRA models, each fine-tuned on a different dataset or for a different task. This allows you to use the model that is best suited for a particular task or dataset.

Text Generation Inference (TGI) now supports loading multiple LoRA models at startup that can be used in generation requests. This feature is available starting from version `~2.0.6` and is compatible with LoRA models trained using the `peft` library.

### Specifying LoRA models

To use LoRA in TGI, when starting the server, you can specify the list of LoRA models to load using the `LORA_ADAPTERS` environment variable. For example:

```bash
LORA_ADAPTERS=predibase/customer_support,predibase/dbpedia
```

In the server logs, you will see the following message:

```txt
Loading adapter weights into model: predibase/customer_support
Loading adapter weights into model: predibase/dbpedia
```

## Generate text

You can then use these adapters in generation requests by specifying the `adapter_id` parameter in the request payload. For example:

```bash
curl 127.0.0.1:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
        "inputs": "Hello who are you?",
        "parameters": {
            "max_new_tokens": 40,
            "adapter_id": "predibase/customer_support"
        }
    }'
```

> **Note:** The LoRA feature is new and still being improved. If you encounter any issues or have any feedback, please let us know by opening an issue on the [GitHub repository](https://github.com/huggingface/text-generation-inference/issues/new/choose). Additionally, documentation and an improved client library will be published soon.

An updated tutorial with detailed examples will be published soon. Stay tuned!
````
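
Building on the curl example above, adapters are chosen per request, so a hedged sketch of switching between the two loaded adapters, or falling back to the base model by omitting `adapter_id` (the "support base model generation" change in this commit), could look like:

```bash
# Route the request to the second loaded adapter instead
curl 127.0.0.1:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
        "inputs": "What is Deep Learning?",
        "parameters": { "max_new_tokens": 40, "adapter_id": "predibase/dbpedia" }
    }'

# Omit adapter_id to generate with the base model weights only
curl 127.0.0.1:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
        "inputs": "What is Deep Learning?",
        "parameters": { "max_new_tokens": 40 }
    }'
```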

launcher/src/main.rs

Lines changed: 13 additions & 0 deletions
```diff
@@ -452,6 +452,11 @@ struct Args {
     /// Control the maximum number of inputs that a client can send in a single request
     #[clap(default_value = "4", long, env)]
     max_client_batch_size: usize,
+
+    /// Lora Adapters a list of adapter ids i.e. `repo/adapter1,repo/adapter2` to load during
+    /// startup that will be available to callers via the `adapter_id` field in a request.
+    #[clap(long, env)]
+    lora_adapters: Option<String>,
 }
 
 #[derive(Debug)]
@@ -485,6 +490,7 @@ fn shard_manager(
     max_total_tokens: usize,
     max_batch_size: Option<usize>,
     max_input_tokens: usize,
+    lora_adapters: Option<String>,
     otlp_endpoint: Option<String>,
     otlp_service_name: String,
     log_level: LevelFilter,
@@ -620,6 +626,11 @@
         envs.push(("MAX_BATCH_SIZE".into(), max_batch_size.to_string().into()));
     }
 
+    // Lora Adapters
+    if let Some(lora_adapters) = lora_adapters {
+        envs.push(("LORA_ADAPTERS".into(), lora_adapters.into()));
+    }
+
     // If huggingface_hub_cache is some, pass it to the shard
     // Useful when running inside a docker container
     if let Some(huggingface_hub_cache) = huggingface_hub_cache {
@@ -1060,6 +1071,7 @@ fn spawn_shards(
         let rope_scaling = args.rope_scaling;
         let rope_factor = args.rope_factor;
         let max_batch_size = args.max_batch_size;
+        let lora_adapters = args.lora_adapters.clone();
         thread::spawn(move || {
             shard_manager(
                 model_id,
@@ -1085,6 +1097,7 @@
                 max_total_tokens,
                 max_batch_size,
                 max_input_tokens,
+                lora_adapters,
                 otlp_endpoint,
                 otlp_service_name,
                 max_log_level,
```

proto/v3/generate.proto

Lines changed: 2 additions & 0 deletions
```diff
@@ -134,6 +134,8 @@ message Request {
     repeated uint32 blocks = 9;
     /// Paged attention slots
     repeated uint32 slots = 10;
+    /// LORA adapter index
+    optional string adapter_id = 11;
 }
 
 message Batch {
```

router/client/src/v3/client.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -177,6 +177,7 @@ impl Client {
                 }),
                 prefill_logprobs: true,
                 top_n_tokens: 20,
+                adapter_id: None,
             });
             n_tokens += max_input_length;
```

router/client/src/v3/sharded_client.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -244,6 +244,7 @@ impl Health for ShardedClient {
             // Block 0 is reserved for health checks
             blocks: vec![0],
             slots: (0..16).collect(),
+            adapter_id: None,
         };
         let batch = Batch {
             id: u64::MAX,
```

router/src/infer/v2/queue.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -429,6 +429,7 @@ mod tests {
                     stop_sequences: vec![],
                 },
                 top_n_tokens: 0,
+                adapter_id: None,
             },
             response_tx,
             span: info_span!("entry"),
```
