huggingface
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 176 additions & 127 deletions
diff --git a/Cargo.toml b/Cargo.toml
Lines changed: 1 addition & 1 deletion
diff --git a/Dockerfile b/Dockerfile
Lines changed: 1 addition & 1 deletion
diff --git a/Dockerfile_amd b/Dockerfile_amd
Lines changed: 1 addition & 1 deletion
diff --git a/docs/openapi.json b/docs/openapi.json
Lines changed: 35 additions & 5 deletions
diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/server/Makefile b/server/Makefile
Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "1.4.0"
+version = "1.4.1"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
 
@@ -225,7 +225,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_cuda.txt && \
-    pip install ".[bnb, accelerate, quantize, peft]" --no-cache-dir
+    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
 
@@ -150,7 +150,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, peft]" --no-cache-dir
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
 
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.4.0"
+    "version": "1.4.1"
   },
   "paths": {
     "/": {
@@ -590,8 +590,11 @@
             "minimum": 0
           },
           "logprobs": {
-            "type": "number",
-            "format": "float",
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/ChatCompletionLogprobs"
+              }
+            ],
             "nullable": true
           }
         }
@@ -710,7 +713,7 @@
           "presence_penalty": {
             "type": "number",
             "format": "float",
-            "description": "UNUSED\nNumber between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics",
+            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics",
             "example": 0.1,
             "nullable": true
           },
@@ -734,7 +737,7 @@
           "top_logprobs": {
             "type": "integer",
             "format": "int32",
-            "description": "UNUSED\nAn integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.",
+            "description": "An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.",
             "example": "5",
             "nullable": true,
             "minimum": 0
@@ -870,6 +873,22 @@
             "default": "false",
             "example": true
           },
+          "frequency_penalty": {
+            "type": "number",
+            "format": "float",
+            "default": "null",
+            "example": 0.1,
+            "nullable": true,
+            "exclusiveMinimum": -2
+          },
+          "grammar": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/GrammarType"
+              }
+            ],
+            "nullable": true
+          },
           "max_new_tokens": {
             "type": "integer",
             "format": "int32",
@@ -1026,6 +1045,12 @@
             "example": "null",
             "nullable": true
           },
+          "max_batch_size": {
+            "type": "integer",
+            "example": "null",
+            "nullable": true,
+            "minimum": 0
+          },
           "max_batch_total_tokens": {
             "type": "integer",
             "format": "int32",
@@ -1119,6 +1144,11 @@
             "type": "string",
             "example": "My name is David and I"
           },
+          "name": {
+            "type": "string",
+            "example": "\"David\"",
+            "nullable": true
+          },
           "role": {
             "type": "string",
             "example": "user"
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.4.0"
+version = "1.4.1"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry <[email protected]>"]
 
 
@@ -23,7 +23,7 @@ install-megablocks:
 install: gen-server
 	pip install pip --upgrade
 	pip install -r requirements_cuda.txt
-	pip install -e ".[bnb, accelerate, quantize, peft]"
+	pip install -e ".[bnb, accelerate, quantize, peft, outlines]"
 
 run-dev:
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded