
Commit 211b54a

Narsil and Vinno97 authored

Rebased #617 (#868)

# What does this PR do?

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.

Co-authored-by: Vincent Brouwers <[email protected]>

1 parent 4486f78 commit 211b54a

File tree

23 files changed: +529 −34 lines

benchmark/src/generation.rs

Lines changed: 7 additions & 1 deletion

@@ -37,6 +37,7 @@ pub(crate) async fn generation_task(
     batch_size: Vec<u32>,
     sequence_length: u32,
     decode_length: u32,
+    top_n_tokens: Option<u32>,
     n_runs: usize,
     warmups: usize,
     parameters: NextTokenChooserParameters,
@@ -48,7 +49,7 @@ pub(crate) async fn generation_task(
     // End task if a message is received on shutdown_receiver
     // _shutdown_guard_sender will be dropped once the task is finished
     tokio::select! {
-        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, parameters, client, run_sender.clone()) => {
+        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, top_n_tokens, n_runs, warmups, parameters, client, run_sender.clone()) => {
             if let Err(err) = res {
                 run_sender.send(Err(err)).await.unwrap_or(());
             }
@@ -64,6 +65,7 @@ async fn generate_runs(
     batch_size: Vec<u32>,
     sequence_length: u32,
     decode_length: u32,
+    top_n_tokens: Option<u32>,
     n_runs: usize,
     warmups: usize,
     parameters: NextTokenChooserParameters,
@@ -82,6 +84,7 @@ async fn generate_runs(
                 b,
                 decode_length,
                 parameters.clone(),
+                top_n_tokens,
                 &mut client,
             )
             .await?;
@@ -97,6 +100,7 @@ async fn generate_runs(
                 b,
                 decode_length,
                 parameters.clone(),
+                top_n_tokens,
                 &mut client,
             )
             .await?;
@@ -130,6 +134,7 @@ async fn prefill(
     batch_size: u32,
     decode_length: u32,
     parameters: NextTokenChooserParameters,
+    top_n_tokens: Option<u32>,
     client: &mut ShardedClient,
 ) -> Result<(Prefill, CachedBatch), ClientError> {
     // Create requests
@@ -145,6 +150,7 @@ async fn prefill(
             stop_sequences: vec![],
             ignore_eos_token: true, // Will not stop even if a eos token is generated
         }),
+        top_n_tokens: top_n_tokens.unwrap_or(0),
     })
     .collect();

benchmark/src/lib.rs

Lines changed: 3 additions & 0 deletions

@@ -22,6 +22,7 @@ pub async fn run(
     batch_size: Vec<u32>,
     sequence_length: u32,
     decode_length: u32,
+    top_n_tokens: Option<u32>,
     n_runs: usize,
     warmups: usize,
     temperature: Option<f32>,
@@ -70,6 +71,7 @@ pub async fn run(
         batch_size.clone(),
         sequence_length,
         decode_length,
+        top_n_tokens,
         n_runs,
         warmups,
         parameters,
@@ -130,6 +132,7 @@ pub async fn run(
         tokenizer_name,
         sequence_length,
         decode_length,
+        top_n_tokens,
         n_runs,
         warmups,
         temperature,

benchmark/src/main.rs

Lines changed: 7 additions & 0 deletions

@@ -93,6 +93,11 @@ struct Args {
     /// decoding strategies, for full doc refer to the `text-generation-server`
     #[clap(long, env)]
     do_sample: bool,
+
+    /// Generation parameter in case you want to specifically test/debug particular
+    /// decoding strategies, for full doc refer to the `text-generation-server`
+    #[clap(long, env)]
+    top_n_tokens: Option<u32>,
 }
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
@@ -117,6 +122,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         watermark,
         do_sample,
         master_shard_uds_path,
+        top_n_tokens,
     } = args;
 
     let batch_size = batch_size.unwrap_or(vec![1, 2, 4, 8, 16, 32]);
@@ -173,6 +179,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         batch_size,
         sequence_length,
         decode_length,
+        top_n_tokens,
         runs,
         warmups,
         temperature,

benchmark/src/table.rs

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@ pub(crate) fn parameters_table(
     tokenizer_name: String,
     sequence_length: u32,
     decode_length: u32,
+    top_n_tokens: Option<u32>,
     n_runs: usize,
     warmups: usize,
     temperature: Option<f32>,
@@ -24,6 +25,7 @@ pub(crate) fn parameters_table(
     builder.push_record(["Model", &tokenizer_name]);
     builder.push_record(["Sequence Length", &sequence_length.to_string()]);
     builder.push_record(["Decode Length", &decode_length.to_string()]);
+    builder.push_record(["Top N Tokens", &format!("{top_n_tokens:?}")]);
     builder.push_record(["N Runs", &n_runs.to_string()]);
     builder.push_record(["Warmups", &warmups.to_string()]);
     builder.push_record(["Temperature", &format!("{temperature:?}")]);

clients/python/text_generation/client.py

Lines changed: 16 additions & 0 deletions

@@ -75,6 +75,7 @@ def generate(
         typical_p: Optional[float] = None,
         watermark: bool = False,
         decoder_input_details: bool = False,
+        top_n_tokens: Optional[int] = None,
     ) -> Response:
         """
         Given a prompt, generate the following text
@@ -113,6 +114,8 @@ def generate(
                 Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
             decoder_input_details (`bool`):
                 Return the decoder input token logprobs and ids
+            top_n_tokens (`int`):
+                Return the `n` most likely tokens at each step
 
         Returns:
             Response: generated response
@@ -134,6 +137,7 @@ def generate(
             typical_p=typical_p,
             watermark=watermark,
             decoder_input_details=decoder_input_details,
+            top_n_tokens=top_n_tokens
         )
         request = Request(inputs=prompt, stream=False, parameters=parameters)
 
@@ -164,6 +168,7 @@ def generate_stream(
         truncate: Optional[int] = None,
         typical_p: Optional[float] = None,
         watermark: bool = False,
+        top_n_tokens: Optional[int] = None,
     ) -> Iterator[StreamResponse]:
         """
         Given a prompt, generate the following stream of tokens
@@ -198,6 +203,8 @@ def generate_stream(
                 See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
             watermark (`bool`):
                 Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+            top_n_tokens (`int`):
+                Return the `n` most likely tokens at each step
 
         Returns:
             Iterator[StreamResponse]: stream of generated tokens
@@ -219,6 +226,7 @@ def generate_stream(
             truncate=truncate,
             typical_p=typical_p,
             watermark=watermark,
+            top_n_tokens=top_n_tokens,
         )
         request = Request(inputs=prompt, stream=True, parameters=parameters)
 
@@ -317,6 +325,7 @@ async def generate(
         typical_p: Optional[float] = None,
         watermark: bool = False,
         decoder_input_details: bool = False,
+        top_n_tokens: Optional[int] = None,
     ) -> Response:
         """
         Given a prompt, generate the following text asynchronously
@@ -355,6 +364,8 @@ async def generate(
                 Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
             decoder_input_details (`bool`):
                 Return the decoder input token logprobs and ids
+            top_n_tokens (`int`):
+                Return the `n` most likely tokens at each step
 
         Returns:
             Response: generated response
@@ -376,6 +387,7 @@ async def generate(
             truncate=truncate,
             typical_p=typical_p,
             watermark=watermark,
+            top_n_tokens=top_n_tokens,
         )
         request = Request(inputs=prompt, stream=False, parameters=parameters)
 
@@ -404,6 +416,7 @@ async def generate_stream(
         truncate: Optional[int] = None,
         typical_p: Optional[float] = None,
         watermark: bool = False,
+        top_n_tokens: Optional[int] = None,
     ) -> AsyncIterator[StreamResponse]:
         """
         Given a prompt, generate the following stream of tokens asynchronously
@@ -438,6 +451,8 @@ async def generate_stream(
                 See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
             watermark (`bool`):
                 Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
+            top_n_tokens (`int`):
+                Return the `n` most likely tokens at each step
 
         Returns:
             AsyncIterator[StreamResponse]: stream of generated tokens
@@ -459,6 +474,7 @@ async def generate_stream(
             truncate=truncate,
             typical_p=typical_p,
             watermark=watermark,
+            top_n_tokens=top_n_tokens,
         )
         request = Request(inputs=prompt, stream=True, parameters=parameters)

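To make the new parameter concrete, here is a minimal usage sketch of the synchronous Python client after this change. The endpoint URL and prompt are hypothetical; `top_n_tokens` and `details.top_tokens` follow the signatures and fields added in this commit.

```python
from text_generation import Client

# Hypothetical local endpoint; point this at a running text-generation-inference server.
client = Client("http://127.0.0.1:8080")

# Ask for the 5 most likely tokens at each generation step, in addition to the sampled one.
response = client.generate(
    "The capital of France is",
    max_new_tokens=10,
    top_n_tokens=5,
)

print(response.generated_text)

# `details.top_tokens` (added in types.py below) holds one list of candidate tokens per step.
if response.details.top_tokens:
    for step, candidates in enumerate(response.details.top_tokens):
        print(step, [(t.text, t.logprob) for t in candidates])
```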
clients/python/text_generation/types.py

Lines changed: 15 additions & 3 deletions

@@ -39,6 +39,8 @@ class Parameters(BaseModel):
     details: bool = False
     # Get decoder input token logprobs and ids
     decoder_input_details: bool = False
+    # Return the N most likely tokens at each step
+    top_n_tokens: Optional[int]
 
     @validator("best_of")
     def valid_best_of(cls, field_value, values):
@@ -101,6 +103,12 @@ def valid_typical_p(cls, v):
             raise ValidationError("`typical_p` must be > 0.0 and < 1.0")
         return v
 
+    @validator("top_n_tokens")
+    def valid_top_n_tokens(cls, v):
+        if v is not None and v <= 0:
+            raise ValidationError("`top_n_tokens` must be strictly positive")
+        return v
+
 
 class Request(BaseModel):
     # Prompt
@@ -125,9 +133,7 @@ def valid_best_of_stream(cls, field_value, values):
             and parameters.best_of > 1
             and field_value
         ):
-            raise ValidationError(
-                "`best_of` != 1 is not supported when `stream` == True"
-            )
+            raise ValidationError("`best_of` != 1 is not supported when `stream` == True")
         return field_value
 
 
@@ -179,6 +185,8 @@ class BestOfSequence(BaseModel):
     prefill: List[InputToken]
     # Generated tokens
     tokens: List[Token]
+    # Most likely tokens
+    top_tokens: Optional[List[List[Token]]]
 
 
 # `generate` details
@@ -193,6 +201,8 @@ class Details(BaseModel):
     prefill: List[InputToken]
     # Generated tokens
     tokens: List[Token]
+    # Most likely tokens
+    top_tokens: Optional[List[List[Token]]]
     # Additional sequences when using the `best_of` parameter
     best_of_sequences: Optional[List[BestOfSequence]]
 
@@ -219,6 +229,8 @@ class StreamDetails(BaseModel):
 class StreamResponse(BaseModel):
     # Generated token
     token: Token
+    # Most likely tokens
+    top_tokens: Optional[List[Token]]
     # Complete generated text
     # Only available when the generation is finished
     generated_text: Optional[str]

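And a streaming counterpart, again only a sketch against the same hypothetical endpoint: per the model changes above, each `StreamResponse` now carries a `top_tokens` list for the token it streams.

```python
from text_generation import Client

client = Client("http://127.0.0.1:8080")  # hypothetical endpoint

for chunk in client.generate_stream(
    "The capital of France is",
    max_new_tokens=10,
    top_n_tokens=3,
):
    # `top_tokens` holds the top candidates for this step; it is None when not requested.
    candidates = chunk.top_tokens or []
    print(chunk.token.text, [(t.text, t.logprob) for t in candidates])
```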
launcher/src/main.rs

Lines changed: 10 additions & 0 deletions

@@ -159,6 +159,14 @@ struct Args {
     #[clap(default_value = "4", long, env)]
     max_stop_sequences: usize,
 
+    /// This is the maximum allowed value for clients to set `top_n_tokens`.
+    /// `top_n_tokens` is used to return information about the `n` most likely
+    /// tokens at each generation step, instead of just the sampled token. This
+    /// information can be used for downstream tasks like classification or
+    /// ranking.
+    #[clap(default_value = "5", long, env)]
+    max_top_n_tokens: u32,
+
     /// This is the maximum allowed input length (expressed in number of tokens)
     /// for users. The larger this value, the longer prompt users can send which
     /// can impact the overall memory required to handle the load.
@@ -929,6 +937,8 @@ fn spawn_webserver(
         args.max_best_of.to_string(),
         "--max-stop-sequences".to_string(),
         args.max_stop_sequences.to_string(),
+        "--max-top-n-tokens".to_string(),
+        args.max_top_n_tokens.to_string(),
         "--max-input-length".to_string(),
         args.max_input_length.to_string(),
         "--max-total-tokens".to_string(),

proto/generate.proto

Lines changed: 15 additions & 0 deletions

@@ -91,6 +91,8 @@ message Request {
   StoppingCriteriaParameters stopping_parameters = 5;
   /// Return prefill logprobs
   bool prefill_logprobs = 6;
+  /// Return most likely n tokens
+  uint32 top_n_tokens = 7;
 }
 
 message Batch {
@@ -141,6 +143,17 @@ message PrefillTokens {
   repeated string texts = 3;
 }
 
+message TopTokens {
+  /// Top Token IDs
+  repeated uint32 ids = 1;
+  /// Top Logprobs
+  repeated float logprobs = 2;
+  /// Top Token Texts
+  repeated string texts = 3;
+  /// If the tokens are special
+  repeated bool is_special = 6;
+}
+
 message Generation {
   /// Request ID
   uint64 request_id = 1;
@@ -156,6 +169,8 @@ message Generation {
   bool token_is_special = 6;
   /// Complete generated text
   optional GeneratedText generated_text = 7;
+  /// Top tokens
+  TopTokens top_tokens = 8;
 }
 
 message FilterBatchRequest {

router/client/src/client.rs

Lines changed: 1 addition & 0 deletions

@@ -131,6 +131,7 @@ impl Client {
                     ignore_eos_token: false,
                 }),
                 prefill_logprobs: true,
+                top_n_tokens: 20,
             });
             n_tokens += max_input_length;
         }

router/src/health.rs

Lines changed: 1 addition & 0 deletions

@@ -50,6 +50,7 @@ impl Health {
                 stop_sequences: vec![],
                 ignore_eos_token: false,
             }),
+            top_n_tokens: 0,
         };
         let batch = Batch {
             id: BATCH_ID,

0 commit comments
