From 7039bb51b59b40f9117d0aca64b89381b2023aff Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Fri, 22 Mar 2024 17:52:04 -0400
Subject: [PATCH 01/10] Update benchmark_speculator_logical.py to support
 gpt_bigcode/granite

---
 speculator/benchmark_speculator_logical.py | 72 +++++++++++++++++-----
 1 file changed, 55 insertions(+), 17 deletions(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index 93cd561f..0367ca52 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -2,9 +2,11 @@
 import itertools
 import os
 import time
+import torch
+import json
 
 import fms_extras.models.paged_llama
-import torch
+import fms_extras.models.paged_gpt_bigcode
 import torch._inductor.config
 from fms.models import get_model
 from fms.utils import generation, tokenizers
@@ -23,6 +25,12 @@
     description="Script to run inference on a causal model"
 )
 parser.add_argument("--device_type", type=str, default="cuda")
+parser.add_argument(
+    "--architecture",
+    type=str,
+    default="paged_llama",
+    help="The model architecture to benchmark",
+)
 parser.add_argument(
     "--variant",
     type=str,
@@ -109,8 +117,33 @@
     help="Disable batch auto-flattening for handling candidate trees?"
 )
 
+parser.add_argument(
+    "--seed",
+    type=int,
+    default=42,
+    help="Seed for torch and data loader",
+)
+
+parser.add_argument(
+    "--n_predict",
+    type=int,
+    default=3,
+    help="Number of speculator heads / number of tokens to guess ahead",
+)
+
+parser.add_argument(
+    "--threshes",
+    type=json.loads,
+    default=[6,4,3],
+    help="number of top k predictions from each head to generate speculator candidate pool; should be same len as n_predict"
+)
+
+
 args = parser.parse_args()
 
+torch.cuda.manual_seed(args.seed)
+torch.manual_seed(args.seed)
+
 local_rank = int(os.getenv("LOCAL_RANK", 0))
 world_size = int(os.getenv("WORLD_SIZE", 1))
 if args.device_type == "cuda":
@@ -138,7 +171,7 @@
     distr_param = None
 
 model = get_model(
-    "paged_llama",
+    args.architecture,
     args.variant,
     model_path=args.model_path,
     checkpoint_sharding=args.checkpoint_sharding,
@@ -157,7 +190,7 @@
 if args.speculator_path is not None:
     print("loading speculator")
     speculator = MLPSpeculator(
-        model.config.emb_dim, 4096, model.config.src_vocab_size, n_predict=3
+        model.config.emb_dim, 4096, model.config.src_vocab_size, n_predict=args.n_predict
     )
     speculator.load_state_dict(
         torch.load(args.speculator_path, map_location=device)["model_state"]
@@ -169,13 +202,17 @@
 
 # cache setup
 from fms_extras.utils.cache.paged import PagedKVCacheManager
-
 use_cache = True
+if hasattr(model.config, "kvheads"):
+    kv_heads = model.config.kvheads
+else:
+    kv_heads = 1 if model.config.multiquery_attn else model.config.nheads
+
 kv_cache_manager = PagedKVCacheManager(
     model.config.nlayers,
     model.config.nheads,
     model.config.emb_dim,
-    kv_heads=model.config.kvheads,
+    kv_heads=kv_heads,
     tensor_parallel_size=dist.get_world_size() if args.distributed else 1,
     dtype=torch.get_default_dtype(),
     device=device,
@@ -192,6 +229,7 @@
     datasets=[
         args.subdata,
     ],
+    seed=args.seed,
     min_length=2148,
     max_chunksize=8192,
 )
@@ -209,13 +247,12 @@
     in_middle = True
 
 data = torch.IntTensor(data).to(device)
-
-# def ids_for_prompt(prompt):
-#     tokens = tokenizer.tokenize(prompt)
-#     tokens = [""] + tokens
-#     ids = tokenizer.convert_tokens_to_ids(tokens)
-#     ids = torch.tensor(ids, dtype=torch.long, device=device)
-#     return ids
+def ids_for_prompt(prompt):
+    tokens = tokenizer.tokenize(prompt)
tokens = [""] + tokens + ids = tokenizer.convert_tokens_to_ids(tokens) + ids = torch.tensor(ids, dtype=torch.long, device=device) + return ids def print_result(result, inp, n_steps): @@ -232,10 +269,11 @@ def print_result(result, inp, n_steps): print() -def infer(ids, k, warmup, model, decode_model, speculator, flatting): +def infer(ids, k, warmup, model, decode_model, speculator): # With greedy generation (do_sample=False) we _should_ always get the same results. # There is currently a bug in start_pos for batched rotary embeddings that can lead # varying results for the same prompt. + max_seq_len = model.config.max_expected_seq_len if hasattr(model.config, "max_expected_seq_len") else model.config.max_pos if k != 0: result, n_steps, generated_token_time_out = speculative_generate( @@ -244,11 +282,11 @@ def infer(ids, k, warmup, model, decode_model, speculator, flatting): speculator, kv_cache_manager, new_tokens=100, - max_seq_len=model.config.max_expected_seq_len, + max_seq_len=max_seq_len, decode_model=decode_model, top_k=k, - threshes=[6, 4, 3], - flatting=flatting, + threshes=args.threshes, + flatting=not args.no_flat, ) else: result, n_steps, generated_token_time_out = paged_generate( @@ -256,7 +294,7 @@ def infer(ids, k, warmup, model, decode_model, speculator, flatting): ids, kv_cache_manager, max_new_tokens=100, - max_seq_len=model.config.max_expected_seq_len, + max_seq_len=max_seq_len, do_sample=False, decode_model=decode_model, ) From 9dce65d0cc2ea6800bc87f70c623868cda9c42a3 Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:58:44 -0400 Subject: [PATCH 02/10] Update benchmark_speculator_logical.py --- speculator/benchmark_speculator_logical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py index 0367ca52..754eb76d 100644 --- a/speculator/benchmark_speculator_logical.py +++ b/speculator/benchmark_speculator_logical.py @@ -29,7 +29,7 @@ "--architecture", type=str, default="paged_llama", - help="The model architecture to benchmark", + help="The model architecture to benchmark, e.g. 'paged_llama', 'paged_gpt_bigcode', ) parser.add_argument( "--variant", From 18ecee853656f766869b41a2f7c978aa8f483515 Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Mon, 25 Mar 2024 12:57:47 -0400 Subject: [PATCH 03/10] Update benchmark_speculator_logical.py --- speculator/benchmark_speculator_logical.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py index 754eb76d..f6f86a77 100644 --- a/speculator/benchmark_speculator_logical.py +++ b/speculator/benchmark_speculator_logical.py @@ -18,8 +18,9 @@ from fms_fsdp.utils.dataset_utils import Streaming_Doc_Dataset -# This example script validates the LLaMA implementation by running inference on a couple of prompts. -# torchrun --nproc_per_node=1 scripts/inference.py --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=meta --speculator_path=~/models/speculator_7B_F.pth --compile +# This example script measures the logical speedup of running a speculator atop a base model. 
+# export CUDA_VISIBLE_DEVICES=1
+# torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile
 
 parser = argparse.ArgumentParser(
     description="Script to run inference on a causal model"

From 268f8c84583b3ea078550b6e333a5317879a2cae Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Mon, 25 Mar 2024 13:00:26 -0400
Subject: [PATCH 04/10] Update benchmark_speculator_logical.py

---
 speculator/benchmark_speculator_logical.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index f6f86a77..e5a3a425 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -20,7 +20,8 @@
 
 # This example script measures the logical speedup of running a speculator atop a base model. Run as:
 # export CUDA_VISIBLE_DEVICES=1
-# torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile
+# e.g., #1: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile
+# e.g., #2: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_gpt_bigcode --variant=ibm.20b --model_path=~/models/granite-20b-instruct --tokenizer=~/models/granite-20b-instruct --model_source=hf --speculator_path=~/models/speculator_granite20B.pth --n_predict=4 --threshes=[6,4,3,3]
 
 parser = argparse.ArgumentParser(
     description="Script to run inference on a causal model"

From f0c413f2d1f5e52c73df6009303f49d233b42363 Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Mon, 25 Mar 2024 18:22:11 -0400
Subject: [PATCH 05/10] Update benchmark_speculator_logical.py

---
 speculator/benchmark_speculator_logical.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index e5a3a425..40f9b5f9 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -20,8 +20,8 @@
 
 # This example script measures the logical speedup of running a speculator atop a base model. Run as:
 # export CUDA_VISIBLE_DEVICES=1
-# e.g., #1: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile
-# e.g., #2: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_gpt_bigcode --variant=ibm.20b --model_path=~/models/granite-20b-instruct --tokenizer=~/models/granite-20b-instruct --model_source=hf --speculator_path=~/models/speculator_granite20B.pth --n_predict=4 --threshes=[6,4,3,3]
+# e.g., #1: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile --data_path="/path/to/training_dataset_tokens/" --subdata="webhose"
+# e.g., #2: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_gpt_bigcode --variant=ibm.20b --model_path=~/models/granite-20b-instruct --tokenizer=~/models/granite-20b-instruct --model_source=hf --speculator_path=~/models/speculator_granite20B.pth --data_path="/path/to/training_dataset_tokens/" --subdata="github" --n_predict=4 --threshes=[6,4,3,3]
 
 parser = argparse.ArgumentParser(
     description="Script to run inference on a causal model"

From a0798ada82e2eb2b84c503f446886c9cf87d9adc Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Mon, 25 Mar 2024 18:26:50 -0400
Subject: [PATCH 06/10] Update benchmark_speculator_logical.py

---
 speculator/benchmark_speculator_logical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index 40f9b5f9..489355fb 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -31,7 +31,7 @@
     "--architecture",
     type=str,
     default="paged_llama",
-    help="The model architecture to benchmark, e.g. 'paged_llama', 'paged_gpt_bigcode',
+    help="The model architecture to benchmark, e.g. 'paged_llama', 'paged_gpt_bigcode'",
 )
 parser.add_argument(
     "--variant",

From ef983648bb4fd5c36f78d5a3f1a2ee3fd44124b9 Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Wed, 27 Mar 2024 08:50:14 -0400
Subject: [PATCH 07/10] Update benchmark_speculator_logical.py

---
 speculator/benchmark_speculator_logical.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index 489355fb..33641232 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -116,7 +116,7 @@
 parser.add_argument(
     "--no_flat",
     action="store_true",
-    help="Disable batch auto-flattening for handling candidate trees?"
+ help="Disable batch auto-flattening for handling candidate trees?", ) parser.add_argument( @@ -137,7 +137,7 @@ "--threshes", type=json.loads, default=[6,4,3], - help="number of top k predictions from each head to generate speculator candidate pool; should be same len as n_predict" + help="number of top k predictions from each head to generate speculator candidate pool; should be same len as n_predict", ) @@ -192,7 +192,10 @@ if args.speculator_path is not None: print("loading speculator") speculator = MLPSpeculator( - model.config.emb_dim, 4096, model.config.src_vocab_size, n_predict=args.n_predict + model.config.emb_dim, + 4096, + model.config.src_vocab_size, + n_predict=args.n_predict, ) speculator.load_state_dict( torch.load(args.speculator_path, map_location=device)["model_state"] @@ -249,6 +252,7 @@ in_middle = True data = torch.IntTensor(data).to(device) + def ids_for_prompt(prompt): tokens = tokenizer.tokenize(prompt) tokens = [""] + tokens @@ -275,7 +279,11 @@ def infer(ids, k, warmup, model, decode_model, speculator): # With greedy generation (do_sample=False) we _should_ always get the same results. # There is currently a bug in start_pos for batched rotary embeddings that can lead # varying results for the same prompt. - max_seq_len = model.config.max_expected_seq_len if hasattr(model.config, "max_expected_seq_len") else model.config.max_pos + max_seq_len = ( + model.config.max_expected_seq_len + if hasattr(model.config, "max_expected_seq_len") + else model.config.max_pos + ) if k != 0: result, n_steps, generated_token_time_out = speculative_generate( @@ -309,6 +317,7 @@ def infer(ids, k, warmup, model, decode_model, speculator): return generated_token_time_out / avg_tokens, avg_tokens / n_steps return None + torch._dynamo.config.cache_size_limit = 64 torch.cuda.empty_cache() From 85f2bf00874b8fd3369899c7eed67d8dbe5790d5 Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Wed, 27 Mar 2024 08:51:47 -0400 Subject: [PATCH 08/10] Update benchmark_speculator_logical.py --- speculator/benchmark_speculator_logical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py index 33641232..078aae96 100644 --- a/speculator/benchmark_speculator_logical.py +++ b/speculator/benchmark_speculator_logical.py @@ -136,7 +136,7 @@ parser.add_argument( "--threshes", type=json.loads, - default=[6,4,3], + default=[6, 4, 3], help="number of top k predictions from each head to generate speculator candidate pool; should be same len as n_predict", ) From eba5053b9137bbbb39df13a47c6a3647473c436f Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Wed, 27 Mar 2024 08:54:04 -0400 Subject: [PATCH 09/10] Update benchmark_speculator.py --- speculator/benchmark_speculator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speculator/benchmark_speculator.py b/speculator/benchmark_speculator.py index 5658eec0..45d9b70e 100644 --- a/speculator/benchmark_speculator.py +++ b/speculator/benchmark_speculator.py @@ -100,7 +100,7 @@ parser.add_argument( "--no_flat", action="store_true", - help="Disable batch auto-flattening for handling candidate trees?" 
+ help="Disable batch auto-flattening for handling candidate trees?", ) args = parser.parse_args() From 1d5858bf9d4c6952c852206b308a8d7cd0adb909 Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:57:19 -0400 Subject: [PATCH 10/10] Update benchmark_speculator_logical.py --- speculator/benchmark_speculator_logical.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py index 078aae96..0f4eca6f 100644 --- a/speculator/benchmark_speculator_logical.py +++ b/speculator/benchmark_speculator_logical.py @@ -1,12 +1,12 @@ import argparse import itertools +import json import os import time -import torch -import json -import fms_extras.models.paged_llama import fms_extras.models.paged_gpt_bigcode +import fms_extras.models.paged_llama +import torch import torch._inductor.config from fms.models import get_model from fms.utils import generation, tokenizers @@ -207,6 +207,7 @@ # cache setup from fms_extras.utils.cache.paged import PagedKVCacheManager + use_cache = True if hasattr(model.config, "kvheads"): kv_heads = model.config.kvheads