From 7039bb51b59b40f9117d0aca64b89381b2023aff Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Fri, 22 Mar 2024 17:52:04 -0400
Subject: [PATCH 01/10] Update benchmark_speculator_logical.py to support
 gpt_bigcode/granite

---
 speculator/benchmark_speculator_logical.py | 72 +++++++++++++++++-----
 1 file changed, 55 insertions(+), 17 deletions(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index 93cd561f..0367ca52 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -2,9 +2,11 @@
 import itertools
 import os
 import time
+import torch
+import json
 
 import fms_extras.models.paged_llama
-import torch
+import fms_extras.models.paged_gpt_bigcode
 import torch._inductor.config
 from fms.models import get_model
 from fms.utils import generation, tokenizers
@@ -23,6 +25,12 @@
     description="Script to run inference on a causal model"
 )
 parser.add_argument("--device_type", type=str, default="cuda")
+parser.add_argument(
+    "--architecture",
+    type=str,
+    default="paged_llama",
+    help="The model architecture to benchmark",
+)
 parser.add_argument(
     "--variant",
     type=str,
@@ -109,8 +117,33 @@
     help="Disable batch auto-flattening for handling candidate trees?"
 )
 
+parser.add_argument(
+    "--seed",
+    type=int,
+    default=42,
+    help="Seed for torch and data loader",
+)
+
+parser.add_argument(
+    "--n_predict",
+    type=int,
+    default=3,
+    help="Number of speculator heads / number of tokens to guess ahead",
+)
+
+parser.add_argument(
+    "--threshes",
+    type=json.loads,
+    default=[6,4,3],
+    help="number of top k predictions from each head to generate speculator candidate pool; should be same len as n_predict"
+)
+
+
 args = parser.parse_args()
 
+torch.cuda.manual_seed(args.seed)
+torch.manual_seed(args.seed)
+
 local_rank = int(os.getenv("LOCAL_RANK", 0))
 world_size = int(os.getenv("WORLD_SIZE", 1))
 if args.device_type == "cuda":
@@ -138,7 +171,7 @@
     distr_param = None
 
 model = get_model(
-    "paged_llama",
+    args.architecture,
     args.variant,
     model_path=args.model_path,
     checkpoint_sharding=args.checkpoint_sharding,
@@ -157,7 +190,7 @@
 if args.speculator_path is not None:
     print("loading speculator")
     speculator = MLPSpeculator(
-        model.config.emb_dim, 4096, model.config.src_vocab_size, n_predict=3
+        model.config.emb_dim, 4096, model.config.src_vocab_size, n_predict=args.n_predict
     )
     speculator.load_state_dict(
         torch.load(args.speculator_path, map_location=device)["model_state"]
@@ -169,13 +202,17 @@
 
 # cache setup
 from fms_extras.utils.cache.paged import PagedKVCacheManager
-
 use_cache = True
+if hasattr(model.config, "kvheads"):
+    kv_heads = model.config.kvheads
+else:
+    kv_heads = 1 if model.config.multiquery_attn else model.config.nheads
+
 kv_cache_manager = PagedKVCacheManager(
     model.config.nlayers,
     model.config.nheads,
     model.config.emb_dim,
-    kv_heads=model.config.kvheads,
+    kv_heads=kv_heads,
     tensor_parallel_size=dist.get_world_size() if args.distributed else 1,
     dtype=torch.get_default_dtype(),
     device=device,
@@ -192,6 +229,7 @@
     datasets=[
         args.subdata,
     ],
+    seed=args.seed,
     min_length=2148,
     max_chunksize=8192,
 )
@@ -209,13 +247,12 @@
     in_middle = True
 
 data = torch.IntTensor(data).to(device)
-
-# def ids_for_prompt(prompt):
-#     tokens = tokenizer.tokenize(prompt)
-#     tokens = [""] + tokens
-#     ids = tokenizer.convert_tokens_to_ids(tokens)
-#     ids = torch.tensor(ids, dtype=torch.long, device=device)
-#     return ids
+def ids_for_prompt(prompt):
+    tokens = tokenizer.tokenize(prompt)
tokens = [""] + tokens + ids = tokenizer.convert_tokens_to_ids(tokens) + ids = torch.tensor(ids, dtype=torch.long, device=device) + return ids def print_result(result, inp, n_steps): @@ -232,10 +269,11 @@ def print_result(result, inp, n_steps): print() -def infer(ids, k, warmup, model, decode_model, speculator, flatting): +def infer(ids, k, warmup, model, decode_model, speculator): # With greedy generation (do_sample=False) we _should_ always get the same results. # There is currently a bug in start_pos for batched rotary embeddings that can lead # varying results for the same prompt. + max_seq_len = model.config.max_expected_seq_len if hasattr(model.config, "max_expected_seq_len") else model.config.max_pos if k != 0: result, n_steps, generated_token_time_out = speculative_generate( @@ -244,11 +282,11 @@ def infer(ids, k, warmup, model, decode_model, speculator, flatting): speculator, kv_cache_manager, new_tokens=100, - max_seq_len=model.config.max_expected_seq_len, + max_seq_len=max_seq_len, decode_model=decode_model, top_k=k, - threshes=[6, 4, 3], - flatting=flatting, + threshes=args.threshes, + flatting=not args.no_flat, ) else: result, n_steps, generated_token_time_out = paged_generate( @@ -256,7 +294,7 @@ def infer(ids, k, warmup, model, decode_model, speculator, flatting): ids, kv_cache_manager, max_new_tokens=100, - max_seq_len=model.config.max_expected_seq_len, + max_seq_len=max_seq_len, do_sample=False, decode_model=decode_model, ) From 9dce65d0cc2ea6800bc87f70c623868cda9c42a3 Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:58:44 -0400 Subject: [PATCH 02/10] Update benchmark_speculator_logical.py --- speculator/benchmark_speculator_logical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py index 0367ca52..754eb76d 100644 --- a/speculator/benchmark_speculator_logical.py +++ b/speculator/benchmark_speculator_logical.py @@ -29,7 +29,7 @@ "--architecture", type=str, default="paged_llama", - help="The model architecture to benchmark", + help="The model architecture to benchmark, e.g. 'paged_llama', 'paged_gpt_bigcode', ) parser.add_argument( "--variant", From 18ecee853656f766869b41a2f7c978aa8f483515 Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Mon, 25 Mar 2024 12:57:47 -0400 Subject: [PATCH 03/10] Update benchmark_speculator_logical.py --- speculator/benchmark_speculator_logical.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py index 754eb76d..f6f86a77 100644 --- a/speculator/benchmark_speculator_logical.py +++ b/speculator/benchmark_speculator_logical.py @@ -18,8 +18,9 @@ from fms_fsdp.utils.dataset_utils import Streaming_Doc_Dataset -# This example script validates the LLaMA implementation by running inference on a couple of prompts. -# torchrun --nproc_per_node=1 scripts/inference.py --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=meta --speculator_path=~/models/speculator_7B_F.pth --compile +# This example script measures the logical speedup of running a speculator atop a base model. 
+# export CUDA_VISIBLE_DEVICES=1
+# torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile
 
 parser = argparse.ArgumentParser(
     description="Script to run inference on a causal model"

From 268f8c84583b3ea078550b6e333a5317879a2cae Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Mon, 25 Mar 2024 13:00:26 -0400
Subject: [PATCH 04/10] Update benchmark_speculator_logical.py

---
 speculator/benchmark_speculator_logical.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index f6f86a77..e5a3a425 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -20,7 +20,8 @@
 
 # This example script measures the logical speedup of running a speculator atop a base model. Run as:
 # export CUDA_VISIBLE_DEVICES=1
-# torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile
+# e.g., #1: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile
+# e.g., #2: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_gpt_bigcode --variant=ibm.20b --model_path=~/models/granite-20b-instruct --tokenizer=~/models/granite-20b-instruct --model_source=hf --speculator_path=~/models/speculator_granite20B.pth --n_predict=4 --threshes=[6,4,3,3]
 
 parser = argparse.ArgumentParser(
     description="Script to run inference on a causal model"

From f0c413f2d1f5e52c73df6009303f49d233b42363 Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Mon, 25 Mar 2024 18:22:11 -0400
Subject: [PATCH 05/10] Update benchmark_speculator_logical.py

---
 speculator/benchmark_speculator_logical.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index e5a3a425..40f9b5f9 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -20,8 +20,8 @@
 
 # This example script measures the logical speedup of running a speculator atop a base model. Run as:
 # export CUDA_VISIBLE_DEVICES=1
-# e.g., #1: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile
-# e.g., #2: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_gpt_bigcode --variant=ibm.20b --model_path=~/models/granite-20b-instruct --tokenizer=~/models/granite-20b-instruct --model_source=hf --speculator_path=~/models/speculator_granite20B.pth --n_predict=4 --threshes=[6,4,3,3]
+# e.g., #1: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_llama --variant=7b --model_path=~/models/7B-F --tokenizer=~/models/tokenizer.model --model_source=hf --speculator_path=~/models/speculator_7B_F.pth --compile --data_path="/path/to/training_dataset_tokens/" --subdata="webhose"
+# e.g., #2: torchrun --nproc_per_node=1 benchmark_speculator_logical.py --architecture=paged_gpt_bigcode --variant=ibm.20b --model_path=~/models/granite-20b-instruct --tokenizer=~/models/granite-20b-instruct --model_source=hf --speculator_path=~/models/speculator_granite20B.pth --data_path="/path/to/training_dataset_tokens/" --subdata="github" --n_predict=4 --threshes=[6,4,3,3]
 
 parser = argparse.ArgumentParser(
     description="Script to run inference on a causal model"

From a0798ada82e2eb2b84c503f446886c9cf87d9adc Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Mon, 25 Mar 2024 18:26:50 -0400
Subject: [PATCH 06/10] Update benchmark_speculator_logical.py

---
 speculator/benchmark_speculator_logical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index 40f9b5f9..489355fb 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -31,7 +31,7 @@
     "--architecture",
     type=str,
     default="paged_llama",
-    help="The model architecture to benchmark, e.g. 'paged_llama', 'paged_gpt_bigcode',
+    help="The model architecture to benchmark, e.g. 'paged_llama', 'paged_gpt_bigcode'",
 )
 parser.add_argument(
     "--variant",

From ef983648bb4fd5c36f78d5a3f1a2ee3fd44124b9 Mon Sep 17 00:00:00 2001
From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com>
Date: Wed, 27 Mar 2024 08:50:14 -0400
Subject: [PATCH 07/10] Update benchmark_speculator_logical.py

---
 speculator/benchmark_speculator_logical.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py
index 489355fb..33641232 100644
--- a/speculator/benchmark_speculator_logical.py
+++ b/speculator/benchmark_speculator_logical.py
@@ -116,7 +116,7 @@
 parser.add_argument(
     "--no_flat",
     action="store_true",
-    help="Disable batch auto-flattening for handling candidate trees?"
+ help="Disable batch auto-flattening for handling candidate trees?", ) parser.add_argument( @@ -137,7 +137,7 @@ "--threshes", type=json.loads, default=[6,4,3], - help="number of top k predictions from each head to generate speculator candidate pool; should be same len as n_predict" + help="number of top k predictions from each head to generate speculator candidate pool; should be same len as n_predict", ) @@ -192,7 +192,10 @@ if args.speculator_path is not None: print("loading speculator") speculator = MLPSpeculator( - model.config.emb_dim, 4096, model.config.src_vocab_size, n_predict=args.n_predict + model.config.emb_dim, + 4096, + model.config.src_vocab_size, + n_predict=args.n_predict, ) speculator.load_state_dict( torch.load(args.speculator_path, map_location=device)["model_state"] @@ -249,6 +252,7 @@ in_middle = True data = torch.IntTensor(data).to(device) + def ids_for_prompt(prompt): tokens = tokenizer.tokenize(prompt) tokens = [""] + tokens @@ -275,7 +279,11 @@ def infer(ids, k, warmup, model, decode_model, speculator): # With greedy generation (do_sample=False) we _should_ always get the same results. # There is currently a bug in start_pos for batched rotary embeddings that can lead # varying results for the same prompt. - max_seq_len = model.config.max_expected_seq_len if hasattr(model.config, "max_expected_seq_len") else model.config.max_pos + max_seq_len = ( + model.config.max_expected_seq_len + if hasattr(model.config, "max_expected_seq_len") + else model.config.max_pos + ) if k != 0: result, n_steps, generated_token_time_out = speculative_generate( @@ -309,6 +317,7 @@ def infer(ids, k, warmup, model, decode_model, speculator): return generated_token_time_out / avg_tokens, avg_tokens / n_steps return None + torch._dynamo.config.cache_size_limit = 64 torch.cuda.empty_cache() From 85f2bf00874b8fd3369899c7eed67d8dbe5790d5 Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Wed, 27 Mar 2024 08:51:47 -0400 Subject: [PATCH 08/10] Update benchmark_speculator_logical.py --- speculator/benchmark_speculator_logical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py index 33641232..078aae96 100644 --- a/speculator/benchmark_speculator_logical.py +++ b/speculator/benchmark_speculator_logical.py @@ -136,7 +136,7 @@ parser.add_argument( "--threshes", type=json.loads, - default=[6,4,3], + default=[6, 4, 3], help="number of top k predictions from each head to generate speculator candidate pool; should be same len as n_predict", ) From eba5053b9137bbbb39df13a47c6a3647473c436f Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Wed, 27 Mar 2024 08:54:04 -0400 Subject: [PATCH 09/10] Update benchmark_speculator.py --- speculator/benchmark_speculator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speculator/benchmark_speculator.py b/speculator/benchmark_speculator.py index 5658eec0..45d9b70e 100644 --- a/speculator/benchmark_speculator.py +++ b/speculator/benchmark_speculator.py @@ -100,7 +100,7 @@ parser.add_argument( "--no_flat", action="store_true", - help="Disable batch auto-flattening for handling candidate trees?" 
+ help="Disable batch auto-flattening for handling candidate trees?", ) args = parser.parse_args() From 1d5858bf9d4c6952c852206b308a8d7cd0adb909 Mon Sep 17 00:00:00 2001 From: Sahil Suneja <6835847+sahilsuneja1@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:57:19 -0400 Subject: [PATCH 10/10] Update benchmark_speculator_logical.py --- speculator/benchmark_speculator_logical.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/speculator/benchmark_speculator_logical.py b/speculator/benchmark_speculator_logical.py index 078aae96..0f4eca6f 100644 --- a/speculator/benchmark_speculator_logical.py +++ b/speculator/benchmark_speculator_logical.py @@ -1,12 +1,12 @@ import argparse import itertools +import json import os import time -import torch -import json -import fms_extras.models.paged_llama import fms_extras.models.paged_gpt_bigcode +import fms_extras.models.paged_llama +import torch import torch._inductor.config from fms.models import get_model from fms.utils import generation, tokenizers @@ -207,6 +207,7 @@ # cache setup from fms_extras.utils.cache.paged import PagedKVCacheManager + use_cache = True if hasattr(model.config, "kvheads"): kv_heads = model.config.kvheads