From 4972b846a07765c6f3fac74774930da59b9c565e Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sat, 12 Apr 2025 16:10:12 -0400
Subject: [PATCH 01/17] increase MAX_NUM_TOKENS

---
 include/flexflow/batch_config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 4eb80f8ef..ebd0ecbfe 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -97,7 +97,7 @@ class BatchConfig {
   // These maximum values are used for copying BatchConfig
   // across workers
   static int const MAX_NUM_REQUESTS = 260;
-  static int const MAX_NUM_TOKENS = 8192;
+  static int const MAX_NUM_TOKENS = 16384;
   static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
   static int const MAX_PEFT_CONFIG_SIZE = 1024;
 

From 0c3ea2a1905bd3b169f93c64d96224c536b43f66 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sun, 13 Apr 2025 17:23:16 +0000
Subject: [PATCH 02/17] peft support mode enum

---
 include/flexflow/config.h                    |  2 +-
 include/flexflow/ffconst.h                   |  8 +++
 include/flexflow/ffconst_utils.h             |  4 ++
 include/flexflow/flexflow_c.h                | 11 ++--
 include/flexflow/inference.h                 |  1 +
 include/flexflow/op_meta.h                   |  2 +-
 include/flexflow/operator.h                  |  4 +-
 include/flexflow/request_manager.h           |  4 +-
 inference/README.md                          |  2 +-
 inference/flexllm/peft_train.cc              | 45 ++++++++++------
 inference/models/falcon.cc                   |  4 +-
 inference/models/llama.cc                    |  8 +--
 inference/models/mpt.cc                      |  4 +-
 inference/models/opt.cc                      |  4 +-
 inference/models/starcoder.cc                |  4 +-
 inference/peft/peft.cc                       | 56 +++++++++++++-------
 inference/python/chat.py                     |  2 +-
 inference/python/ff_peft.py                  | 25 ++++++---
 inference/python/incr_decoding.py            | 10 +++-
 inference/python/spec_infer.py               | 10 +++-
 inference/python/streamlit/fastapi_incr.py   |  4 +-
 python/flexflow/core/__init__.py             |  1 -
 python/flexflow/core/flexflow_cffi.py        | 47 ++++++++--------
 python/flexflow/serve/__init__.py            | 18 +++----
 python/flexflow/serve/models/falcon.py       |  4 +-
 python/flexflow/serve/models/llama.py        |  4 +-
 python/flexflow/serve/models/mpt.py          |  4 +-
 python/flexflow/serve/models/opt.py          |  4 +-
 python/flexflow/serve/models/starcoder.py    |  4 +-
 python/flexflow/serve/serve.py               | 18 +++----
 python/flexflow/type.py                      | 14 +++++
 src/c/flexflow_c.cc                          | 24 ++++-----
 src/ops/add_bias_residual_layer_norm.cu      |  8 ++-
 src/ops/aggregate.cc                         |  1 -
 src/ops/aggregate_spec.cc                    |  1 -
 src/ops/arg_topk.cc                          |  1 -
 src/ops/argmax.cc                            |  1 -
 src/ops/attention.cc                         |  1 -
 src/ops/batch_matmul.cc                      |  1 -
 src/ops/batch_norm.cu                        |  1 -
 src/ops/beam_topk.cc                         |  1 -
 src/ops/cache.cc                             |  1 -
 src/ops/concat.cc                            |  1 -
 src/ops/conv_2d.cc                           |  1 -
 src/ops/element_binary.cc                    |  2 -
 src/ops/element_unary.cc                     |  1 -
 src/ops/embedding.cc                         |  1 -
 src/ops/experts.cc                           |  1 -
 src/ops/group_by.cc                          |  1 -
 src/ops/inc_multihead_self_attention.cc      |  1 -
 src/ops/inc_multihead_self_attention.cu      | 14 ++---
 src/ops/kernels/dropout_kernels.cu           |  1 -
 src/ops/kernels/element_binary_kernels.cu    |  1 -
 src/ops/kernels/linear_kernels.cu            |  4 +-
 src/ops/kernels/residual_rms_norm_kernels.cu |  6 +--
 src/ops/kernels/rms_norm_kernels.cu          |  6 +--
 src/ops/kernels/softmax.cu                   |  5 +-
 src/ops/layer_norm.cu                        |  8 ++-
 src/ops/lora_linear.cc                       |  2 +-
 src/ops/pool_2d.cc                           |  1 -
 src/ops/residual_layer_norm.cu               |  8 ++-
 src/ops/sampling.cc                          |  1 -
 src/ops/sigmoid_silu_multi.cu                |  3 +-
 src/ops/spec_inc_multihead_self_attention.cc |  1 -
 src/ops/topk.cc                              |  1 -
 src/ops/transpose.cc                         |  1 -
 src/ops/tree_inc_multihead_self_attention.cc |  1 -
 src/runtime/ffconst_utils.cc                 | 19 +++++++
 src/runtime/model.cc                         | 15 ++----
 src/runtime/request_manager.cc               | 34 ++++++------
 tests/inference/cpp_inference_tests.sh       |  2 +-
 tests/inference/generate_inf_test_configs.py |  2 +-
 tests/peft_test.sh                           |  6 +--
 73 files changed, 291 insertions(+), 233 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index ca438f02e..341d7c784 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -178,7 +178,7 @@ class FFConfig {
   size_t offload_reserve_space_size;
   DataType quantization_type;
   // PEFT related fields
-  bool enable_peft, enable_peft_finetuning;
+  PeftSupportMode peft_support_mode;
   // Control parallelizable dimensions
   bool only_data_parallel;
   bool enable_sample_parallel;
diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h
index 57854c722..18e19cfa9 100644
--- a/include/flexflow/ffconst.h
+++ b/include/flexflow/ffconst.h
@@ -84,6 +84,14 @@ enum RequestType {
   REQ_FINETUNING = 4002,
 };
 
+enum PeftSupportMode {
+  PEFT_DISABLED = 5001,
+  PEFT_INFERENCE_ONLY = 5002,
+  COSERVING = 5003,
+  TEMPORAL_SHARING = 5004,
+  SPATIAL_SHARING = 5005,
+};
+
 // This is consistent with TASO's OpType
 // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138
 enum OperatorType {
diff --git a/include/flexflow/ffconst_utils.h b/include/flexflow/ffconst_utils.h
index 421a139d5..ae79bd3b9 100644
--- a/include/flexflow/ffconst_utils.h
+++ b/include/flexflow/ffconst_utils.h
@@ -18,6 +18,10 @@ size_t get_quantization_to_byte_size(DataType type,
 
 std::ostream &operator<<(std::ostream &, OperatorType);
 
+const char* peftSupportModeToString(PeftSupportMode mode);
+bool peft_finetuning_enabled(PeftSupportMode peft_support_mode);
+bool peft_enabled(PeftSupportMode peft_support_mode);
+
 }; // namespace FlexFlow
 
 #endif // _FLEXFLOW_FFCONST_UTILS_H
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index e26279e26..fbb531e77 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -92,11 +92,10 @@ int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_);
 
 int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_);
 
-bool flexflow_config_get_enable_peft(flexflow_config_t handle_);
+void flexflow_config_set_peft_support_mode(flexflow_config_t handle_,
+                                            enum PeftSupportMode value);
 
-bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_);
-void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_,
-                                                bool value);
+enum PeftSupportMode flexflow_config_get_peft_support_mode(flexflow_config_t handle_);
 
 void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_,
                                                  int value);
@@ -984,8 +983,8 @@ int flexflow_request_manager_get_max_sequence_length(
 void flexflow_request_manager_set_max_concurrent_adapters(
     flexflow_request_manager_t handle_, int max_concurrent_adapters);
 
-void flexflow_request_manager_set_enable_peft_finetuning(
-    flexflow_request_manager_t handle_, bool enable_peft_finetuning_);
+void flexflow_request_manager_set_peft_support_mode(
+    flexflow_request_manager_t handle_, enum PeftSupportMode peft_support_mode_);
 
 void flexflow_request_manager_set_num_transformers_layers(
     flexflow_request_manager_t handle_, int num_transformers_layers_);
diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 755df9f5c..975f77e42 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -14,6 +14,7 @@
  */
 
 #pragma once
+#include "flexflow/ffconst_utils.h"
 #include "flexflow/batch_config.h"
 #include <string>
 #include <vector>
diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h
index aea154138..aaf3d943d 100644
--- a/include/flexflow/op_meta.h
+++ b/include/flexflow/op_meta.h
@@ -16,7 +16,7 @@ class OpMeta {
   FFHandler handle;
   bool profiling; // Measure the run time of the task
   bool inference_debugging;
-  bool enable_peft_finetuning;
+  enum PeftSupportMode peft_support_mode;
   int decoding_step;
   int bwd_step;
   char op_name[MAX_OPNAME];
diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h
index b07901aab..9028e865f 100644
--- a/include/flexflow/operator.h
+++ b/include/flexflow/operator.h
@@ -200,7 +200,7 @@ class Op {
   Op(int guid,
      bool profiling,
      bool inference_debugging,
-     bool enable_peft_finetuning,
+     PeftSupportMode peft_support_mode,
      OperatorType otype,
      DataType dtype,
      char const *name,
@@ -474,7 +474,7 @@ class Op {
   int numInputs, numWeights, numOutputs;
   bool profiling;
   bool inference_debugging;
-  bool enable_peft_finetuning;
+  PeftSupportMode peft_support_mode;
   bool add_bias_only_once;
 #ifdef FF_USE_NCCL
   ncclUniqueId ncclId;
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 5dfff05b4..8ffe35858 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -190,7 +190,7 @@ class RequestManager {
   void set_max_sequence_length(int max_seq_length);
 
   void push_spec_infer_tree_width(int tree_width);
-  void set_enable_peft_finetuning(bool enable_peft_finetuning_);
+  void set_peft_support_mode(PeftSupportMode peft_support_mode_);
   void set_inference_finished(bool finished = true);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
@@ -407,7 +407,7 @@ class RequestManager {
   int max_lora_rank = 32;
   int max_concurrent_adapters = 0;
   // peft benchmarking
-  bool enable_peft_finetuning = false;
+  PeftSupportMode peft_support_mode = PEFT_DISABLED;
   bool inference_finished = false;
   int num_transformer_layers = 0;
   int num_layers_per_finetuning_step = 0;
diff --git a/inference/README.md b/inference/README.md
index 14c94e22a..49a3b9eb2 100644
--- a/inference/README.md
+++ b/inference/README.md
@@ -36,7 +36,7 @@ To run a PEFT model example in C++, call:
     -llm-model JackFram/llama-160m \
     -finetuning-dataset ../inference/prompt/peft_dataset.json \
     -peft-model goliaro/llama-160m-lora \
-    -enable-peft \
+    --peft-support-mode COSERVING \
     --use-full-precision \
     --inference-debugging
 ```
\ No newline at end of file
diff --git a/inference/flexllm/peft_train.cc b/inference/flexllm/peft_train.cc
index 295f6f9cd..c2573e5b3 100644
--- a/inference/flexllm/peft_train.cc
+++ b/inference/flexllm/peft_train.cc
@@ -46,7 +46,7 @@ void parse_input_args(char **argv,
                       bool &use_full_precision,
                       bool &verbose,
                       bool &do_sample,
-                      bool &enable_peft,
+                      PeftSupportMode &peft_support_mode,
                       float &temperature,
                       float &topp,
                       int &max_requests_per_batch,
@@ -68,10 +68,6 @@ void parse_input_args(char **argv,
       }
       continue;
     }
-    if (!strcmp(argv[i], "-enable-peft")) {
-      enable_peft = true;
-      continue;
-    }
     if (!strcmp(argv[i], "-peft-model")) {
       peft_model_name = std::string(argv[++i]);
       for (char &c : peft_model_name) {
@@ -140,8 +136,7 @@ void parse_input_args(char **argv,
       max_sequence_length = std::stoi(argv[++i]);
       continue;
     }
-    // num kv cache slots for inference (i.e. number of tokens across all
-    // requests)
+    // num kv cache slots for inference (i.e. number of tokens across all requests)
     if (!strcmp(argv[i], "--num-kv-cache-slots")) {
       num_kv_cache_slots = std::stoi(argv[++i]);
       continue;
@@ -166,6 +161,28 @@ void parse_input_args(char **argv,
       num_layers_per_finetuning_step = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--peft-support-mode")) {
+      std::string mode = argv[++i];
+      // Convert to lowercase for comparison
+      for (char &c : mode) {
+        c = std::tolower(c);
+      }
+      if (mode == "disabled") {
+        peft_support_mode = PEFT_DISABLED;
+      } else if (mode == "inference_only" || mode == "inference-only") {
+        peft_support_mode = PEFT_INFERENCE_ONLY;
+      } else if (mode == "coserving") {
+        peft_support_mode = COSERVING;
+      } else if (mode == "temporal_sharing" || mode == "temporal-sharing") {
+        peft_support_mode = TEMPORAL_SHARING;
+      } else if (mode == "spatial_sharing" || mode == "spatial-sharing") {
+        peft_support_mode = SPATIAL_SHARING;
+      } else {
+        std::cerr << "Unknown peft support mode: " << mode << std::endl;
+        assert(false && "Invalid peft support mode");
+      }
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -295,7 +312,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool use_full_precision = false;
   bool verbose = false;
   bool do_sample = false;
-  bool enable_peft = false;
+  ffconfig.peft_support_mode = COSERVING;
   float temperature = 0.0f;
   float topp = 0.0f;
   int max_requests_per_batch = 1;
@@ -305,7 +322,6 @@ void FlexFlow::top_level_task(Task const *task,
   int max_training_epochs = 1;
   int gradient_accumulation_steps = 8;
   int num_logging_steps = 10;
-  bool enable_peft_finetuning = true;
   int num_layers_per_finetuning_step = -1;
   bool run_warmup = false;
   int num_kv_cache_slots = -1;
@@ -322,7 +338,7 @@ void FlexFlow::top_level_task(Task const *task,
                    use_full_precision,
                    verbose,
                    do_sample,
-                   enable_peft,
+                   ffconfig.peft_support_mode,
                    temperature,
                    topp,
                    max_requests_per_batch,
@@ -335,10 +351,8 @@ void FlexFlow::top_level_task(Task const *task,
                    num_logging_steps,
                    num_layers_per_finetuning_step,
                    run_warmup);
-  enable_peft_finetuning = file_paths.dataset_file_path.empty() ? false : true;
-  assert(
-      enable_peft && enable_peft_finetuning &&
-      "Cannot train LORA adapter if PEFT and PEFT finetuning are not enabled");
+  assert(peft_finetuning_enabled(ffconfig.peft_support_mode) &&
+         "Cannot train LORA adapter if finetuning is not enabled");
   assert(!file_paths.dataset_file_path.empty() &&
          "Cannot train LORA adapter if dataset path is empty");
   assert(!peft_model_name.empty() &&
@@ -351,7 +365,6 @@ void FlexFlow::top_level_task(Task const *task,
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
          ffconfig.numNodes * ffconfig.workersPerNode);
-  ffconfig.enable_peft_finetuning = enable_peft_finetuning;
 
   std::string config_filepath = join_path(
       {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"});
@@ -441,7 +454,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->register_tokenizer(
       model_type, bos_token_id, eos_token_ids, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);
-  rm->set_enable_peft_finetuning(enable_peft_finetuning);
+  rm->set_peft_support_mode(ffconfig.peft_support_mode);
   rm->set_max_lora_rank(rank);
 
   FFModel model(ffconfig, ffconfig.cpu_offload);
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 3a345b15c..8b6f423eb 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -43,7 +43,7 @@ void FALCON::create_falcon_model(FFModel &ff,
   int batch_tensor_num_tokens = BatchConfig::max_tokens_per_batch();
   if (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) {
     batch_tensor_num_tokens = BatchConfig::max_verify_tokens_per_batch();
-  } else if (ff.config.enable_peft_finetuning) {
+  } else if (peft_finetuning_enabled(ff.config.peft_support_mode)) {
     batch_tensor_num_tokens = BatchConfig::max_sequence_length();
   }
   int const token_dims[] = {batch_tensor_num_tokens, 1};
@@ -270,7 +270,7 @@ void FALCON::create_falcon_model(FFModel &ff,
   }
 
   // If PEFT is enabled, add LoRA layers
-  if (ff.config.enable_peft) {
+  if (peft_enabled(ff.config.peft_support_mode)) {
     // todo: add attention projections
     std::vector<std::string> target_modules = {"dense_h_to_4h",
                                                "dense_4h_to_h"};
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 22fd36526..76abba0f5 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -36,8 +36,8 @@ void LLAMA::create_llama_model(FFModel &ff,
     assert(false && "The number of attention heads is smaller, or it is not "
                     "divisible by the tensor parallelism degree");
   }
-  std::cout << "Creating llama model with ff.config.enable_peft_finetuning="
-            << ff.config.enable_peft_finetuning << std::endl;
+  std::cout << "Creating llama model with ff.config.peft_support_mode="
+            << peftSupportModeToString(ff.config.peft_support_mode) << std::endl;
   assert(llama_config.hidden_size % llama_config.num_attention_heads == 0 &&
          "Hidden size not divisible by number of attention heads");
   int tot_num_heads =
@@ -46,7 +46,7 @@ void LLAMA::create_llama_model(FFModel &ff,
   int batch_tensor_num_tokens = BatchConfig::max_tokens_per_batch();
   if (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) {
     batch_tensor_num_tokens = BatchConfig::max_verify_tokens_per_batch();
-  } else if (ff.config.enable_peft_finetuning) {
+  } else if (peft_finetuning_enabled(ff.config.peft_support_mode)) {
     batch_tensor_num_tokens = BatchConfig::max_sequence_length();
   }
   int const token_dims[] = {batch_tensor_num_tokens, 1};
@@ -297,7 +297,7 @@ void LLAMA::create_llama_model(FFModel &ff,
   }
 
   // If PEFT is enabled, add LoRA layers
-  if (ff.config.enable_peft) {
+  if (peft_enabled(ff.config.peft_support_mode)) {
     // todo: add attention projections
     std::vector<std::string> target_modules = {"down_proj"};
     ff.add_lora_layers(target_modules);
diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc
index db7c2f6f3..58fae387f 100644
--- a/inference/models/mpt.cc
+++ b/inference/models/mpt.cc
@@ -40,7 +40,7 @@ void MPT::create_mpt_model(FFModel &ff,
   int batch_tensor_num_tokens = BatchConfig::max_tokens_per_batch();
   if (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) {
     batch_tensor_num_tokens = BatchConfig::max_verify_tokens_per_batch();
-  } else if (ff.config.enable_peft_finetuning) {
+  } else if (peft_finetuning_enabled(ff.config.peft_support_mode)) {
     batch_tensor_num_tokens = BatchConfig::max_sequence_length();
   }
   int const token_dims[] = {batch_tensor_num_tokens, 1};
@@ -275,7 +275,7 @@ void MPT::create_mpt_model(FFModel &ff,
   }
 
   // If PEFT is enabled, add LoRA layers
-  if (ff.config.enable_peft) {
+  if (peft_enabled(ff.config.peft_support_mode)) {
     // todo: add attention projections
     std::vector<std::string> target_modules = {"up_proj", "down_proj"};
     ff.add_lora_layers(target_modules);
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index da7bc6ab8..c68e57ffb 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -43,7 +43,7 @@ void OPT::create_opt_model(FFModel &ff,
   int batch_tensor_num_tokens = BatchConfig::max_tokens_per_batch();
   if (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) {
     batch_tensor_num_tokens = BatchConfig::max_verify_tokens_per_batch();
-  } else if (ff.config.enable_peft_finetuning) {
+  } else if (peft_finetuning_enabled(ff.config.peft_support_mode)) {
     batch_tensor_num_tokens = BatchConfig::max_sequence_length();
   }
   int const token_dims[] = {batch_tensor_num_tokens, 1};
@@ -286,7 +286,7 @@ void OPT::create_opt_model(FFModel &ff,
   }
 
   // If PEFT is enabled, add LoRA layers
-  if (ff.config.enable_peft) {
+  if (peft_enabled(ff.config.peft_support_mode)) {
     // todo: add attention projections
     std::vector<std::string> target_modules = {"fc1", "fc2"};
     ff.add_lora_layers(target_modules);
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 7c24b9614..b9b4847bb 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -57,7 +57,7 @@ void STARCODER::create_starcoder_model(
   int batch_tensor_num_tokens = BatchConfig::max_tokens_per_batch();
   if (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) {
     batch_tensor_num_tokens = BatchConfig::max_verify_tokens_per_batch();
-  } else if (ff.config.enable_peft_finetuning) {
+  } else if (peft_finetuning_enabled(ff.config.peft_support_mode)) {
     batch_tensor_num_tokens = BatchConfig::max_sequence_length();
   }
   int const token_dims[] = {batch_tensor_num_tokens, 1};
@@ -257,7 +257,7 @@ void STARCODER::create_starcoder_model(
   }
 
   // If PEFT is enabled, add LoRA layers
-  if (ff.config.enable_peft) {
+  if (peft_enabled(ff.config.peft_support_mode)) {
     // todo: add attention projections
     std::vector<std::string> target_modules = {"c_fc", "c_proj"};
     ff.add_lora_layers(target_modules);
diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc
index 1b3bbfc0d..6ad8d628e 100644
--- a/inference/peft/peft.cc
+++ b/inference/peft/peft.cc
@@ -46,7 +46,7 @@ void parse_input_args(char **argv,
                       bool &use_full_precision,
                       bool &verbose,
                       bool &do_sample,
-                      bool &enable_peft,
+                      PeftSupportMode &peft_support_mode,
                       float &temperature,
                       float &topp,
                       int &max_requests_per_batch,
@@ -65,8 +65,26 @@ void parse_input_args(char **argv,
       }
       continue;
     }
-    if (!strcmp(argv[i], "-enable-peft")) {
-      enable_peft = true;
+    if (!strcmp(argv[i], "--peft-support-mode")) {
+      std::string mode = argv[++i];
+      // Convert to lowercase for comparison
+      for (char &c : mode) {
+        c = std::tolower(c);
+      }
+      if (mode == "disabled") {
+        peft_support_mode = PEFT_DISABLED;
+      } else if (mode == "inference_only" || mode == "inference-only") {
+        peft_support_mode = PEFT_INFERENCE_ONLY;
+      } else if (mode == "coserving") {
+        peft_support_mode = COSERVING;
+      } else if (mode == "temporal_sharing" || mode == "temporal-sharing") {
+        peft_support_mode = TEMPORAL_SHARING;
+      } else if (mode == "spatial_sharing" || mode == "spatial-sharing") {
+        peft_support_mode = SPATIAL_SHARING;
+      } else {
+        std::cerr << "Unknown peft support mode: " << mode << std::endl;
+        assert(false && "Invalid peft support mode");
+      }
       continue;
     }
     if (!strcmp(argv[i], "-peft-model")) {
@@ -202,14 +220,13 @@ void FlexFlow::top_level_task(Task const *task,
   bool use_full_precision = false;
   bool verbose = false;
   bool do_sample = false;
-  bool enable_peft = false;
+  ffconfig.peft_support_mode = PEFT_INFERENCE_ONLY;
   float temperature = 0.0f;
   float topp = 0.0f;
   int max_requests_per_batch = 1;
   int max_tokens_per_batch = 128;
   int max_sequence_length = 256;
   int max_training_epochs = 2;
-  bool enable_peft_finetuning = true;
   int num_layers_per_finetuning_step = -1;
   bool run_warmup = false;
   int num_kv_cache_slots = -1;
@@ -225,7 +242,7 @@ void FlexFlow::top_level_task(Task const *task,
                    use_full_precision,
                    verbose,
                    do_sample,
-                   enable_peft,
+                   ffconfig.peft_support_mode,
                    temperature,
                    topp,
                    max_requests_per_batch,
@@ -243,8 +260,9 @@ void FlexFlow::top_level_task(Task const *task,
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
          ffconfig.numNodes * ffconfig.workersPerNode);
-  enable_peft_finetuning = file_paths.dataset_file_path.empty() ? false : true;
-  ffconfig.enable_peft_finetuning = enable_peft_finetuning;
+  if (!file_paths.dataset_file_path.empty()) {
+    ffconfig.peft_support_mode = COSERVING;
+  }
 
   std::string config_filepath = join_path(
       {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"});
@@ -261,14 +279,14 @@ void FlexFlow::top_level_task(Task const *task,
               << std::endl;
     assert(false);
   }
-  if (!enable_peft) {
+  if (!peft_enabled(ffconfig.peft_support_mode)) {
     std::cerr << "Running PEFT script with PEFT not enabled" << std::endl;
     assert(false);
   }
-  if (enable_peft && peft_model_name.empty()) {
+  if (peft_enabled(ffconfig.peft_support_mode) && peft_model_name.empty()) {
     std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl;
     assert(false);
-  } else if (!enable_peft && !peft_model_name.empty()) {
+  } else if (!peft_enabled(ffconfig.peft_support_mode) && !peft_model_name.empty()) {
     std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl;
     assert(false);
   }
@@ -326,13 +344,13 @@ void FlexFlow::top_level_task(Task const *task,
           : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name);
 
   LoraOptimizerConfig *optim_config = nullptr;
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(ffconfig.peft_support_mode)) {
     // float sgd_learning_rate = 2e-1;
     float sgd_learning_rate = 0.001f;
     optim_config = new LoraSGDOptimizerConfig(sgd_learning_rate);
   }
   LoraLinearConfig peft_config_finetuning =
-      !enable_peft_finetuning
+      !peft_finetuning_enabled(ffconfig.peft_support_mode)
           ? LoraLinearConfig::EmptyConfig
           : LoraLinearConfig(file_paths.cache_folder_path,
                              peft_model_name,
@@ -347,16 +365,16 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_verbose(verbose);
   rm->set_max_requests_per_batch(
       max_requests_per_batch +
-      (int)enable_peft_finetuning); // add one slot for finetuning if needed
+      (int)peft_finetuning_enabled(ffconfig.peft_support_mode)); // add one slot for finetuning if needed
   // rm->set_max_concurrent_adapters(max_requests_per_batch +
-  //                                 (int)enable_peft_finetuning);
+  //                                 (int)peft_finetuning_enabled(ffconfig.peft_support_mode));
   rm->set_max_concurrent_adapters(1);
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->register_tokenizer(
       model_type, bos_token_id, eos_token_ids, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);
-  rm->set_enable_peft_finetuning(enable_peft_finetuning);
+  rm->set_peft_support_mode(ffconfig.peft_support_mode);
 
   FFModel model(ffconfig, ffconfig.cpu_offload);
   model.set_num_kv_cache_pages(compute_num_kv_cache_pages_needed(
@@ -408,10 +426,10 @@ void FlexFlow::top_level_task(Task const *task,
 
   // Add PEFT adapter(s)
   PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr;
-  if (!peft_model_name.empty() && !enable_peft_finetuning) {
+  if (!peft_model_name.empty() && !peft_finetuning_enabled(ffconfig.peft_support_mode)) {
     peft_model_id = model.register_peft_adapter(peft_config);
   }
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(ffconfig.peft_support_mode)) {
     peft_model_id_finetuning =
         model.register_peft_adapter(peft_config_finetuning);
   }
@@ -453,7 +471,7 @@ void FlexFlow::top_level_task(Task const *task,
     }
 
     // Add fine-tuning request
-    if (enable_peft_finetuning) {
+    if (peft_finetuning_enabled(ffconfig.peft_support_mode)) {
       assert(!file_paths.dataset_file_path.empty() &&
              "Dataset file path is required for fine-tuning.");
       printf("Finetuning request with dataset %s\n",
diff --git a/inference/python/chat.py b/inference/python/chat.py
index 7c7454375..4700d1171 100644
--- a/inference/python/chat.py
+++ b/inference/python/chat.py
@@ -34,7 +34,7 @@ def get_configs():
         "offload_reserve_space_size": 8 * 1024,  # 8GB
         "use_4bit_quantization": False,
         "use_8bit_quantization": False,
-        "enable_peft": False,
+        "peft_support_mode": ff.PeftSupportMode.PEFT_DISABLED,
         "profiling": False,
         "benchmarking": False,
         "inference_debugging": False,
diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py
index 42853ce3b..9585f6b5e 100644
--- a/inference/python/ff_peft.py
+++ b/inference/python/ff_peft.py
@@ -33,7 +33,13 @@ def get_configs():
             raise FileNotFoundError(f"Config file {args.config_file} not found.")
         try:
             with open(args.config_file) as f:
-                return json.load(f)
+                config = json.load(f)
+            if "peft_support_mode" in config and isinstance(config["peft_support_mode"], str):
+                try:
+                    config["peft_support_mode"] = ff.PeftSupportMode[config["peft_support_mode"]]
+                except KeyError:
+                    raise ValueError(f"Invalid peft_support_mode value: {config['peft_support_mode']}")
+            return config
         except json.JSONDecodeError as e:
             print("JSON format error:")
             print(e)
@@ -54,7 +60,7 @@ def get_configs():
             "offload_reserve_space_size": 8 * 1024,  # 8GB
             "use_4bit_quantization": False,
             "use_8bit_quantization": False,
-            "enable_peft": True,
+            "peft_support_mode": ff.PeftSupportMode.PEFT_INFERENCE_ONLY,
             "profiling": False,
             "inference_debugging": False,
             "fusion": False,
@@ -107,17 +113,20 @@ def main():
     generation_config = ff.GenerationConfig(
         do_sample=False, temperature=0.9, topp=0.8, topk=1
     )
-    enable_peft_finetuning = len(configs.finetuning_dataset) > 0
+    
+    if len(configs.finetuning_dataset) > 0:
+        configs.peft_support_mode = ff.PeftSupportMode.COSERVING
+        configs_dict["max_requests_per_batch"] = configs_dict.get("max_requests_per_batch", 1) + 1
+        configs_dict["max_concurrent_adapters"] = configs_dict.get("max_concurrent_adapters", 1) + 1
+
     llm.compile(
         generation_config,
-        max_requests_per_batch=configs_dict.get("max_requests_per_batch", 1)
-        + enable_peft_finetuning,
+        max_requests_per_batch=configs_dict.get("max_requests_per_batch", 1),
         max_seq_length=configs_dict.get("max_seq_length", 256),
         max_tokens_per_batch=configs_dict.get("max_tokens_per_batch", 128),
         num_kv_cache_slots=configs_dict.get("num_kv_cache_slots", -1),
-        max_concurrent_adapters=configs_dict.get("max_concurrent_adapters", 1)
-        + enable_peft_finetuning,
-        enable_peft_finetuning=enable_peft_finetuning,
+        max_concurrent_adapters=configs_dict.get("max_concurrent_adapters", 1),
+        peft_support_mode=configs.peft_support_mode,
     )
 
     llm.start_server()
diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py
index 7deb1b5bd..a77f9ea18 100644
--- a/inference/python/incr_decoding.py
+++ b/inference/python/incr_decoding.py
@@ -33,7 +33,13 @@ def get_configs():
             raise FileNotFoundError(f"Config file {args.config_file} not found.")
         try:
             with open(args.config_file) as f:
-                return json.load(f)
+                config = json.load(f)
+            if "peft_support_mode" in config and isinstance(config["peft_support_mode"], str):
+                try:
+                    config["peft_support_mode"] = ff.PeftSupportMode[config["peft_support_mode"]]
+                except KeyError:
+                    raise ValueError(f"Invalid peft_support_mode value: {config['peft_support_mode']}")
+            return config
         except json.JSONDecodeError as e:
             print("JSON format error:")
             print(e)
@@ -54,7 +60,7 @@ def get_configs():
             "offload_reserve_space_size": 8 * 1024,  # 8GB
             "use_4bit_quantization": False,
             "use_8bit_quantization": False,
-            "enable_peft": False,
+            "peft_support_mode": ff.PeftSupportMode.DISABLED,
             "profiling": False,
             "benchmarking": False,
             "inference_debugging": False,
diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py
index 1c803993c..8604452e1 100644
--- a/inference/python/spec_infer.py
+++ b/inference/python/spec_infer.py
@@ -33,7 +33,13 @@ def get_configs():
             raise FileNotFoundError(f"Config file {args.config_file} not found.")
         try:
             with open(args.config_file) as f:
-                return json.load(f)
+                config = json.load(f)
+            if "peft_support_mode" in config and isinstance(config["peft_support_mode"], str):
+                try:
+                    config["peft_support_mode"] = ff.PeftSupportMode[config["peft_support_mode"]]
+                except KeyError:
+                    raise ValueError(f"Invalid peft_support_mode value: {config['peft_support_mode']}")
+            return config
         except json.JSONDecodeError as e:
             print("JSON format error:")
             print(e)
@@ -54,7 +60,7 @@ def get_configs():
             "offload_reserve_space_size": 8 * 1024,  # 8GB
             "use_4bit_quantization": False,
             "use_8bit_quantization": False,
-            "enable_peft": False,
+            "peft_support_mode": ff.PeftSupportMode.DISABLED,
             "profiling": False,
             "benchmarking": False,
             "inference_debugging": False,
diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py
index 6c7c53d96..c15e65fe2 100644
--- a/inference/python/streamlit/fastapi_incr.py
+++ b/inference/python/streamlit/fastapi_incr.py
@@ -128,7 +128,7 @@ def get_configs():
             "offload_reserve_space_size": 8 * 1024, # 8GB
             "use_4bit_quantization": False,
             "use_8bit_quantization": False,
-            "enable_peft": True,
+            "peft_support_mode": ff.PeftSupportMode.COSERVING,
             "profiling": False,
             "benchmarking": False,
             "inference_debugging": False,
@@ -187,7 +187,7 @@ async def startup_event():
         num_kv_cache_slots=configs_dict.get("num_kv_cache_slots", -1),
         max_concurrent_adapters=configs_dict.get("max_concurrent_adapters", 1)
         + 1,  # +1 for the finetuning request
-        enable_peft_finetuning=True,
+        peft_support_mode=ff.PeftSupportMode.COSERVING,
     )
     llm.start_server()
 
diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py
index 94e925e2f..f7ea11c40 100644
--- a/python/flexflow/core/__init__.py
+++ b/python/flexflow/core/__init__.py
@@ -84,7 +84,6 @@
     "offload_reserve_space_size": "-offload-reserve-space-size",
     "use_4bit_quantization": "--4bit-quantization",
     "use_8bit_quantization": "--8bit-quantization",
-    "enable_peft": "-enable-peft",
 }
 
 
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index 443cb9bfe..50de97545 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -33,6 +33,9 @@
     ModelType,
     OpType,
     ParameterSyncType,
+    PeftSupportMode,
+    peft_finetuning_enabled,
+    peft_enabled,
     enum_to_int,
     int_to_enum,
     data_type_size,
@@ -814,20 +817,19 @@ def python_data_loader_type(self):
         return ffc().flexflow_config_get_python_data_loader_type(self.handle)
 
     @property
-    def enable_peft(self):
-        return ffc().flexflow_config_get_enable_peft(self.handle)
+    def peft_support_mode(self):
+        c_peft_support_mode = ffc().flexflow_config_get_peft_support_mode(self.handle)
+        return int_to_enum(PeftSupportMode, c_peft_support_mode)
 
-    @property
-    def enable_peft_finetuning(self):
-        return ffc().flexflow_config_get_enable_peft_finetuning(self.handle)
 
-    @enable_peft_finetuning.setter
-    def enable_peft_finetuning(self, value):
-        if type(value) is not bool:
+    @peft_support_mode.setter
+    def peft_support_mode(self, value: PeftSupportMode):
+        if not isinstance(value, PeftSupportMode):
             raise ValueError(
-                "enable_peft_finetuning must be specified as a boolean value"
+                "peft_support_mode must be one of the valid options: {}".format(list(PeftSupportMode))
             )
-        ffc().flexflow_config_set_enable_peft_finetuning(self.handle, value)
+        c_peft_support_mode = enum_to_int(PeftSupportMode, value)
+        ffc().flexflow_config_set_peft_support_mode(self.handle, c_peft_support_mode)
 
     @property
     def cpu_offload(self):
@@ -1644,7 +1646,7 @@ def register_ssm_model(self, model):
 
     # Max requests per batch
     def set_max_requests_per_batch(self, max_requests):
-        return ffc().flexflow_request_manager_set_max_requests_per_batch(
+        ffc().flexflow_request_manager_set_max_requests_per_batch(
             self.handle, max_requests
         )
 
@@ -1653,7 +1655,7 @@ def get_max_requests_per_batch(self):
 
     # Max tokens per batch
     def set_max_tokens_per_batch(self, max_tokens):
-        return ffc().flexflow_request_manager_set_max_tokens_per_batch(
+        ffc().flexflow_request_manager_set_max_tokens_per_batch(
             self.handle, max_tokens
         )
 
@@ -1662,7 +1664,7 @@ def get_max_tokens_per_batch(self):
 
     # Max spec tree token num
     def set_max_spec_tree_token_num(self, max_tokens):
-        return ffc().flexflow_request_manager_set_max_spec_tree_token_num(
+        ffc().flexflow_request_manager_set_max_spec_tree_token_num(
             self.handle, max_tokens
         )
 
@@ -1677,7 +1679,7 @@ def get_max_verify_tokens_per_batch(self):
 
     # Max sequence length
     def set_max_sequence_length(self, max_length):
-        return ffc().flexflow_request_manager_set_max_sequence_length(
+        ffc().flexflow_request_manager_set_max_sequence_length(
             self.handle, max_length
         )
 
@@ -1686,25 +1688,28 @@ def get_max_sequence_length(self):
 
     # Num transformer layers
     def set_num_transformers_layers(self, num_layers):
-        return ffc().flexflow_request_manager_set_num_transformers_layers(
+        ffc().flexflow_request_manager_set_num_transformers_layers(
             self.handle, num_layers
         )
 
     # Num layers per finetuning steps
     def set_num_layers_per_finetuning_step(self, num_layers):
-        return ffc().flexflow_request_manager_set_num_layers_per_finetuning_step(
+        ffc().flexflow_request_manager_set_num_layers_per_finetuning_step(
             self.handle, num_layers
         )
 
     def set_max_concurrent_adapters(self, max_adapters):
-        return ffc().flexflow_request_manager_set_max_concurrent_adapters(
+        ffc().flexflow_request_manager_set_max_concurrent_adapters(
             self.handle, max_adapters
         )
 
-    def set_enable_peft_finetuning(self, enable_peft_finetuning):
-        return ffc().flexflow_request_manager_set_enable_peft_finetuning(
-            self.handle, enable_peft_finetuning
-        )
+    def set_peft_support_mode(self, value: PeftSupportMode):
+        if not isinstance(value, PeftSupportMode):
+            raise ValueError(
+                "peft_support_mode must be one of the valid options: {}".format(list(PeftSupportMode))
+            )
+        c_peft_support_mode = enum_to_int(PeftSupportMode, value)
+        ffc().flexflow_request_manager_set_peft_support_mode(self.handle, c_peft_support_mode)
 
     def start_server(self, model):
         return ffc().flexflow_request_manager_start_background_server(
diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py
index 6c547d295..a194dc175 100644
--- a/python/flexflow/serve/__init__.py
+++ b/python/flexflow/serve/__init__.py
@@ -54,7 +54,7 @@ def init(
     offload_reserve_space_size: Optional[int] = None,
     use_4bit_quantization: Optional[bool] = None,
     use_8bit_quantization: Optional[bool] = None,
-    enable_peft: Optional[bool] = None,
+    peft_support_mode: Optional[PeftSupportMode] = None,
     profiling: Optional[bool] = None,
     benchmarking: Optional[bool] = None,
     inference_debugging: Optional[bool] = None,
@@ -85,7 +85,7 @@ def init(
     - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB
     - use_4bit_quantization: whether to use 4-bit quantization, defaults to False
     - use_8bit_quantization: whether to use 8-bit quantization, defaults to False
-    - enable_peft: whether to enable the use of PEFT, defaults to False
+    - peft_support_mode: what kind of PEFT support to enable, defaults to PEFT_DISABLED
     - profiling: whether to enable the FlexFlow profiling mode, defaults to False
     - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False
     - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False
@@ -123,8 +123,8 @@ def init(
     :type use_4bit_quantization: Optional[bool], optional
     :param use_8bit_quantization: whether to use 8-bit quantization, defaults to False
     :type use_8bit_quantization: Optional[bool], optional
-    :param enable_peft: whether to enable the use of PEFT, defaults to False
-    :type enable_peft: Optional[bool], optional
+    :param peft_support_mode: what kind of PEFT support to enable, defaults to PEFT_DISABLED
+    :type peft_support_mode: Optional[PeftSupportMode], optional
     :param profiling: whether to enable the FlexFlow profiling mode, defaults to False
     :type profiling: Optional[bool], optional
     :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False
@@ -157,7 +157,7 @@ def init(
             offload_reserve_space_size is not None,
             use_4bit_quantization is not None,
             use_8bit_quantization is not None,
-            enable_peft is not None,
+            peft_support_mode is not None,
             profiling is not None,
             benchmarking is not None,
             inference_debugging is not None,
@@ -186,14 +186,14 @@ def init(
             "offload_reserve_space_size": offload_reserve_space_size,
             "use_4bit_quantization": use_4bit_quantization,
             "use_8bit_quantization": use_8bit_quantization,
-            "enable_peft": enable_peft,
+            "peft_support_mode": peft_support_mode,
             "profiling": profiling,
             "benchmarking": benchmarking,
             "inference_debugging": inference_debugging,
             "fusion": fusion,
             "log_instance_cration": log_instance_cration,
         }
-
+    print("configs_dict: ", configs_dict)
     # Check that mandatory configs are present
     required_keys = ["num_gpus", "memory_per_gpu", "zero_copy_memory_per_node"]
     for required_key in required_keys:
@@ -235,8 +235,8 @@ def init(
         configs_dict["use_4bit_quantization"] = False
     if configs_dict.get("use_8bit_quantization", None) is None:
         configs_dict["use_8bit_quantization"] = False
-    if configs_dict.get("enable_peft", None) is None:
-        configs_dict["enable_peft"] = False
+    if configs_dict.get("peft_support_mode", None) is None:
+        configs_dict["peft_support_mode"] = PeftSupportMode.PEFT_DISABLED
     if configs_dict.get("profiling", None) is None:
         configs_dict["profiling"] = False
     if configs_dict.get("benchmarking", None) is None:
diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py
index b418d5bdc..25d19564f 100644
--- a/python/flexflow/serve/models/falcon.py
+++ b/python/flexflow/serve/models/falcon.py
@@ -105,7 +105,7 @@ def build_model(self):
         batch_tensor_num_tokens = self.rm.get_max_tokens_per_batch()
         if is_spec:
             batch_tensor_num_tokens = self.rm.get_max_verify_tokens_per_batch()
-        elif self.ffconfig.enable_peft_finetuning:
+        elif peft_finetuning_enabled(self.ffconfig.peft_support_mode):
             batch_tensor_num_tokens = self.rm.get_max_sequence_length()
 
         tokens_dims = [batch_tensor_num_tokens, 1]
@@ -263,7 +263,7 @@ def build_model(self):
                 softmax = self.ffmodel.softmax(lm_head, -1)
                 output = self.ffmodel.argmax(softmax, False)
 
-        if self.ffconfig.enable_peft:
+        if peft_enabled(self.ffconfig.peft_support_mode):
             # TODO: add attention projections
             self.ffmodel.add_lora_layers(["dense_h_to_4h", "dense_4h_to_h"])
 
diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py
index f9099e542..649323e8b 100644
--- a/python/flexflow/serve/models/llama.py
+++ b/python/flexflow/serve/models/llama.py
@@ -105,7 +105,7 @@ def build_model(self):
         batch_tensor_num_tokens = self.rm.get_max_tokens_per_batch()
         if is_spec:
             batch_tensor_num_tokens = self.rm.get_max_verify_tokens_per_batch()
-        elif self.ffconfig.enable_peft_finetuning:
+        elif peft_finetuning_enabled(self.ffconfig.peft_support_mode):
             batch_tensor_num_tokens = self.rm.get_max_sequence_length()
 
         tokens_dims = [batch_tensor_num_tokens, 1]
@@ -267,7 +267,7 @@ def build_model(self):
                 softmax = self.ffmodel.softmax(dense, -1)
                 output = self.ffmodel.argmax(softmax, False)
 
-        if self.ffconfig.enable_peft:
+        if peft_enabled(self.ffconfig.peft_support_mode):
             # TODO: add attention projections
             self.ffmodel.add_lora_layers(["gate_proj", "up_proj", "down_proj", "o_proj", "qkv_proj"])
 
diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py
index b2d8c90ce..e524aa5c8 100644
--- a/python/flexflow/serve/models/mpt.py
+++ b/python/flexflow/serve/models/mpt.py
@@ -75,7 +75,7 @@ def build_model(self):
         batch_tensor_num_tokens = self.rm.get_max_tokens_per_batch()
         if is_spec:
             batch_tensor_num_tokens = self.rm.get_max_verify_tokens_per_batch()
-        elif self.ffconfig.enable_peft_finetuning:
+        elif peft_finetuning_enabled(self.ffconfig.peft_support_mode):
             batch_tensor_num_tokens = self.rm.get_max_sequence_length()
 
         tokens_dims = [batch_tensor_num_tokens, 1]
@@ -255,7 +255,7 @@ def build_model(self):
             softmax = self.ffmodel.softmax(lm_head, -1)
             output = self.ffmodel.argmax(softmax, False)
 
-        if self.ffconfig.enable_peft:
+        if peft_enabled(self.ffconfig.peft_support_mode):
             # TODO: add attention projections
             self.ffmodel.add_lora_layers(["up_proj", "down_proj"])
 
diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py
index e93760346..7aa12391e 100644
--- a/python/flexflow/serve/models/opt.py
+++ b/python/flexflow/serve/models/opt.py
@@ -84,7 +84,7 @@ def build_model(self):
         batch_tensor_num_tokens = self.rm.get_max_tokens_per_batch()
         if is_spec:
             batch_tensor_num_tokens = self.rm.get_max_verify_tokens_per_batch()
-        elif self.ffconfig.enable_peft_finetuning:
+        elif peft_finetuning_enabled(self.ffconfig.peft_support_mode):
             batch_tensor_num_tokens = self.rm.get_max_sequence_length()
 
         tokens_dims = [batch_tensor_num_tokens, 1]
@@ -283,7 +283,7 @@ def build_model(self):
                 softmax = self.ffmodel.softmax(lm_head, -1)
                 output = self.ffmodel.argmax(softmax, False)
 
-        if self.ffconfig.enable_peft:
+        if peft_enabled(self.ffconfig.peft_support_mode):
             # TODO: add attention projections
             self.ffmodel.add_lora_layers(["fc1", "fc2"])
 
diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py
index 2cc1cff5b..aa52357ac 100644
--- a/python/flexflow/serve/models/starcoder.py
+++ b/python/flexflow/serve/models/starcoder.py
@@ -81,7 +81,7 @@ def build_model(self):
         batch_tensor_num_tokens = self.rm.get_max_tokens_per_batch()
         if is_spec:
             batch_tensor_num_tokens = self.rm.get_max_verify_tokens_per_batch()
-        elif self.ffconfig.enable_peft_finetuning:
+        elif peft_finetuning_enabled(self.ffconfig.peft_support_mode):
             batch_tensor_num_tokens = self.rm.get_max_sequence_length()
 
         tokens_dims = [batch_tensor_num_tokens, 1]
@@ -221,7 +221,7 @@ def build_model(self):
             softmax = self.ffmodel.softmax(lm_head, -1)
             output = self.ffmodel.argmax(softmax, False)
 
-        if self.ffconfig.enable_peft:
+        if peft_enabled(self.ffconfig.peft_support_mode):
             # TODO: add attention projections
             self.ffmodel.add_lora_layers(["c_fc", "c_proj"])
 
diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py
index ca467a8f6..17156eec9 100644
--- a/python/flexflow/serve/serve.py
+++ b/python/flexflow/serve/serve.py
@@ -471,7 +471,7 @@ def compile(
         max_tokens_per_batch: int = 64,
         num_kv_cache_slots: int = -1,
         max_concurrent_adapters: int = 1,
-        enable_peft_finetuning: bool = False,
+        peft_support_mode: PeftSupportMode = PeftSupportMode.PEFT_DISABLED,
         num_bwd_layers_per_ft_step: int = -1,
         ssms: list = [],
     ):
@@ -489,8 +489,8 @@ def compile(
         :type num_kv_cache_slots: int, optional
         :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1
         :type max_concurrent_adapters: int, optional
-        :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False
-        :type enable_peft_finetuning: bool, optional
+        :param peft_support_mode: The PEFT support mode to use, defaults to PeftSupportMode.PEFT_DISABLED
+        :type peft_support_mode: PeftSupportMode, optional
         :param num_bwd_layers_per_ft_step: The number of backward layers to run per finetuning step, defaults to -1 (i.e. all layers)
         :type num_bwd_layers_per_ft_step: int, optional
         :param ssms: The SSMs to use when operating in speculative inference mode, defaults to []
@@ -514,7 +514,7 @@ def compile(
         self.max_spec_tree_token_num = 20
 
         self.max_seq_length = max_seq_length
-        self.ffconfig.enable_peft_finetuning = enable_peft_finetuning
+        self.ffconfig.peft_support_mode = peft_support_mode
         self.num_kv_cache_slots = num_kv_cache_slots
         if num_kv_cache_slots < 0:
             if is_spec:
@@ -531,7 +531,7 @@ def compile(
         self.rm.set_max_spec_tree_token_num(self.max_spec_tree_token_num)
         self.rm.set_max_sequence_length(max_seq_length)
         self.rm.set_max_concurrent_adapters(max_concurrent_adapters)
-        self.rm.set_enable_peft_finetuning(enable_peft_finetuning)
+        self.rm.set_peft_support_mode(peft_support_mode)
         self.rm.set_num_transformers_layers(self.hf_config.num_hidden_layers)
         if num_bwd_layers_per_ft_step != -1:
             self.rm.set_num_layers_per_finetuning_step(num_bwd_layers_per_ft_step)
@@ -805,7 +805,7 @@ def compile(
         max_tokens_per_batch: int = 2048,
         num_kv_cache_slots: int = -1,
         max_concurrent_adapters: int = 1,
-        enable_peft_finetuning: bool = False,
+        peft_support_mode: PeftSupportMode = PeftSupportMode.PEFT_DISABLED,
         num_bwd_layers_per_ft_step: int = -1,
         ssms: list = [],
     ):
@@ -822,8 +822,8 @@ def compile(
         :type num_kv_cache_slots: int, optional
         :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1
         :type max_concurrent_adapters: int, optional
-        :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False
-        :type enable_peft_finetuning: bool, optional
+        :param peft_support_mode: The PEFT support mode to use, defaults to PeftSupportMode.PEFT_DISABLED
+        :type peft_support_mode: PeftSupportMode, optional
         :param num_bwd_layers_per_ft_step: The number of backward layers to run per finetuning step, defaults to -1 (i.e. all layers)
         :type num_bwd_layers_per_ft_step: int, optional
         :param ssms: The SSMs to use when operating in speculative inference mode, defaults to []
@@ -836,7 +836,7 @@ def compile(
             max_tokens_per_batch,
             num_kv_cache_slots,
             max_concurrent_adapters,
-            enable_peft_finetuning,
+            peft_support_mode,
             num_bwd_layers_per_ft_step,
             ssms,
         )
\ No newline at end of file
diff --git a/python/flexflow/type.py b/python/flexflow/type.py
index c2eebe899..0db7ce39c 100644
--- a/python/flexflow/type.py
+++ b/python/flexflow/type.py
@@ -163,6 +163,20 @@ class RequestType(Enum):
     REQ_INFERENCE = 4001
     REQ_FINETUNING = 4002
 
+class PeftSupportMode(Enum):
+    PEFT_DISABLED = 5001
+    PEFT_INFERENCE_ONLY = 5002
+    COSERVING = 5003
+    TEMPORAL_SHARING = 5004
+    SPATIAL_SHARING = 5005
+    def __str__(self):
+        return self.name
+
+def peft_finetuning_enabled(peft_support_mode: PeftSupportMode):
+    return peft_support_mode != PeftSupportMode.PEFT_DISABLED and peft_support_mode != PeftSupportMode.PEFT_INFERENCE_ONLY
+
+def peft_enabled(peft_support_mode: PeftSupportMode):
+    return peft_support_mode != PeftSupportMode.PEFT_DISABLED
 
 def enum_to_int(enum, enum_item):
     for item in enum:
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 23439b1fe..4921aa54b 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -174,18 +174,15 @@ void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_,
   handle->pipeline_parallelism_degree = value;
 }
 
-bool flexflow_config_get_enable_peft(flexflow_config_t handle_) {
+void flexflow_config_set_peft_support_mode(flexflow_config_t handle_,
+                                            enum PeftSupportMode value){
   FFConfig *handle = FFCObjectWrapper::unwrap(handle_);
-  return handle->enable_peft;
+  handle->peft_support_mode = value;
 }
-bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_) {
-  FFConfig *handle = FFCObjectWrapper::unwrap(handle_);
-  return handle->enable_peft_finetuning;
-}
-void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_,
-                                                bool value) {
+
+enum PeftSupportMode flexflow_config_get_peft_support_mode(flexflow_config_t handle_){
   FFConfig *handle = FFCObjectWrapper::unwrap(handle_);
-  handle->enable_peft_finetuning = value;
+  return handle->peft_support_mode;
 }
 
 int flexflow_config_get_python_data_loader_type(flexflow_config_t handle_) {
@@ -2675,12 +2672,11 @@ void flexflow_request_manager_set_max_concurrent_adapters(
               max_concurrent_adapters);
 }
 
-void flexflow_request_manager_set_enable_peft_finetuning(
-    flexflow_request_manager_t handle_, bool enable_peft_finetuning_) {
+void flexflow_request_manager_set_peft_support_mode(
+    flexflow_request_manager_t handle_, enum PeftSupportMode peft_support_mode_) {
   RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
-  handle->set_enable_peft_finetuning(enable_peft_finetuning_);
-  DEBUG_PRINT("[RequestManager] set_enable_peft_finetuning %d",
-              enable_peft_finetuning_);
+  handle->set_peft_support_mode(peft_support_mode_);
+  DEBUG_PRINT("[RequestManager] set peft support mode %d", peft_support_mode_);
 }
 
 void flexflow_request_manager_set_num_transformers_layers(
diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu
index 16629f493..b4ade9737 100644
--- a/src/ops/add_bias_residual_layer_norm.cu
+++ b/src/ops/add_bias_residual_layer_norm.cu
@@ -35,14 +35,12 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta(
   effective_num_elements = ln->effective_num_elements;
   profiling = ln->profiling;
   inference_debugging = ln->inference_debugging;
-  enable_peft_finetuning = ln->enable_peft_finetuning;
   eps = ln->eps;
   DataType data_type = ln->data_type;
   size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree;
   allocated_peft_buffer_size =
-      enable_peft_finetuning ? (data_type_size(data_type) *
-                                BatchConfig::max_sequence_length() * in_dim)
-                             : 0;
+      peft_finetuning_enabled(peft_support_mode) ? (data_type_size(data_type) * BatchConfig::max_sequence_length() * in_dim)
+                                                  : 0;
   size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 +
                      allocated_peft_buffer_size;
 
@@ -54,7 +52,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta(
       data_type_size(data_type) * effective_batch_size);
   bias_ptr = gpu_mem_allocator.allocate_instance_untyped(
       data_type_size(data_type) * effective_batch_size);
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     input_activation =
         gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size);
   }
diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc
index 13b4e8e4c..c83b738a0 100644
--- a/src/ops/aggregate.cc
+++ b/src/ops/aggregate.cc
@@ -245,7 +245,6 @@ OpMeta *Aggregate::init_task(Task const *task,
   AggregateMeta *m = new AggregateMeta(handle, agg);
   m->profiling = agg->profiling;
   m->inference_debugging = agg->inference_debugging;
-  m->enable_peft_finetuning = agg->enable_peft_finetuning;
   std::strcpy(m->op_name, agg->name);
   m->layer_guid = agg->layer_guid;
   return m;
diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc
index 7c9322a93..6ea3ff374 100644
--- a/src/ops/aggregate_spec.cc
+++ b/src/ops/aggregate_spec.cc
@@ -213,7 +213,6 @@ OpMeta *AggregateSpec::init_task(Task const *task,
   AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg);
   m->profiling = agg->profiling;
   m->inference_debugging = agg->inference_debugging;
-  m->enable_peft_finetuning = agg->enable_peft_finetuning;
   std::strcpy(m->op_name, agg->name);
   m->layer_guid = agg->layer_guid;
   return m;
diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index 282773b35..16d6baf63 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -278,7 +278,6 @@ OpMeta *ArgTopK::init_task(Task const *task,
   ArgTopKMeta *m = new ArgTopKMeta(handle, topk);
   m->profiling = topk->profiling;
   m->inference_debugging = topk->inference_debugging;
-  m->enable_peft_finetuning = topk->enable_peft_finetuning;
   m->sorted = topk->sorted;
   m->k = topk->k;
   std::strcpy(m->op_name, topk->name);
diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc
index f479d3815..e58e48f80 100644
--- a/src/ops/argmax.cc
+++ b/src/ops/argmax.cc
@@ -248,7 +248,6 @@ OpMeta *ArgMax::init_task(Task const *task,
                                  gpu_mem_allocator);
   m->profiling = s->profiling;
   m->inference_debugging = s->inference_debugging;
-  m->enable_peft_finetuning = s->enable_peft_finetuning;
   m->beam_search = s->beam_search;
   std::strcpy(m->op_name, s->name);
   m->layer_guid = s->layer_guid;
diff --git a/src/ops/attention.cc b/src/ops/attention.cc
index 940aacb9c..aef4f0a16 100644
--- a/src/ops/attention.cc
+++ b/src/ops/attention.cc
@@ -519,7 +519,6 @@ OpMeta *
       new MultiHeadAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads);
   m->profiling = attn->profiling;
   m->inference_debugging = attn->inference_debugging;
-  m->enable_peft_finetuning = attn->enable_peft_finetuning;
   std::strcpy(m->op_name, attn->name);
   m->layer_guid = attn->layer_guid;
   assert(acc_weight.rect.volume() * sizeof(float) == m->weightSize);
diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc
index 2e5d64fb3..e5f0611fb 100644
--- a/src/ops/batch_matmul.cc
+++ b/src/ops/batch_matmul.cc
@@ -282,7 +282,6 @@ OpMeta *BatchMatmul::init_task(Task const *task,
   BatchMatmulMeta *m = new BatchMatmulMeta(handle, bmm);
   m->profiling = bmm->profiling;
   m->inference_debugging = bmm->inference_debugging;
-  m->enable_peft_finetuning = bmm->enable_peft_finetuning;
   m->a_seq_length_dim = bmm->a_seq_length_dim;
   m->b_seq_length_dim = bmm->b_seq_length_dim;
   std::strcpy(m->op_name, bmm->name);
diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu
index 065fcd10f..01e993067 100644
--- a/src/ops/batch_norm.cu
+++ b/src/ops/batch_norm.cu
@@ -277,7 +277,6 @@ BatchNormMeta::BatchNormMeta(FFHandler handler,
   relu = bn->relu;
   profiling = bn->profiling;
   inference_debugging = bn->inference_debugging;
-  enable_peft_finetuning = bn->enable_peft_finetuning;
   mode = CUDNN_BATCHNORM_SPATIAL;
 #if CUDNN_VERSION >= 7000
   mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc
index 098684530..c6c8f5544 100644
--- a/src/ops/beam_topk.cc
+++ b/src/ops/beam_topk.cc
@@ -276,7 +276,6 @@ OpMeta *BeamTopK::init_task(Task const *task,
   BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator);
   m->profiling = topk->profiling;
   m->inference_debugging = topk->inference_debugging;
-  m->enable_peft_finetuning = topk->enable_peft_finetuning;
   std::strcpy(m->op_name, topk->name);
   m->layer_guid = topk->layer_guid;
   m->sorted = topk->sorted;
diff --git a/src/ops/cache.cc b/src/ops/cache.cc
index 73d4d3f27..33b862ae8 100644
--- a/src/ops/cache.cc
+++ b/src/ops/cache.cc
@@ -169,7 +169,6 @@ OpMeta *Cache::init_task(Task const *task,
   m->cache_score = 0.0f;
   m->profiling = c->profiling;
   m->inference_debugging = c->inference_debugging;
-  m->enable_peft_finetuning = c->enable_peft_finetuning;
   std::strcpy(m->op_name, c->name);
   m->layer_guid = c->layer_guid;
   return m;
diff --git a/src/ops/concat.cc b/src/ops/concat.cc
index cffd6fa85..0a82779b6 100644
--- a/src/ops/concat.cc
+++ b/src/ops/concat.cc
@@ -202,7 +202,6 @@ OpMeta *Concat::init_task(Task const *task,
   init_meta(m, cc->legion_axis);
   m->profiling = cc->profiling;
   m->inference_debugging = cc->inference_debugging;
-  m->enable_peft_finetuning = cc->enable_peft_finetuning;
   std::strcpy(m->op_name, cc->name);
   m->layer_guid = cc->layer_guid;
   return m;
diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc
index 7aa494029..2428c9b99 100644
--- a/src/ops/conv_2d.cc
+++ b/src/ops/conv_2d.cc
@@ -593,7 +593,6 @@ OpMeta *Conv2D::init_task(Task const *task,
   m->use_bias = conv->use_bias;
   m->profiling = conv->profiling;
   m->inference_debugging = conv->inference_debugging;
-  m->enable_peft_finetuning = conv->enable_peft_finetuning;
   m->trainable_inputs[0] = conv->trainable_inputs[0];
   m->reset_input_grads[0] = conv->trainable_inputs[0];
   std::strcpy(m->op_name, conv->name);
diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc
index 47515e7c0..cf8696182 100644
--- a/src/ops/element_binary.cc
+++ b/src/ops/element_binary.cc
@@ -434,7 +434,6 @@ OpMeta *ElementBinary::init_task(Task const *task,
   m->op_type = eb->op_type;
   m->profiling = eb->profiling;
   m->inference_debugging = eb->inference_debugging;
-  m->enable_peft_finetuning = eb->enable_peft_finetuning;
   m->inplace_a = eb->inplace_a;
   m->has_same_operands = eb->has_same_operands;
   m->broadcast_input1 = eb->broadcast_input1;
@@ -1034,7 +1033,6 @@ bool ElementBinary::measure_operator_cost(Simulator *sim,
   m->op_type = op_type;
   m->profiling = this->profiling;
   m->inference_debugging = this->inference_debugging;
-  m->enable_peft_finetuning = this->enable_peft_finetuning;
   m->inplace_a = this->inplace_a;
   m->has_same_operands = this->has_same_operands;
   m->broadcast_input1 = this->broadcast_input1;
diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc
index 9db848370..09cf13c71 100644
--- a/src/ops/element_unary.cc
+++ b/src/ops/element_unary.cc
@@ -361,7 +361,6 @@ OpMeta *ElementUnary::init_task(Task const *task,
   assert(eu->outputs[0]->data_type == eu->inputs[0]->data_type);
   m->profiling = eu->profiling;
   m->inference_debugging = eu->inference_debugging;
-  m->enable_peft_finetuning = eu->enable_peft_finetuning;
   m->inplace = eu->inplace;
   m->scalar = eu->scalar;
   std::strcpy(m->op_name, eu->name);
diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc
index 592a8c11f..a1af564c9 100644
--- a/src/ops/embedding.cc
+++ b/src/ops/embedding.cc
@@ -411,7 +411,6 @@ OpMeta *Embedding::init_task(Task const *task,
   EmbeddingMeta *m = new EmbeddingMeta(handle, embed);
   m->profiling = embed->profiling;
   m->inference_debugging = embed->inference_debugging;
-  m->enable_peft_finetuning = embed->enable_peft_finetuning;
   m->aggr = embed->aggr;
   std::strcpy(m->op_name, embed->name);
   m->layer_guid = embed->layer_guid;
diff --git a/src/ops/experts.cc b/src/ops/experts.cc
index 25d656848..cf7beb303 100644
--- a/src/ops/experts.cc
+++ b/src/ops/experts.cc
@@ -592,7 +592,6 @@ OpMeta *Experts::init_task(Task const *task,
   ExpertsMeta *m = new ExpertsMeta(handle, exp);
   m->profiling = exp->profiling;
   m->inference_debugging = exp->inference_debugging;
-  m->enable_peft_finetuning = exp->enable_peft_finetuning;
   std::strcpy(m->op_name, exp->name);
   m->layer_guid = exp->layer_guid;
   return m;
diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc
index aa7a3079e..03b9a5199 100644
--- a/src/ops/group_by.cc
+++ b/src/ops/group_by.cc
@@ -274,7 +274,6 @@ OpMeta *Group_by::init_task(Task const *task,
   GroupByMeta *m = new GroupByMeta(handle, gb);
   m->profiling = gb->profiling;
   m->inference_debugging = gb->inference_debugging;
-  m->enable_peft_finetuning = gb->enable_peft_finetuning;
   std::strcpy(m->op_name, gb->name);
   m->layer_guid = gb->layer_guid;
   return m;
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index fe9399600..d3a4fc800 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -512,7 +512,6 @@ OpMeta *IncMultiHeadSelfAttention::init_task(
          peft_mem_allocator.instance_total_size);
   // m->profiling = attn->profiling;
   // m->inference_debugging = attn->inference_debugging;
-  // m->enable_peft_finetuning = attn->enable_peft_finetuning;
   std::strcpy(m->op_name, attn->name);
   m->layer_guid = attn->layer_guid;
 
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 561d1b177..4ee060032 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -2328,10 +2328,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   *position_bias = _position_bias;
 
   num_kv_cache_pages = _num_kv_cache_pages;
-  assert(num_kv_cache_pages > 0 || enable_peft_finetuning);
+  assert(num_kv_cache_pages > 0 || peft_finetuning_enabled(peft_support_mode));
 
   // spec decoding and peft finetuning are mutually exclusive
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     assert(infer_mode == INC_DECODING_MODE);
   }
 
@@ -2365,7 +2365,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                   BatchConfig::max_spec_tree_token_num()));
     }
     kv_cache_instance_size += (key_cache_size + value_cache_size) * size_of_dt;
-    if (enable_peft_finetuning) {
+    if (peft_finetuning_enabled(peft_support_mode)) {
       // add kv cache for single sequence
       peft_key_cache_size = peft_value_cache_size =
           num_kv_heads * kProjSize * BatchConfig::max_sequence_length();
@@ -2393,7 +2393,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                    (num_q_heads + num_kv_heads) /
                    2; // only used for Q and K, not V
     inf_instance_size += complex_size * sizeof(cuFloatComplex);
-    if (enable_peft_finetuning) {
+    if (peft_finetuning_enabled(peft_support_mode)) {
       complex_size_bwd = BatchConfig::max_sequence_length() * qProjSize *
                          (num_q_heads + num_kv_heads) /
                          2; // only used for Q and K, not V
@@ -2406,7 +2406,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       inf_instance_size += 2 * qk_prod_size * size_of_dt;
     }
     // PEFT partial results buffers
-    if (enable_peft_finetuning) {
+    if (peft_finetuning_enabled(peft_support_mode)) {
       allocated_peft_buffer_size1 = BatchConfig::max_sequence_length() *
                                     num_q_heads * qProjSize * size_of_dt;
       flash_attn_softmax_lse_size =
@@ -2466,7 +2466,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       valueCache = kv_cache_mem_allocator.allocate_instance_untyped(
           value_cache_size * size_of_dt);
     }
-    if (enable_peft_finetuning) {
+    if (peft_finetuning_enabled(peft_support_mode)) {
       assert(infer_mode == INC_DECODING_MODE);
       keyCachePeft = peft_mem_allocator.allocate_instance_untyped(
           peft_key_cache_size * size_of_dt);
@@ -2520,7 +2520,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
           qk_prod_size * size_of_dt);
     }
     // peft partial result buffers
-    if (enable_peft_finetuning) {
+    if (peft_finetuning_enabled(peft_support_mode)) {
       query_activation_buffer = peft_mem_allocator.allocate_instance_untyped(
           allocated_peft_buffer_size1);
       peft_token_infos_device =
diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu
index 15eb4d18a..d65b951f5 100644
--- a/src/ops/kernels/dropout_kernels.cu
+++ b/src/ops/kernels/dropout_kernels.cu
@@ -30,7 +30,6 @@ DropoutMeta::DropoutMeta(FFHandler handler,
     : OpMeta(handler, dropout) {
   profiling = dropout->profiling;
   inference_debugging = dropout->inference_debugging;
-  enable_peft_finetuning = dropout->enable_peft_finetuning;
   checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
   checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
   checkCUDNN(cudnnCreateDropoutDescriptor(&dropoutDesc));
diff --git a/src/ops/kernels/element_binary_kernels.cu b/src/ops/kernels/element_binary_kernels.cu
index 91475c6a8..42b31a664 100644
--- a/src/ops/kernels/element_binary_kernels.cu
+++ b/src/ops/kernels/element_binary_kernels.cu
@@ -31,7 +31,6 @@ ElementBinaryMeta::ElementBinaryMeta(FFHandler handler, Op const *op)
   op_type = OP_NOOP;
   profiling = false;
   inference_debugging = false;
-  enable_peft_finetuning = false;
   inplace_a = false;
   has_same_operands = false;
   broadcast_input1 = false;
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index f40c31433..c3f048f06 100644
--- a/src/ops/kernels/linear_kernels.cu
+++ b/src/ops/kernels/linear_kernels.cu
@@ -50,7 +50,7 @@ LinearMeta::LinearMeta(FFHandler handler,
   // peft activation
   size_t out_dim =
       li->outputs[0]->dims[0].size / li->outputs[0]->dims[0].degree;
-  if (enable_peft_finetuning &&
+  if (peft_finetuning_enabled(peft_support_mode) &&
       (activation == AC_MODE_RELU || activation == AC_MODE_SIGMOID)) {
     // Allocate space for storing the output activations for PEFT finetuning
     // during inference
@@ -87,7 +87,7 @@ LinearMeta::LinearMeta(FFHandler handler,
   } else {
     one_ptr = nullptr;
   }
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     output_activation_buffer =
         gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size);
   } else {
diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 47a235fc9..3550e49b5 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -37,7 +37,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler,
 
   size_t rms_ptr_size = 0;
   allocated_peft_buffer_size = 0;
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     rms_ptr_size = rms->effective_batch_size * sizeof(float);
     allocated_peft_buffer_size =
         BatchConfig::max_sequence_length() * in_dim * data_size;
@@ -45,7 +45,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler,
   size_t totalSize = rms_ptr_size + allocated_peft_buffer_size;
   gpu_mem_allocator.create_legion_instance(
       reserveInst, totalSize, "ResidualRMSNormMeta");
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     rms_ptr = gpu_mem_allocator.allocate_instance_untyped(rms_ptr_size);
     input_activation =
         gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size);
@@ -202,7 +202,7 @@ void store_peft_activations(ResidualRMSNormMeta const *m,
                             size_t in_dim,
                             DT const *residual_output_ptr,
                             cudaStream_t stream) {
-  assert(m->enable_peft_finetuning);
+  assert(peft_finetuning_enabled(m->peft_support_mode));
   assert(bc->num_finetuning_fwd_tokens() >= 1);
 
   int num_ft_tokens = bc->num_finetuning_fwd_tokens();
diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu
index f9403bf86..b3b8bcbb2 100644
--- a/src/ops/kernels/rms_norm_kernels.cu
+++ b/src/ops/kernels/rms_norm_kernels.cu
@@ -34,7 +34,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler,
   size_t data_size = data_type_size(rms->weights[0]->data_type);
 
   size_t rms_ptr_size = rms->effective_batch_size * data_size;
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     allocated_peft_buffer_size =
         BatchConfig::max_sequence_length() * in_dim * data_size;
   } else {
@@ -45,7 +45,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler,
   gpu_mem_allocator.create_legion_instance(
       reserveInst, totalSize, "RMSNormMeta");
   rms_ptr = gpu_mem_allocator.allocate_instance_untyped(rms_ptr_size);
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     input_activation =
         gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size);
   }
@@ -188,7 +188,7 @@ void store_peft_activations(RMSNormMeta const *m,
                             size_t in_dim,
                             DT const *input_ptr,
                             cudaStream_t stream) {
-  assert(m->enable_peft_finetuning);
+  assert(peft_finetuning_enabled(m->peft_support_mode));
   assert(bc->num_finetuning_fwd_tokens() >= 1);
 
   int num_ft_tokens = bc->num_finetuning_fwd_tokens();
diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu
index 3a7864baf..436b85b05 100644
--- a/src/ops/kernels/softmax.cu
+++ b/src/ops/kernels/softmax.cu
@@ -36,8 +36,7 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler,
   dim = softmax->dim;
   profiling = softmax->profiling;
   inference_debugging = softmax->inference_debugging;
-  enable_peft_finetuning = softmax->enable_peft_finetuning;
-  if (enable_peft_finetuning && is_last_op) {
+  if (peft_finetuning_enabled(peft_support_mode) && is_last_op) {
     allocated_peft_buffer_size =
         input_domain.get_volume() * data_type_size(softmax->data_type);
     gpu_mem_allocator.create_legion_instance(
@@ -291,7 +290,7 @@ void store_peft_activations(SoftmaxMeta *m,
                             int num_classes,
                             DT *output_ptr,
                             cudaStream_t stream) {
-  assert(m->enable_peft_finetuning);
+  assert(peft_finetuning_enabled(m->peft_support_mode));
   assert(m->output_grad_ptr != nullptr);
 
   int num_ft_tokens = bc->num_finetuning_fwd_tokens();
diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu
index fecaa067c..6feeae039 100644
--- a/src/ops/layer_norm.cu
+++ b/src/ops/layer_norm.cu
@@ -34,14 +34,12 @@ LayerNormMeta::LayerNormMeta(FFHandler handle,
   effective_num_elements = ln->effective_num_elements;
   profiling = ln->profiling;
   inference_debugging = ln->inference_debugging;
-  enable_peft_finetuning = ln->enable_peft_finetuning;
   eps = ln->eps;
   DataType data_type = ln->data_type;
   size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree;
   allocated_peft_buffer_size =
-      enable_peft_finetuning ? (data_type_size(data_type) *
-                                BatchConfig::max_sequence_length() * in_dim)
-                             : 0;
+      peft_finetuning_enabled(peft_support_mode) ? (data_type_size(data_type) * BatchConfig::max_sequence_length() * in_dim)
+                                                  : 0;
   size_t totalSize = effective_batch_size * data_type_size(data_type) * 6 +
                      allocated_peft_buffer_size;
   gpu_mem_allocator.create_legion_instance(
@@ -58,7 +56,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle,
       data_type_size(data_type) * effective_batch_size);
   bias_ptr = gpu_mem_allocator.allocate_instance_untyped(
       data_type_size(data_type) * effective_batch_size);
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     input_activation =
         gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size);
   }
diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc
index baa1a765a..c53e0f1e7 100644
--- a/src/ops/lora_linear.cc
+++ b/src/ops/lora_linear.cc
@@ -53,7 +53,7 @@ bool check_lora_layer_match(Layer *potential_target,
 }
 
 void FFModel::add_lora_layers(std::vector<std::string> target_modules) {
-  assert(config.enable_peft &&
+  assert(peft_enabled(config.peft_support_mode) &&
          "Cannot add a LoRA layer if PEFT mode is not enabled");
   assert(target_modules.size() > 0 && "LoRA target module name is empty");
   RequestManager *rm = RequestManager::get_request_manager();
diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc
index 18f7c9c77..c8b194afa 100644
--- a/src/ops/pool_2d.cc
+++ b/src/ops/pool_2d.cc
@@ -318,7 +318,6 @@ OpMeta *Pool2D::init_task(Task const *task,
   Pool2DMeta *m = new Pool2DMeta(handle, pool);
   m->profiling = pool->profiling;
   m->inference_debugging = pool->inference_debugging;
-  m->enable_peft_finetuning = pool->enable_peft_finetuning;
   std::strcpy(m->op_name, pool->name);
   m->layer_guid = pool->layer_guid;
   TensorAccessorR<float, Pool2DInput::NUMDIM> acc_input(
diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu
index a258ff294..658add9e4 100644
--- a/src/ops/residual_layer_norm.cu
+++ b/src/ops/residual_layer_norm.cu
@@ -35,15 +35,13 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle,
   effective_num_elements = ln->effective_num_elements;
   profiling = ln->profiling;
   inference_debugging = ln->inference_debugging;
-  enable_peft_finetuning = ln->enable_peft_finetuning;
   eps = ln->eps;
   inplace_residual = ln->inplace_residual;
   DataType data_type = ln->data_type;
   size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree;
   allocated_peft_buffer_size =
-      enable_peft_finetuning ? (data_type_size(data_type) *
-                                BatchConfig::max_sequence_length() * in_dim)
-                             : 0;
+      peft_finetuning_enabled(peft_support_mode) ? (data_type_size(data_type) * BatchConfig::max_sequence_length() * in_dim)
+                                                : 0;
   size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 +
                      allocated_peft_buffer_size;
   gpu_mem_allocator.create_legion_instance(
@@ -54,7 +52,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle,
       data_type_size(data_type) * effective_batch_size);
   bias_ptr = gpu_mem_allocator.allocate_instance_untyped(
       data_type_size(data_type) * effective_batch_size);
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     input_activation =
         gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size);
   }
diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc
index 3555af65c..3413ee514 100644
--- a/src/ops/sampling.cc
+++ b/src/ops/sampling.cc
@@ -232,7 +232,6 @@ OpMeta *Sampling::init_task(Task const *task,
       handle, s, batch_size, length * batch_size, acc_input, gpu_mem_allocator);
   m->profiling = s->profiling;
   m->inference_debugging = s->inference_debugging;
-  m->enable_peft_finetuning = s->enable_peft_finetuning;
   std::strcpy(m->op_name, s->name);
   m->layer_guid = s->layer_guid;
   m->top_p = s->top_p;
diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu
index 22c4d79cc..6fb13f9b8 100644
--- a/src/ops/sigmoid_silu_multi.cu
+++ b/src/ops/sigmoid_silu_multi.cu
@@ -25,8 +25,7 @@ SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle,
     : OpMeta(handle, ssm) {
   profiling = ssm->profiling;
   inference_debugging = ssm->inference_debugging;
-  enable_peft_finetuning = ssm->enable_peft_finetuning;
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     size_t in_dim =
         ssm->inputs[0]->dims[0].size / ssm->inputs[0]->dims[0].degree;
     allocated_peft_buffer_size = 2 * data_type_size(input_type[0]) *
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index b93f3f74b..b779cbba8 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -468,7 +468,6 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
          peft_mem_allocator.instance_total_size);
   m->profiling = attn->profiling;
   m->inference_debugging = attn->inference_debugging;
-  m->enable_peft_finetuning = attn->enable_peft_finetuning;
   std::strcpy(m->op_name, attn->name);
   m->layer_guid = attn->layer_guid;
   return m;
diff --git a/src/ops/topk.cc b/src/ops/topk.cc
index e710b9182..0e88befa6 100644
--- a/src/ops/topk.cc
+++ b/src/ops/topk.cc
@@ -229,7 +229,6 @@ OpMeta *TopK::init_task(Task const *task,
   TopKMeta *m = new TopKMeta(handle, topk);
   m->profiling = topk->profiling;
   m->inference_debugging = topk->inference_debugging;
-  m->enable_peft_finetuning = topk->enable_peft_finetuning;
   m->sorted = topk->sorted;
   std::strcpy(m->op_name, topk->name);
   m->layer_guid = topk->layer_guid;
diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc
index a32228714..bffde477d 100644
--- a/src/ops/transpose.cc
+++ b/src/ops/transpose.cc
@@ -197,7 +197,6 @@ OpMeta *Transpose::init_task(Task const *task,
   transpose->init_meta(m, in_domain, out_domain);
   m->profiling = transpose->profiling;
   m->inference_debugging = transpose->inference_debugging;
-  m->enable_peft_finetuning = transpose->enable_peft_finetuning;
   std::strcpy(m->op_name, transpose->name);
   m->layer_guid = transpose->layer_guid;
   return m;
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 1d4847bac..a8ac0a59f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -514,7 +514,6 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
   }
   m->profiling = attn->profiling;
   m->inference_debugging = attn->inference_debugging;
-  m->enable_peft_finetuning = attn->enable_peft_finetuning;
   std::strcpy(m->op_name, attn->name);
   m->layer_guid = attn->layer_guid;
 
diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc
index 5a7d98b4d..e8389beef 100644
--- a/src/runtime/ffconst_utils.cc
+++ b/src/runtime/ffconst_utils.cc
@@ -248,4 +248,23 @@ std::ostream &operator<<(std::ostream &s, OperatorType op_type) {
   return s;
 }
 
+const char* peftSupportModeToString(PeftSupportMode mode) {
+  switch(mode) {
+    case PEFT_DISABLED:         return "PEFT_DISABLED";
+    case PEFT_INFERENCE_ONLY:     return "PEFT_INFERENCE_ONLY";
+    case COSERVING:               return "COSERVING";
+    case TEMPORAL_SHARING:        return "TEMPORAL_SHARING";
+    case SPATIAL_SHARING:         return "SPATIAL_SHARING";
+    default:                      return "UNKNOWN";
+  }
+}
+bool peft_finetuning_enabled(PeftSupportMode peft_support_mode) {
+  return peft_support_mode == COSERVING ||
+         peft_support_mode == TEMPORAL_SHARING ||
+         peft_support_mode == SPATIAL_SHARING;
+}
+bool peft_enabled(PeftSupportMode peft_support_mode) {
+  return peft_support_mode != PEFT_DISABLED;
+}
+
 }; // namespace FlexFlow
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 7bc818d12..959af8779 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -127,7 +127,7 @@ Op::Op(FFModel &model,
       numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs),
       profiling(model.config.profiling),
       inference_debugging(model.config.inference_debugging),
-      enable_peft_finetuning(model.config.enable_peft_finetuning) {
+      peft_support_mode(model.config.peft_support_mode) {
   for (int i = 0; i < MAX_NUM_INPUTS; i++) {
     inputs[i] = NULL;
   }
@@ -176,7 +176,7 @@ Op::Op(FFModel &model,
       numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs),
       profiling(model.config.profiling),
       inference_debugging(model.config.inference_debugging),
-      enable_peft_finetuning(model.config.enable_peft_finetuning) {
+      peft_support_mode(model.config.peft_support_mode) {
   std::string pcname;
   if (_name == NULL) {
     pcname = get_operator_type_name(op_type);
@@ -1501,7 +1501,7 @@ bool Op::get_weight_parameter(TNParameter tnp,
 OpMeta::OpMeta(FFHandler _handle, Op const *op)
     : handle(_handle), profiling(op->profiling),
       inference_debugging(op->inference_debugging),
-      enable_peft_finetuning(op->enable_peft_finetuning),
+      peft_support_mode(op->peft_support_mode),
       layer_guid(op->layer_guid) {
   for (int i = 0; i < op->numInputs; i++) {
     trainable_inputs[i] = op->trainable_inputs[i];
@@ -4371,7 +4371,7 @@ struct DefaultConfig {
   const static bool profiling = false;
   const static bool benchmarking = false;
   const static bool inference_debugging = false;
-  const static bool enable_peft_finetuning = false;
+  const static PeftSupportMode peft_support_mode = PEFT_DISABLED;
   constexpr static float learningRate = 0.01f;
   constexpr static float weightDecay = 0.0001f;
   const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB
@@ -4411,7 +4411,6 @@ FFConfig::FFConfig() {
   log_instance_creation = DefaultConfig::log_instance_creation;
   benchmarking = DefaultConfig::benchmarking;
   inference_debugging = DefaultConfig::inference_debugging;
-  enable_peft_finetuning = DefaultConfig::enable_peft_finetuning;
   learningRate = DefaultConfig::learningRate;
   weightDecay = DefaultConfig::weightDecay;
   workSpaceSize = DefaultConfig::workSpaceSize;
@@ -4426,7 +4425,7 @@ FFConfig::FFConfig() {
   cpu_offload = DefaultConfig::cpuOffload;
   offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize;
   // PEFT related fields
-  enable_peft = DefaultConfig::enablePeft;
+  peft_support_mode = DefaultConfig::peft_support_mode;
   quantization_type = DT_NONE;
   only_data_parallel = DefaultConfig::onlyDataParallel;
   data_parallelism_degree = 1;
@@ -4553,10 +4552,6 @@ void FFConfig::parse_args(char **argv, int argc) {
       quantization_type = DT_INT8;
       continue;
     }
-    if ((!strcmp(argv[i], "-enable-peft"))) {
-      enable_peft = true;
-      continue;
-    }
     if ((!strcmp(argv[i], "--only-data-parallel"))) {
       only_data_parallel = true;
       continue;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index fd6680b1a..a77559aa2 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -379,8 +379,8 @@ void RequestManager::push_spec_infer_tree_width(int tree_width) {
   spec_infer_tree_width.emplace_back(tree_width);
 }
 
-void RequestManager::set_enable_peft_finetuning(bool enable_peft_finetuning_) {
-  enable_peft_finetuning = enable_peft_finetuning_;
+void RequestManager::set_peft_support_mode(PeftSupportMode peft_support_mode_) {
+  peft_support_mode = peft_support_mode_;
 }
 
 void RequestManager::set_inference_finished(bool finished) {
@@ -547,7 +547,7 @@ int RequestManager::get_num_layers_per_finetuning_step() {
 
 PEFTModelID *
     FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) {
-  assert(config.enable_peft &&
+  assert(peft_enabled(config.peft_support_mode) &&
          "Cannot add a LoRA layer if PEFT mode is not enabled");
   if (peft_config.target_modules.size() == 0) {
     printf("PEFT config does not contain any target module\n");
@@ -657,7 +657,7 @@ RequestGuid RequestManager::register_new_request(Request const &request_) {
 }
 
 RequestGuid RequestManager::register_new_peft_request(Request const &request_) {
-  assert(enable_peft_finetuning && "PEFT finetuning is not enabled");
+  assert(peft_finetuning_enabled(peft_support_mode) && "PEFT finetuning is not enabled");
   const std::lock_guard<std::mutex> lock(request_queue_mutex);
   // Add a new request
   Request request = Request::from_other(request_);
@@ -954,7 +954,7 @@ void RequestManager::process_inf_req_progress(BatchConfig const &old_fwd_bc,
     }
   }
   int inference_batch_size =
-      BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning;
+      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   for (int req_idx = 0; req_idx < inference_batch_size; req_idx++) {
     if (old_fwd_bc.request_completed[req_idx]) {
       continue;
@@ -1131,7 +1131,7 @@ void RequestManager::add_continuing_inf_req_to_new_batch(
   assert(processed_tokens < request.tokens.size() &&
          "Continuing request has already finished");
   int inference_batch_size =
-      BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning;
+      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
 
   if (old_bc.requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) {
     num_concurrent_inf_adapters += 1;
@@ -1290,7 +1290,7 @@ void RequestManager::handle_completed_finetuning_req(
   }
 
   int inference_batch_size =
-      BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning;
+      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   assert(!old_finetuning_bc.request_completed[inference_batch_size] &&
          "Finetuning request not found in new batch");
 
@@ -1330,13 +1330,13 @@ void RequestManager::handle_completed_finetuning_req(
 
 void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) {
   // printf("Entering add_finetuning_req_fwd_batch\n");
-  assert(enable_peft_finetuning && "PEFT finetuning is not enabled");
+  assert(peft_finetuning_enabled(peft_support_mode) && "PEFT finetuning is not enabled");
   assert(!pending_peft_request_queue.empty() &&
          "Trying to add a new finetuning request when there are none");
   assert(new_bc.num_tokens < get_max_tokens_per_batch() &&
          "Trying to add a new finetuning request when the batch is full");
   int inference_batch_size =
-      BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning;
+      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   assert(new_bc.request_completed[inference_batch_size] &&
          "Finetuning request already present in new batch");
   Request &request = pending_peft_request_queue.front();
@@ -1401,13 +1401,13 @@ void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) {
 
 void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) {
   // printf("Entering add_finetuning_req_bwd_batch\n");
-  assert(enable_peft_finetuning && "PEFT finetuning is not enabled");
+  assert(peft_finetuning_enabled(peft_support_mode) && "PEFT finetuning is not enabled");
   assert(!pending_peft_request_queue.empty() &&
          "Trying to add a new finetuning request when there are none");
   assert(new_bc.num_tokens <= get_max_tokens_per_batch() &&
          "Trying to add a new finetuning request when the batch is full");
   int inference_batch_size =
-      BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning;
+      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   assert(new_bc.request_completed[inference_batch_size] &&
          "Finetuning request already present in new batch");
   Request &request = pending_peft_request_queue.front();
@@ -1514,7 +1514,7 @@ void RequestManager::process_finetuning_req_fwd_progress(
     return;
   }
   int inference_batch_size =
-      BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning;
+      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   assert(!old_bc.request_completed[inference_batch_size] &&
          "Finetuning request not found in new batch");
   assert(old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch > 0 &&
@@ -1576,7 +1576,7 @@ void RequestManager::process_finetuning_req_bwd_progress(
     return;
   }
   int inference_batch_size =
-      BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning;
+      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   assert(!old_bc.request_completed[inference_batch_size] &&
          "Finetuning request not found in new batch");
   // check that request in batch is the same as the first one in the pending
@@ -1689,7 +1689,7 @@ void RequestManager::process_work_from_old_batch(
 
   // Step 2: Finetuning. Process work from previous bwd iteration: update
   // records of finetuning bwd progress
-  if (enable_peft_finetuning) {
+  if (peft_finetuning_enabled(peft_support_mode)) {
     process_finetuning_req_fwd_progress(old_bc, result);
     process_finetuning_req_bwd_progress(old_bc);
   }
@@ -1730,7 +1730,7 @@ BatchConfig
   // when finetuning is enabled, the last entry in the batch cannot be used for
   // inference
   int inference_batch_size =
-      BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning;
+      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   int num_concurrent_inf_adapters = 0;
 
   // Step 2: evict any requests that will not fit in the kv cache
@@ -3785,7 +3785,7 @@ std::vector<GenerationResult>
     // if the current request has not arrived yet, submit finetuning work if
     // available, then sleep until next request arrives
     else {
-      if (this->config.enable_peft_finetuning && !added_ft_req) {
+      if (peft_finetuning_enabled(this->config.peft_support_mode) && !added_ft_req) {
         // std::cout << "Time " << (current_time_us-start_time_us)/1000 <<
         // "Registering PEFT request" << std::endl;
         RequestGuid guid = rm->register_new_peft_request(ft_requests.at(0));
@@ -4003,7 +4003,7 @@ void RequestManager::serve_incr_decoding(FFModel *llm) {
                                                runtime);
     InferenceResultFuture irf = im->inference(llm, 0, bcf);
     std::vector<FinetuningBwdFuture> bwd_f;
-    if (llm->config.enable_peft) {
+    if (peft_finetuning_enabled(llm->config.peft_support_mode)) {
       bwd_f = im->peft_bwd(llm, 0, bcf);
     } else {
       for (int i = 0; i < tp_degree; i++) {
diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh
index fd15f82f5..95c42d3ab 100755
--- a/tests/inference/cpp_inference_tests.sh
+++ b/tests/inference/cpp_inference_tests.sh
@@ -50,7 +50,7 @@ run_cpp_inference() {
     ["offload_reserve_space_size"]="-offload-reserve-space-size"
     ["use_4bit_quantization"]="--4bit-quantization"
     ["use_8bit_quantization"]="--8bit-quantization"
-    ["enable_peft"]="-enable-peft"
+    ["peft_support_mode"]="--peft_support_mode"
     ["profiling"]="--profiling"
     ["benchmarking"]="--benchmarking"
     ["inference_debugging"]="--inference-debugging"
diff --git a/tests/inference/generate_inf_test_configs.py b/tests/inference/generate_inf_test_configs.py
index 8e1f3d47f..056019bb6 100644
--- a/tests/inference/generate_inf_test_configs.py
+++ b/tests/inference/generate_inf_test_configs.py
@@ -19,7 +19,7 @@
     "offload_reserve_space_size": 8 * 1024, # 8 GB
     "use_4bit_quantization": False,
     "use_8bit_quantization": False,
-    "enable_peft": False,
+    "peft_support_mode": "DISABLED",
     "profiling": False,
     "benchmarking": False,
     "inference_debugging": False,
diff --git a/tests/peft_test.sh b/tests/peft_test.sh
index 3c041835e..b30fbc80b 100755
--- a/tests/peft_test.sh
+++ b/tests/peft_test.sh
@@ -63,7 +63,7 @@ json_config=$(cat <<-END
         "data_parallelism_degree": 1,
         "tensor_parallelism_degree": ${TP_DEGREE},
         "pipeline_parallelism_degree": ${PP_DEGREE},
-        "enable_peft": true,
+        "peft_support_mode": "COSERVING",
         "inference_debugging": true,
         "fusion": ${FUSION},
         "refresh_cache": false,
@@ -93,14 +93,14 @@ echo "C++ test"
 ./build/inference/peft/peft \
     -ll:gpu ${NUM_GPUS} -ll:cpu 4 -ll:util 4 \
     -tensor-parallelism-degree "${TP_DEGREE}" \
-    -ll:fsize "${MEMORY_PER_GPU}" -ll:zsize "${ZCOPY_MEMORY}" \
+    -ll:fsize "${MEMORY_PER_GPU}" -ll:zsize "${ZCOPY_MEMORY}" -ll:csize 2048 \
     --max-requests-per-batch 1 \
     --max-sequence-length 128 \
     --max-tokens-per-batch 128 \
     -llm-model "${BASE_MODEL_NAME}" \
     -finetuning-dataset ./inference/prompt/peft_dataset.json \
     -peft-model "$MODEL_NAME" \
-    -enable-peft \
+    --peft-support-mode COSERVING \
     "${full_precision_flag}" "${fusion_flag}" --inference-debugging
 
 # Check alignment

From 6006746eae7fd07d5423c4289f23a13c8e5bd731 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sun, 13 Apr 2025 22:15:15 +0000
Subject: [PATCH 03/17] update

---
 include/flexflow/ffconst_utils.h  | 6 +++---
 include/flexflow/op_meta.h        | 2 +-
 inference/python/incr_decoding.py | 5 ++++-
 inference/python/spec_infer.py    | 2 +-
 python/flexflow/serve/__init__.py | 2 +-
 src/c/flexflow_c.cc               | 3 ++-
 src/runtime/ffconst_utils.cc      | 6 +++---
 7 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/flexflow/ffconst_utils.h b/include/flexflow/ffconst_utils.h
index ae79bd3b9..cbe0ceaab 100644
--- a/include/flexflow/ffconst_utils.h
+++ b/include/flexflow/ffconst_utils.h
@@ -18,9 +18,9 @@ size_t get_quantization_to_byte_size(DataType type,
 
 std::ostream &operator<<(std::ostream &, OperatorType);
 
-const char* peftSupportModeToString(PeftSupportMode mode);
-bool peft_finetuning_enabled(PeftSupportMode peft_support_mode);
-bool peft_enabled(PeftSupportMode peft_support_mode);
+const char* peftSupportModeToString(const PeftSupportMode mode);
+bool peft_finetuning_enabled(const PeftSupportMode peft_support_mode);
+bool peft_enabled(const PeftSupportMode peft_support_mode);
 
 }; // namespace FlexFlow
 
diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h
index aaf3d943d..3427ff013 100644
--- a/include/flexflow/op_meta.h
+++ b/include/flexflow/op_meta.h
@@ -16,7 +16,7 @@ class OpMeta {
   FFHandler handle;
   bool profiling; // Measure the run time of the task
   bool inference_debugging;
-  enum PeftSupportMode peft_support_mode;
+  PeftSupportMode peft_support_mode;
   int decoding_step;
   int bwd_step;
   char op_name[MAX_OPNAME];
diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py
index a77f9ea18..94db0e074 100644
--- a/inference/python/incr_decoding.py
+++ b/inference/python/incr_decoding.py
@@ -60,7 +60,7 @@ def get_configs():
             "offload_reserve_space_size": 8 * 1024,  # 8GB
             "use_4bit_quantization": False,
             "use_8bit_quantization": False,
-            "peft_support_mode": ff.PeftSupportMode.DISABLED,
+            "peft_support_mode": ff.PeftSupportMode.PEFT_DISABLED,
             "profiling": False,
             "benchmarking": False,
             "inference_debugging": False,
@@ -87,8 +87,11 @@ def get_configs():
 
 
 def main():
+    print("FlexFlow LLM Inference Example (Incremental Decoding)")
     configs_dict = get_configs()
     configs = SimpleNamespace(**configs_dict)
+    print(configs_dict)
+    print(configs)
 
     # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
     ff.init(configs_dict)
diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py
index 8604452e1..fe89ffd54 100644
--- a/inference/python/spec_infer.py
+++ b/inference/python/spec_infer.py
@@ -60,7 +60,7 @@ def get_configs():
             "offload_reserve_space_size": 8 * 1024,  # 8GB
             "use_4bit_quantization": False,
             "use_8bit_quantization": False,
-            "peft_support_mode": ff.PeftSupportMode.DISABLED,
+            "peft_support_mode": ff.PeftSupportMode.PEFT_DISABLED,
             "profiling": False,
             "benchmarking": False,
             "inference_debugging": False,
diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py
index a194dc175..c1da477da 100644
--- a/python/flexflow/serve/__init__.py
+++ b/python/flexflow/serve/__init__.py
@@ -193,7 +193,7 @@ def init(
             "fusion": fusion,
             "log_instance_cration": log_instance_cration,
         }
-    print("configs_dict: ", configs_dict)
+
     # Check that mandatory configs are present
     required_keys = ["num_gpus", "memory_per_gpu", "zero_copy_memory_per_node"]
     for required_key in required_keys:
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 4921aa54b..d97b17196 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -177,6 +177,7 @@ void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_,
 void flexflow_config_set_peft_support_mode(flexflow_config_t handle_,
                                             enum PeftSupportMode value){
   FFConfig *handle = FFCObjectWrapper::unwrap(handle_);
+  DEBUG_PRINT("flexflow_config_set_peft_support_mode peft support mode %s\n", peftSupportModeToString(value));
   handle->peft_support_mode = value;
 }
 
@@ -2676,7 +2677,7 @@ void flexflow_request_manager_set_peft_support_mode(
     flexflow_request_manager_t handle_, enum PeftSupportMode peft_support_mode_) {
   RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
   handle->set_peft_support_mode(peft_support_mode_);
-  DEBUG_PRINT("[RequestManager] set peft support mode %d", peft_support_mode_);
+  DEBUG_PRINT("[RequestManager] set peft_support_mode %s", peftSupportModeToString(peft_support_mode_));
 }
 
 void flexflow_request_manager_set_num_transformers_layers(
diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc
index e8389beef..613d2b06d 100644
--- a/src/runtime/ffconst_utils.cc
+++ b/src/runtime/ffconst_utils.cc
@@ -248,7 +248,7 @@ std::ostream &operator<<(std::ostream &s, OperatorType op_type) {
   return s;
 }
 
-const char* peftSupportModeToString(PeftSupportMode mode) {
+const char* peftSupportModeToString(const PeftSupportMode mode) {
   switch(mode) {
     case PEFT_DISABLED:         return "PEFT_DISABLED";
     case PEFT_INFERENCE_ONLY:     return "PEFT_INFERENCE_ONLY";
@@ -258,12 +258,12 @@ const char* peftSupportModeToString(PeftSupportMode mode) {
     default:                      return "UNKNOWN";
   }
 }
-bool peft_finetuning_enabled(PeftSupportMode peft_support_mode) {
+bool peft_finetuning_enabled(const PeftSupportMode peft_support_mode) {
   return peft_support_mode == COSERVING ||
          peft_support_mode == TEMPORAL_SHARING ||
          peft_support_mode == SPATIAL_SHARING;
 }
-bool peft_enabled(PeftSupportMode peft_support_mode) {
+bool peft_enabled(const PeftSupportMode peft_support_mode) {
   return peft_support_mode != PEFT_DISABLED;
 }
 

From 0621c8461b52ae516bbf7b15165ccb6c40d5e8b9 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sun, 13 Apr 2025 22:35:08 +0000
Subject: [PATCH 04/17] reduce max num tokens per batch

---
 include/flexflow/batch_config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index ebd0ecbfe..e1a153479 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -97,7 +97,7 @@ class BatchConfig {
   // These maximum values are used for copying BatchConfig
   // across workers
   static int const MAX_NUM_REQUESTS = 260;
-  static int const MAX_NUM_TOKENS = 16384;
+  static int const MAX_NUM_TOKENS = 8704;
   static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
   static int const MAX_PEFT_CONFIG_SIZE = 1024;
 

From aa2b9dad83da938ed10fa936f301df99ed12cae7 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Mon, 14 Apr 2025 01:35:22 +0000
Subject: [PATCH 05/17] fix

---
 include/flexflow/batch_config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index e1a153479..4eb80f8ef 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -97,7 +97,7 @@ class BatchConfig {
   // These maximum values are used for copying BatchConfig
   // across workers
   static int const MAX_NUM_REQUESTS = 260;
-  static int const MAX_NUM_TOKENS = 8704;
+  static int const MAX_NUM_TOKENS = 8192;
   static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
   static int const MAX_PEFT_CONFIG_SIZE = 1024;
 

From d63a64822b39cf1e392440fbb12e2b7e641c5407 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Mon, 14 Apr 2025 04:20:45 +0000
Subject: [PATCH 06/17] fix

---
 tests/inference/generate_inf_test_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/inference/generate_inf_test_configs.py b/tests/inference/generate_inf_test_configs.py
index 056019bb6..25982f59c 100644
--- a/tests/inference/generate_inf_test_configs.py
+++ b/tests/inference/generate_inf_test_configs.py
@@ -19,7 +19,7 @@
     "offload_reserve_space_size": 8 * 1024, # 8 GB
     "use_4bit_quantization": False,
     "use_8bit_quantization": False,
-    "peft_support_mode": "DISABLED",
+    "peft_support_mode": "PEFT_DISABLED",
     "profiling": False,
     "benchmarking": False,
     "inference_debugging": False,

From 8b1af8452de3f85e336a9580ee81c7173210401a Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Mon, 14 Apr 2025 03:00:00 -0700
Subject: [PATCH 07/17] finish spatial sharing

---
 include/flexflow/config.h                     |  12 +-
 include/flexflow/ops/argmax.h                 |   8 +-
 .../flexflow/ops/kernels/embedding_kernels.h  |  29 ++-
 include/flexflow/ops/kernels/linear_kernels.h |   7 +-
 .../flexflow/ops/kernels/softmax_kernels.h    |   4 +-
 src/ops/argmax.cc                             |   6 +-
 src/ops/argmax.cu                             | 115 +++++++--
 src/ops/embedding.cc                          |  14 +-
 src/ops/fused.cpp                             |   2 +-
 src/ops/fused.cu                              |  38 +--
 src/ops/inc_multihead_self_attention.cu       |  23 +-
 src/ops/kernels/embedding_kernels.cu          | 138 ++++++++---
 src/ops/kernels/linear_kernels.cu             | 223 ++++++++++++++++--
 src/ops/kernels/lora_linear_kernels.cu        | 209 +++++++++++++++-
 src/ops/kernels/residual_rms_norm_kernels.cu  |  54 ++++-
 src/ops/kernels/rms_norm_kernels.cu           |  48 +++-
 src/ops/kernels/softmax.cu                    |  72 +++++-
 src/ops/linear.cc                             |   5 +-
 src/runtime/model.cu                          |  16 +-
 19 files changed, 856 insertions(+), 167 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 341d7c784..f6e7003bb 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -88,13 +88,17 @@ struct CombinedBatchConfigMetaStruct {
 
 struct FFHandler {
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-  cudnnHandle_t dnn, peft_dnn;
-  cublasHandle_t blas, peft_blas;
+  cudnnHandle_t dnn, peft_fwd_dnn, peft_bwd_dnn;
+  cublasHandle_t blas, peft_fwd_blas, peft_bwd_blas;
   cudaStream_t peft_fwd_stream;
+  cudaEvent_t peft_fwd_can_start;
+  cudaEvent_t peft_fwd_done;
 #else
-  miopenHandle_t dnn, peft_dnn;
-  hipblasHandle_t blas, peft_blas;
+  miopenHandle_t dnn, peft_fwd_dnn, peft_bwd_dnn;
+  hipblasHandle_t blas, peft_fwd_blas, peft_bwd_blas;
   hipStream_t peft_fwd_stream;
+  hipEvent_t peft_fwd_can_start;
+  hipEvent_t peft_fwd_done;
 #endif
   void *workSpace;
   size_t workSpaceSize;
diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h
index 6d64e8e78..4cc1aa2ee 100644
--- a/include/flexflow/ops/argmax.h
+++ b/include/flexflow/ops/argmax.h
@@ -89,22 +89,20 @@ class ArgMax : public Op {
                              MachineView const &pc,
                              CostMetrics &cost_metrics) const override;
   template <typename DT>
-  static void forward_kernel(ArgMaxMeta const *m,
+  static void inference_kernel(ArgMaxMeta const *m,
                              BatchConfig const *bc,
                              DT const *input_ptr,
                              int *indices_ptr,
                              float *prob_ptr,
                              int *parent_ptr,
-                             int length,
-                             int batch_size,
+                             int num_classes,
                              float *loss,
                              ffStream_t stream);
-  static void forward_kernel_wrapper(ArgMaxMeta const *m,
+  static void inference_kernel_wrapper(ArgMaxMeta const *m,
                                      BatchConfig const *bc,
                                      GenericTensorAccessorR const &input,
                                      GenericTensorAccessorW const &indices,
                                      GenericTensorAccessorW const &parent,
-                                     int batch_size,
                                      float *loss);
   Params get_params() const;
 
diff --git a/include/flexflow/ops/kernels/embedding_kernels.h b/include/flexflow/ops/kernels/embedding_kernels.h
index 58afcf4ad..cfddb946b 100644
--- a/include/flexflow/ops/kernels/embedding_kernels.h
+++ b/include/flexflow/ops/kernels/embedding_kernels.h
@@ -17,13 +17,13 @@ class EmbeddingMeta : public OpMeta {
 
 namespace Kernels {
 namespace Embedding {
-void forward_kernel_wrapper(EmbeddingMeta const *m,
+void inference_kernel_wrapper(EmbeddingMeta const *m,
+                              BatchConfig const *bc,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output,
                             GenericTensorAccessorR const &weight,
                             int in_dim,
-                            int out_dim,
-                            int batch_size);
+                            int out_dim);
 void backward_kernel_wrapper(EmbeddingMeta const *m,
                              GenericTensorAccessorR const &input,
                              GenericTensorAccessorR const &output,
@@ -34,17 +34,26 @@ void backward_kernel_wrapper(EmbeddingMeta const *m,
 
 namespace Internal {
 template <typename TI, typename TD>
-void forward_kernel(TI const *input_ptr,
+void forward_kernel(EmbeddingMeta const *m,
+                    BatchConfig const *bc,
+                    TI const *input_ptr,
                     TD *output_ptr,
                     TD const *weight_ptr,
                     int in_dim,
                     int out_dim,
-                    int batch_size,
-                    AggrMode aggr,
-                    int outputSize,
-                    ffStream_t stream);
-
-;
+                    // int batch_size,
+                    // AggrMode aggr,
+                    // int outputSize,
+                    cudaStream_t stream);
+template <typename TI, typename TD>
+void forward_kernel_spatial_sharing(EmbeddingMeta const *m,
+                                    BatchConfig const *bc,
+                                    TI const *input_ptr,
+                                    TD *output_ptr,
+                                    TD const *weight_ptr,
+                                    int in_dim,
+                                    int out_dim,
+                                    cudaStream_t main_stream);
 } // namespace Internal
 } // namespace Embedding
 } // namespace Kernels
diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h
index cc2cfb5af..8f0919cdb 100644
--- a/include/flexflow/ops/kernels/linear_kernels.h
+++ b/include/flexflow/ops/kernels/linear_kernels.h
@@ -58,8 +58,7 @@ void inference_kernel_wrapper(LinearMeta *m,
                               void const *filter_ptr,
                               void const *bias_ptr,
                               int in_dim,
-                              int out_dim,
-                              int batch_size);
+                              int out_dim);
 void peft_bwd_kernel_wrapper(LinearMeta const *m,
                              BatchConfig const *bc,
                              void *input_grad_ptr,
@@ -83,13 +82,13 @@ bool use_activation(ActiMode mode);
 namespace Internal {
 template <typename DT>
 void inference_kernel(LinearMeta const *m,
+                      BatchConfig const *bc,
                       void const *input_ptr,
                       void *output_ptr,
-                      void const *filter_ptr,
+                      void const *weight_ptr,
                       void const *bias_ptr,
                       int in_dim,
                       int out_dim,
-                      int batch_size,
                       ffStream_t stream);
 template <typename DT>
 void store_peft_activations(LinearMeta const *m,
diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h
index 0ab8a515a..deeb127ef 100644
--- a/include/flexflow/ops/kernels/softmax_kernels.h
+++ b/include/flexflow/ops/kernels/softmax_kernels.h
@@ -19,10 +19,10 @@ class SoftmaxMeta : public OpMeta {
               MemoryAllocator &gpu_mem_allocator);
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
   cudnnTensorDescriptor_t inputTensor;
-  cudnnTensorDescriptor_t outputTensor;
+  cudnnTensorDescriptor_t outputTensor, outputTensorPeftFwd;
 #else
   miopenTensorDescriptor_t inputTensor;
-  miopenTensorDescriptor_t outputTensor;
+  miopenTensorDescriptor_t outputTensor, outputTensorPeftFwd;
 #endif
   int dim;
   // PEFT related fields
diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc
index e58e48f80..e953430b7 100644
--- a/src/ops/argmax.cc
+++ b/src/ops/argmax.cc
@@ -354,8 +354,7 @@ BeamInferenceResult
   GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO(
       DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime);
   float loss = 0.0f;
-  ArgMax::forward_kernel_wrapper(
-      m, bc, input, indices, parent, batch_size, &loss);
+  ArgMax::inference_kernel_wrapper(m, bc, input, indices, parent, &loss);
   BeamInferenceResult ir;
   copy_tensor_dev_to_host<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size);
@@ -396,8 +395,7 @@ InferenceResult
   int batch_size = bc->num_active_tokens();
   float loss = 0.0f;
 
-  ArgMax::forward_kernel_wrapper(
-      m, bc, input, indices, parent, batch_size, &loss);
+  ArgMax::inference_kernel_wrapper(m, bc, input, indices, parent, &loss);
 
   InferenceResult ir;
   ir.finetuning_loss = loss;
diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu
index 0dc714c26..a1a508459 100644
--- a/src/ops/argmax.cu
+++ b/src/ops/argmax.cu
@@ -128,27 +128,111 @@ __global__ void compute_sparse_categorical_crossentropy_loss(
   }
 }
 
+template <typename DT>
+void inference_kernel_spatial_sharing(ArgMaxMeta const *m,
+                                      BatchConfig const *bc,
+                                      DT const *input_ptr,
+                                      int *indices_ptr,
+                                      float *prob_ptr,
+                                      int *parent,
+                                      int const num_classes,
+                                      float *loss,
+                                      cudaStream_t main_stream) {
+  assert(!m->beam_search);
+  
+  // launch finetuning fwd tokens kernel if there are any finetuning fwd tokens
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_can_start, main_stream)); 
+    checkCUDA(cudaStreamWaitEvent(m->handle.peft_fwd_stream, m->handle.peft_fwd_can_start, 0));
+
+    launchArgmaxKernel(input_ptr + num_classes * bc->num_inference_tokens(), 
+                      num_classes, 
+                      bc->num_finetuning_fwd_tokens(), 
+                      indices_ptr + bc->num_inference_tokens(), 
+                      prob_ptr + bc->num_inference_tokens(), 
+                      m->handle.peft_fwd_stream);
+
+    // print_tensor(indices_ptr, batch_size, "indices_ptr: ");
+
+    // compute cross-entropy loss if there is a finetuning request
+    assert(loss != nullptr);
+    BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS];
+    int i = bc->finetuning_request_index();
+    assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID);
+    assert(!bc->requestsInfo[i].finetuning_backward_phase);
+    int num_finetuning_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1;
+    assert(num_finetuning_tokens + 1 == bc->num_finetuning_fwd_tokens());
+    int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
+    for (int j = 0; j < num_finetuning_tokens; j++) {
+      token_ids[j] = bc->tokensInfo[j + first_token_offset + 1].token_id;
+    }
+    checkCUDA(
+        cudaMemcpyAsync(m->handle.workSpace,
+                        token_ids,
+                        sizeof(BatchConfig::TokenId) * num_finetuning_tokens,
+                        cudaMemcpyHostToDevice,
+                        m->handle.peft_fwd_stream));
+    // copy loss to d_loss
+    checkCUDA(cudaMemsetAsync(m->d_loss, 0, sizeof(float), m->handle.peft_fwd_stream));
+    compute_sparse_categorical_crossentropy_loss<<<
+        GET_BLOCKS(num_finetuning_tokens),
+        min(CUDA_NUM_THREADS, num_finetuning_tokens),
+        0,
+        m->handle.peft_fwd_stream>>>(input_ptr + first_token_offset * num_classes,
+                  static_cast<BatchConfig::TokenId *>(m->handle.workSpace),
+                  m->d_loss,
+                  num_finetuning_tokens,
+                  num_classes);
+    // copy value from d_loss to loss
+    checkCUDA(cudaMemcpyAsync(
+        loss, m->d_loss, sizeof(float), cudaMemcpyDeviceToHost, m->handle.peft_fwd_stream));
+    *loss = *loss / (float)num_finetuning_tokens;
+
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_done, m->handle.peft_fwd_stream));
+  }
+
+  // launch inference kernel if there are inference tokens
+  if (bc->num_inference_tokens() > 0) {
+    launchArgmaxKernel(
+      input_ptr, num_classes, bc->num_inference_tokens(), indices_ptr, prob_ptr, main_stream);
+  }
+
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaStreamWaitEvent(main_stream, m->handle.peft_fwd_done, 0));
+  }
+}
+
 /*static*/
 template <typename DT>
-void ArgMax::forward_kernel(ArgMaxMeta const *m,
+void ArgMax::inference_kernel(ArgMaxMeta const *m,
                             BatchConfig const *bc,
                             DT const *input_ptr,
                             int *indices_ptr,
                             float *prob_ptr,
                             int *parent,
-                            int const length,
-                            int const batch_size,
+                            int const num_classes,
                             float *loss,
                             cudaStream_t stream) {
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  if (m->peft_support_mode == SPATIAL_SHARING) {
+    inference_kernel_spatial_sharing(m,
+                                     bc,
+                                     input_ptr,
+                                     indices_ptr,
+                                     prob_ptr,
+                                     parent,
+                                     num_classes,
+                                     loss,
+                                     stream);
+    return;
+  }
 
   if (m->beam_search) {
     // set all parents id zero in arg top1 case.
-    checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream));
+    checkCUDA(cudaMemsetAsync(parent, 0, bc->num_active_tokens() * sizeof(int), stream));
   }
 
   launchArgmaxKernel(
-      input_ptr, length, batch_size, indices_ptr, prob_ptr, stream);
+      input_ptr, num_classes, bc->num_active_tokens(), indices_ptr, prob_ptr, stream);
 
   // print_tensor(indices_ptr, batch_size, "indices_ptr: ");
 
@@ -178,11 +262,11 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m,
         GET_BLOCKS(num_finetuning_tokens),
         min(CUDA_NUM_THREADS, num_finetuning_tokens),
         0,
-        stream>>>(input_ptr + first_token_offset * length,
+        stream>>>(input_ptr + first_token_offset * num_classes,
                   static_cast<BatchConfig::TokenId *>(m->handle.workSpace),
                   m->d_loss,
                   num_finetuning_tokens,
-                  length);
+                  num_classes);
     // copy value from d_loss to loss
     checkCUDA(cudaMemcpyAsync(
         loss, m->d_loss, sizeof(float), cudaMemcpyDeviceToHost, stream));
@@ -191,12 +275,11 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m,
 }
 
 /*static*/
-void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m,
+void ArgMax::inference_kernel_wrapper(ArgMaxMeta const *m,
                                     BatchConfig const *bc,
                                     GenericTensorAccessorR const &input,
                                     GenericTensorAccessorW const &indices,
                                     GenericTensorAccessorW const &parent,
-                                    int batch_size,
                                     float *loss) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
@@ -206,31 +289,29 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m,
     cudaEventCreate(&t_end);
     cudaEventRecord(t_start, stream);
   }
-  int length = input.domain.hi()[0] - input.domain.lo()[0] + 1;
+  int num_classes = input.domain.hi()[0] - input.domain.lo()[0] + 1;
 
   if (input.data_type == DT_HALF) {
-    ArgMax::forward_kernel<half>(m,
+    ArgMax::inference_kernel<half>(m,
                                  bc,
                                  input.get_half_ptr(),
                                  indices.get_int32_ptr(),
                                  m->beam_search ? m->probs : nullptr,
                                  m->beam_search ? parent.get_int32_ptr()
                                                 : nullptr,
-                                 length,
-                                 batch_size,
+                                 num_classes,
                                  loss,
                                  stream);
 
   } else if (input.data_type == DT_FLOAT) {
-    ArgMax::forward_kernel<float>(m,
+    ArgMax::inference_kernel<float>(m,
                                   bc,
                                   input.get_float_ptr(),
                                   indices.get_int32_ptr(),
                                   m->beam_search ? m->probs : nullptr,
                                   m->beam_search ? parent.get_int32_ptr()
                                                  : nullptr,
-                                  length,
-                                  batch_size,
+                                  num_classes,
                                   loss,
                                   stream);
   } else {
diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc
index a1af564c9..7136b14c6 100644
--- a/src/ops/embedding.cc
+++ b/src/ops/embedding.cc
@@ -561,24 +561,16 @@ void Embedding::inference_task(Task const *task,
            output.domain.hi()[0] - output.domain.lo()[0]);
   }
 
-  int in_dim, out_dim, effective_batch_size;
+  int in_dim, out_dim;
   if (m->aggr == AGGR_MODE_NONE) {
     in_dim = 1;
     out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1;
-    // effective_batch_size = output.domain.get_volume() / out_dim;
-    effective_batch_size =
-        bc->num_active_tokens(); // use num_active_tokens for inference
-    assert(effective_batch_size * in_dim <= input.domain.get_volume());
   } else {
     in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1;
     out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1;
-    // effective_batch_size = output.domain.get_volume() / out_dim;
-    effective_batch_size =
-        bc->num_active_tokens(); // use num_active_tokens for inference
-    assert(effective_batch_size * in_dim <= input.domain.get_volume());
   }
-  forward_kernel_wrapper(
-      m, input, output, kernel, in_dim, out_dim, effective_batch_size);
+  inference_kernel_wrapper(
+      m, bc, input, output, kernel, in_dim, out_dim);
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp
index d10f919fb..4504c145e 100644
--- a/src/ops/fused.cpp
+++ b/src/ops/fused.cpp
@@ -370,7 +370,7 @@ __host__ void
 
         assert(my_input_accessor[0].data_type == DT_INT32 ||
                my_input_accessor[0].data_type == DT_INT64);
-        Kernels::Embedding::forward_kernel_wrapper(m,
+        Kernels::Embedding::inference_kernel_wrapper(m,
                                                    my_input_accessor[0],
                                                    my_output_accessor[0],
                                                    my_weight_accessor[0],
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 39e58c563..9545704f9 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -234,7 +234,6 @@ __host__ void
         }
         assert(m->input_type[0] == my_input_accessor[0].data_type);
         assert(m->input_type[0] == my_output_accessor[0].data_type);
-        batch_size = bc->num_active_tokens();
         Kernels::Linear::inference_kernel_wrapper(m,
                                                   bc,
                                                   my_input_accessor[0].ptr,
@@ -242,8 +241,7 @@ __host__ void
                                                   my_weight_accessor[0].ptr,
                                                   bias_ptr,
                                                   in_dim,
-                                                  out_dim,
-                                                  batch_size);
+                                                  out_dim);
         break;
       }
       case OP_LORA: {
@@ -356,38 +354,26 @@ __host__ void
                  my_output_accessor[0].domain.hi()[0] -
                      my_output_accessor[0].domain.lo()[0]);
         }
-        int in_dim, out_dim, effective_batch_size;
+        int in_dim, out_dim;
         if (m->aggr == AGGR_MODE_NONE) {
           in_dim = 1;
           out_dim = my_output_accessor[0].domain.hi()[0] -
                     my_output_accessor[0].domain.lo()[0] + 1;
-          // effective_batch_size =
-          //     my_output_accessor[0].domain.get_volume() / out_dim;
-          effective_batch_size = bc->num_active_tokens();
-          assert(effective_batch_size * in_dim <=
-                 my_input_accessor[0].domain.get_volume());
         } else {
           assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM);
           in_dim = my_input_accessor[0].domain.hi()[0] -
                    my_input_accessor[0].domain.lo()[0] + 1;
           out_dim = my_output_accessor[0].domain.hi()[0] -
                     my_output_accessor[0].domain.lo()[0] + 1;
-          // effective_batch_size =
-          //     my_output_accessor[0].domain.get_volume() / out_dim;
-          effective_batch_size = bc->num_active_tokens();
-          assert(effective_batch_size * in_dim <=
-                 my_input_accessor[0].domain.get_volume());
         }
-
         assert(my_input_accessor[0].data_type == DT_INT32 ||
                my_input_accessor[0].data_type == DT_INT64);
-        Kernels::Embedding::forward_kernel_wrapper(m,
-                                                   my_input_accessor[0],
-                                                   my_output_accessor[0],
-                                                   my_weight_accessor[0],
-                                                   in_dim,
-                                                   out_dim,
-                                                   effective_batch_size);
+        Kernels::Embedding::inference_kernel_wrapper(m, bc,
+                                                    my_input_accessor[0],
+                                                    my_output_accessor[0],
+                                                    my_weight_accessor[0],
+                                                    in_dim,
+                                                    out_dim);
         break;
       }
       case OP_GELU:
@@ -748,10 +734,10 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task,
     if (metas->meta[op] != NULL) {
       assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas);
       assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn);
-      assert(metas->meta[start]->handle.peft_blas ==
-             metas->meta[op]->handle.peft_blas);
-      assert(metas->meta[start]->handle.peft_dnn ==
-             metas->meta[op]->handle.peft_dnn);
+      assert(metas->meta[start]->handle.peft_bwd_blas ==
+             metas->meta[op]->handle.peft_bwd_blas);
+      assert(metas->meta[start]->handle.peft_bwd_dnn ==
+             metas->meta[op]->handle.peft_bwd_dnn);
     }
   }
 
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 4ee060032..c4794cf47 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1139,8 +1139,8 @@ void flash_compute_attention_kernel_peft(IncMultiHeadSelfAttentionMeta *m,
     return;
   }
 
-  checkCUDA(cublasSetStream(m->handle.peft_blas, peft_stream));
-  checkCUDNN(cudnnSetStream(m->handle.peft_dnn, peft_stream));
+  checkCUDA(cublasSetStream(m->handle.peft_bwd_blas, peft_stream));
+  checkCUDNN(cudnnSetStream(m->handle.peft_bwd_dnn, peft_stream));
   cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
   cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
@@ -1899,14 +1899,10 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
                           main_stream>>>(
       general_params, data_pointers, rope_params);
 
-  cudaEvent_t prep_done, finetuning_done;
 
   if (bc->num_finetuning_fwd_tokens() > 0) {
-    checkCUDA(cudaEventCreate(&prep_done));
-    checkCUDA(cudaEventCreate(&finetuning_done));
-    // wait until main stream is done running the prep kernel
-    checkCUDA(cudaEventRecord(prep_done, main_stream));
-    cudaStreamWaitEvent(m->handle.peft_fwd_stream, prep_done, 0);
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_can_start, main_stream)); 
+    checkCUDA(cudaStreamWaitEvent(m->handle.peft_fwd_stream, m->handle.peft_fwd_can_start, 0));
 
     flash_compute_attention_kernel_peft<DT>(
         m, bc, output_ptr, shard_id, m->handle.peft_fwd_stream);
@@ -1923,7 +1919,8 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
       m->peft_token_infos[prev_steps_tokens + j] =
           bc->tokensInfo[tokens_previous_requests + j];
     }
-    checkCUDA(cudaEventRecord(finetuning_done, m->handle.peft_fwd_stream));
+
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_done, m->handle.peft_fwd_stream));
   }
 
   // Step 2: Run inference
@@ -1935,9 +1932,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
   }
 
   if (bc->num_finetuning_fwd_tokens() > 0) {
-    checkCUDA(cudaStreamWaitEvent(main_stream, finetuning_done, 0));
-    checkCUDA(cudaEventDestroy(prep_done));
-    checkCUDA(cudaEventDestroy(finetuning_done));
+    checkCUDA(cudaStreamWaitEvent(main_stream, m->handle.peft_fwd_done, 0));
   }
 }
 
@@ -1952,8 +1947,8 @@ void flash_peft_bwd_kernel(IncMultiHeadSelfAttentionMeta *m,
   // Step 0: param check as in peft_bwd_kernel
   // ================================================================
   assert(!m->offload);
-  checkCUDA(cublasSetStream(m->handle.peft_blas, peft_stream));
-  checkCUDNN(cudnnSetStream(m->handle.peft_dnn, peft_stream));
+  checkCUDA(cublasSetStream(m->handle.peft_bwd_blas, peft_stream));
+  checkCUDNN(cudnnSetStream(m->handle.peft_bwd_dnn, peft_stream));
   cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
   cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu
index 60e6da8d8..93ac7ee49 100644
--- a/src/ops/kernels/embedding_kernels.cu
+++ b/src/ops/kernels/embedding_kernels.cu
@@ -30,79 +30,87 @@ namespace Kernels {
 namespace Embedding {
 
 /*static*/
-void forward_kernel_wrapper(EmbeddingMeta const *m,
-                            GenericTensorAccessorR const &input,
-                            GenericTensorAccessorW const &output,
-                            GenericTensorAccessorR const &weight,
-                            int in_dim,
-                            int out_dim,
-                            int batch_size) {
+void inference_kernel_wrapper(EmbeddingMeta const *m,
+                              BatchConfig const *bc,
+                              GenericTensorAccessorR const &input,
+                              GenericTensorAccessorW const &output,
+                              GenericTensorAccessorR const &weight,
+                              int in_dim,
+                              int out_dim) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
+  // int batch_size = bc->num_active_tokens();
+  // assert(batch_size > 0);
   if (input.data_type == DT_INT32) {
     if (weight.data_type == DT_HALF) {
-      Internal::forward_kernel(input.get_int32_ptr(),
+      Internal::forward_kernel(m,bc,
+                               input.get_int32_ptr(),
                                output.get_half_ptr(),
                                weight.get_half_ptr(),
                                in_dim,
                                out_dim,
-                               batch_size,
-                               m->aggr,
-                               output.domain.get_volume(),
+                              //  batch_size,
+                              //  m->aggr,
+                              //  output.domain.get_volume(),
                                stream);
     } else if (weight.data_type == DT_FLOAT) {
-      Internal::forward_kernel(input.get_int32_ptr(),
+      Internal::forward_kernel(m,bc,
+                               input.get_int32_ptr(),
                                output.get_float_ptr(),
                                weight.get_float_ptr(),
                                in_dim,
                                out_dim,
-                               batch_size,
-                               m->aggr,
-                               output.domain.get_volume(),
+                              //  batch_size,
+                              //  m->aggr,
+                              //  output.domain.get_volume(),
                                stream);
     } else if (weight.data_type == DT_DOUBLE) {
-      Internal::forward_kernel(input.get_int32_ptr(),
+      Internal::forward_kernel(m,bc,
+                               input.get_int32_ptr(),
                                output.get_double_ptr(),
                                weight.get_double_ptr(),
                                in_dim,
                                out_dim,
-                               batch_size,
-                               m->aggr,
-                               output.domain.get_volume(),
+                              //  batch_size,
+                              //  m->aggr,
+                              //  output.domain.get_volume(),
                                stream);
     } else {
       assert(false && "Unsupported DataType in Embedding");
     }
   } else if (input.data_type == DT_INT64) {
     if (weight.data_type == DT_HALF) {
-      Internal::forward_kernel(input.get_int64_ptr(),
+      Internal::forward_kernel(m,bc,
+                               input.get_int64_ptr(),
                                output.get_half_ptr(),
                                weight.get_half_ptr(),
                                in_dim,
                                out_dim,
-                               batch_size,
-                               m->aggr,
-                               output.domain.get_volume(),
+                              //  batch_size,
+                              //  m->aggr,
+                              //  output.domain.get_volume(),
                                stream);
     } else if (weight.data_type == DT_FLOAT) {
-      Internal::forward_kernel(input.get_int64_ptr(),
+      Internal::forward_kernel(m,bc,
+                               input.get_int64_ptr(),
                                output.get_float_ptr(),
                                weight.get_float_ptr(),
                                in_dim,
                                out_dim,
-                               batch_size,
-                               m->aggr,
-                               output.domain.get_volume(),
+                              //  batch_size,
+                              //  m->aggr,
+                              //  output.domain.get_volume(),
                                stream);
     } else if (weight.data_type == DT_DOUBLE) {
-      Internal::forward_kernel(input.get_int64_ptr(),
+      Internal::forward_kernel(m,bc,
+                               input.get_int64_ptr(),
                                output.get_double_ptr(),
                                weight.get_double_ptr(),
                                in_dim,
                                out_dim,
-                               batch_size,
-                               m->aggr,
-                               output.domain.get_volume(),
+                              //  batch_size,
+                              //  m->aggr,
+                              //  output.domain.get_volume(),
                                stream);
     } else {
       assert(false && "Unsupported DataType in Embedding");
@@ -172,25 +180,79 @@ __global__ void embed_forward_with_aggr(TI const *input,
 
 /*static*/
 template <typename TI, typename TD>
-void forward_kernel(TI const *input_ptr,
+void forward_kernel_spatial_sharing(EmbeddingMeta const *m,
+                                    BatchConfig const *bc,
+                                    TI const *input_ptr,
+                                    TD *output_ptr,
+                                    TD const *weight_ptr,
+                                    int in_dim,
+                                    int out_dim,
+                                    cudaStream_t main_stream) {
+  assert(input_ptr != nullptr);
+  assert(output_ptr != nullptr);
+  assert(weight_ptr != nullptr);
+  assert (m->aggr == AGGR_MODE_NONE);
+
+  // launch finetuning fwd tokens kernel if there are any finetuning fwd tokens
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_can_start, main_stream)); 
+    checkCUDA(cudaStreamWaitEvent(m->handle.peft_fwd_stream, m->handle.peft_fwd_can_start, 0));
+    int parallelism = bc->num_finetuning_fwd_tokens() * out_dim;
+    int num_inference_tokens = bc->num_inference_tokens();
+    embed_forward_no_aggr<TI, TD>
+      <<<GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, m->handle.peft_fwd_stream>>>(
+          input_ptr+in_dim*num_inference_tokens, 
+          output_ptr+out_dim*num_inference_tokens, 
+          weight_ptr, 
+          out_dim, 
+          bc->num_finetuning_fwd_tokens());
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_done, m->handle.peft_fwd_stream));
+  }
+  
+  // launch inference kernel if there are inference tokens
+  if (bc->num_inference_tokens() > 0) {
+    int parallelism = bc->num_finetuning_fwd_tokens() * out_dim;
+    embed_forward_no_aggr<TI, TD>
+      <<<GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, main_stream>>>(
+          input_ptr, output_ptr, weight_ptr, out_dim, bc->num_inference_tokens());
+  }
+
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaStreamWaitEvent(main_stream, m->handle.peft_fwd_done, 0));
+  }
+}
+
+/*static*/
+template <typename TI, typename TD>
+void forward_kernel(EmbeddingMeta const *m,
+                    BatchConfig const *bc,
+                    TI const *input_ptr,
                     TD *output_ptr,
                     TD const *weight_ptr,
                     int in_dim,
                     int out_dim,
-                    int batch_size,
-                    AggrMode aggr,
-                    int outputSize,
+                    // int batch_size,
+                    // AggrMode aggr,
+                    // int outputSize,
                     cudaStream_t stream) {
+  if (m->peft_support_mode == SPATIAL_SHARING) {
+    forward_kernel_spatial_sharing<TI, TD>(m, bc, input_ptr, output_ptr, weight_ptr, in_dim, out_dim, stream);
+    return;
+  }
   assert(input_ptr != nullptr);
   assert(output_ptr != nullptr);
   assert(weight_ptr != nullptr);
 
-  if (aggr == AGGR_MODE_NONE) {
+  int batch_size = bc->num_active_tokens();
+  int outputSize = batch_size * out_dim;
+  assert(outputSize > 0);
+
+  if (m->aggr == AGGR_MODE_NONE) {
     embed_forward_no_aggr<TI, TD>
         <<<GET_BLOCKS(outputSize), CUDA_NUM_THREADS, 0, stream>>>(
             input_ptr, output_ptr, weight_ptr, out_dim, batch_size);
   } else {
-    assert(aggr == AGGR_MODE_AVG || aggr == AGGR_MODE_SUM);
+    assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM);
     embed_forward_with_aggr<TI, TD>
         <<<GET_BLOCKS(outputSize), CUDA_NUM_THREADS, 0, stream>>>(input_ptr,
                                                                   output_ptr,
@@ -198,7 +260,7 @@ void forward_kernel(TI const *input_ptr,
                                                                   out_dim,
                                                                   in_dim,
                                                                   batch_size,
-                                                                  aggr);
+                                                                  m->aggr);
   }
 }
 
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index c3f048f06..e8fd7fb81 100644
--- a/src/ops/kernels/linear_kernels.cu
+++ b/src/ops/kernels/linear_kernels.cu
@@ -178,8 +178,7 @@ void inference_kernel_wrapper(LinearMeta *m,
                               void const *weight_ptr,
                               void const *bias_ptr,
                               int in_dim,
-                              int out_dim,
-                              int batch_size) {
+                              int out_dim) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   cudaEvent_t t_start, t_end;
@@ -190,14 +189,13 @@ void inference_kernel_wrapper(LinearMeta *m,
   }
 
   if (m->input_type[0] == DT_FLOAT) {
-    Internal::inference_kernel<float>(m,
+    Internal::inference_kernel<float>(m,bc,
                                       input_ptr,
                                       output_ptr,
                                       weight_ptr,
                                       bias_ptr,
                                       in_dim,
                                       out_dim,
-                                      batch_size,
                                       stream);
     if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) &&
         bc->num_finetuning_fwd_requests() > 0) {
@@ -205,14 +203,13 @@ void inference_kernel_wrapper(LinearMeta *m,
           m, bc, out_dim, static_cast<float *>(output_ptr), stream);
     }
   } else if (m->input_type[0] == DT_HALF) {
-    Internal::inference_kernel<half>(m,
+    Internal::inference_kernel<half>(m,bc,
                                      input_ptr,
                                      output_ptr,
                                      weight_ptr,
                                      bias_ptr,
                                      in_dim,
                                      out_dim,
-                                     batch_size,
                                      stream);
     if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) &&
         bc->num_finetuning_fwd_requests() > 0) {
@@ -306,16 +303,212 @@ __global__ void AddBiasWithReLU(DT *output_ptr,
   }
 }
 
+template <typename DT>
+void inference_kernel_spatial_sharing(LinearMeta const *m,
+                                      BatchConfig const *bc,
+                                      void const *input_ptr,
+                                      void *output_ptr,
+                                      void const *weight_ptr,
+                                      void const *bias_ptr,
+                                      int in_dim,
+                                      int out_dim,
+                                      ffStream_t main_stream) {
+  
+  // launch finetuning fwd tokens kernel if there are any finetuning fwd tokens
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_can_start, main_stream)); 
+    checkCUDA(cudaStreamWaitEvent(m->handle.peft_fwd_stream, m->handle.peft_fwd_can_start, 0));
+
+    checkCUDA(cublasSetStream(m->handle.peft_fwd_blas, m->handle.peft_fwd_stream));
+    checkCUDNN(cudnnSetStream(m->handle.peft_fwd_dnn, m->handle.peft_fwd_stream));
+    DT alpha = 1.0f, beta = 0.0f;
+    cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]);
+    cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]);
+    cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]);
+    assert(input_type == weight_type && weight_type == output_type);
+    cudaDataType_t compute_type = output_type;
+    checkCUDA(cublasGemmEx(m->handle.peft_fwd_blas,
+                          CUBLAS_OP_T,
+                          CUBLAS_OP_N,
+                          out_dim,
+                          bc->num_finetuning_fwd_tokens(),
+                          in_dim,
+                          &alpha,
+                          weight_ptr,
+                          weight_type,
+                          in_dim,
+                          static_cast<DT const*>(input_ptr) + in_dim * bc->num_inference_tokens(),
+                          input_type,
+                          in_dim,
+                          &beta,
+                          static_cast<DT*>(output_ptr) + out_dim * bc->num_inference_tokens(),
+                          output_type,
+                          out_dim,
+                          compute_type,
+                          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    // use_bias = True
+    if (bias_ptr != NULL) {
+      // fuse bias and relu
+      if (m->activation == AC_MODE_RELU) {
+        int parallelism = out_dim * bc->num_finetuning_fwd_tokens();
+        AddBiasWithReLU<<<GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, m->handle.peft_fwd_stream>>>(
+            static_cast<DT*>(output_ptr) + out_dim * bc->num_inference_tokens(),
+            static_cast<DT const *>(bias_ptr),
+            out_dim,
+            bc->num_finetuning_fwd_tokens());
+        return;
+      }
+      checkCUDA(cublasGemmEx(m->handle.peft_fwd_blas,
+                            CUBLAS_OP_T,
+                            CUBLAS_OP_N,
+                            out_dim,
+                            bc->num_finetuning_fwd_tokens(),
+                            1,
+                            &alpha,
+                            bias_ptr,
+                            weight_type,
+                            1,
+                            static_cast<DT *>(m->one_ptr),
+                            weight_type,
+                            1,
+                            &alpha,
+                            static_cast<DT*>(output_ptr) + out_dim * bc->num_inference_tokens(),
+                            output_type,
+                            out_dim,
+                            compute_type,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    }
+    if (use_activation(m->activation)) {
+      checkCUDNN(cudnnActivationForward(m->handle.peft_fwd_dnn,
+                                        m->actiDesc,
+                                        &alpha,
+                                        m->outputTensor,
+                                        static_cast<DT*>(output_ptr) + out_dim * bc->num_inference_tokens(),
+                                        &beta,
+                                        m->outputTensor,
+                                        static_cast<DT*>(output_ptr) + out_dim * bc->num_inference_tokens()));
+    } else if (m->activation == AC_MODE_GELU) {
+      size_t elements = (size_t)out_dim * (size_t)bc->num_finetuning_fwd_tokens();
+      constexpr float B = 0.7978845608028654f;   // sqrt(2.0/M_PI)
+      constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI)
+      gelu_forward_kernel<<<GET_BLOCKS(elements), CUDA_NUM_THREADS, 0, m->handle.peft_fwd_stream>>>(
+          elements, B, C, (float *)static_cast<DT*>(output_ptr) + out_dim * bc->num_inference_tokens());
+    } else if (m->activation == AC_MODE_NONE) {
+      // Do nothing
+    } else {
+      assert(false && "Unsupported activation for Linear");
+    }
+
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_done, m->handle.peft_fwd_stream));
+  }
+
+  // launch inference kernel if there are inference tokens
+  if (bc->num_inference_tokens() > 0) {
+
+    checkCUDA(cublasSetStream(m->handle.blas, main_stream));
+    checkCUDNN(cudnnSetStream(m->handle.dnn, main_stream));
+    DT alpha = 1.0f, beta = 0.0f;
+    cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]);
+    cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]);
+    cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]);
+    assert(input_type == weight_type && weight_type == output_type);
+    cudaDataType_t compute_type = output_type;
+    checkCUDA(cublasGemmEx(m->handle.blas,
+                          CUBLAS_OP_T,
+                          CUBLAS_OP_N,
+                          out_dim,
+                          bc->num_inference_tokens(),
+                          in_dim,
+                          &alpha,
+                          weight_ptr,
+                          weight_type,
+                          in_dim,
+                          input_ptr,
+                          input_type,
+                          in_dim,
+                          &beta,
+                          output_ptr,
+                          output_type,
+                          out_dim,
+                          compute_type,
+                          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    // use_bias = True
+    if (bias_ptr != NULL) {
+      // fuse bias and relu
+      if (m->activation == AC_MODE_RELU) {
+        int parallelism = out_dim * bc->num_inference_tokens();
+        AddBiasWithReLU<<<GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, main_stream>>>(
+            static_cast<DT *>(output_ptr),
+            static_cast<DT const *>(bias_ptr),
+            out_dim,
+            bc->num_inference_tokens());
+        return;
+      }
+      checkCUDA(cublasGemmEx(m->handle.blas,
+                            CUBLAS_OP_T,
+                            CUBLAS_OP_N,
+                            out_dim,
+                            bc->num_inference_tokens(),
+                            1,
+                            &alpha,
+                            bias_ptr,
+                            weight_type,
+                            1,
+                            static_cast<DT *>(m->one_ptr),
+                            weight_type,
+                            1,
+                            &alpha,
+                            output_ptr,
+                            output_type,
+                            out_dim,
+                            compute_type,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    }
+    if (use_activation(m->activation)) {
+      checkCUDNN(cudnnActivationForward(m->handle.dnn,
+                                        m->actiDesc,
+                                        &alpha,
+                                        m->outputTensor,
+                                        output_ptr,
+                                        &beta,
+                                        m->outputTensor,
+                                        output_ptr));
+    } else if (m->activation == AC_MODE_GELU) {
+      size_t elements = (size_t)out_dim * (size_t)bc->num_inference_tokens();
+      constexpr float B = 0.7978845608028654f;   // sqrt(2.0/M_PI)
+      constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI)
+      gelu_forward_kernel<<<GET_BLOCKS(elements), CUDA_NUM_THREADS, 0, main_stream>>>(
+          elements, B, C, (float *)output_ptr);
+    } else if (m->activation == AC_MODE_NONE) {
+      // Do nothing
+    } else {
+      assert(false && "Unsupported activation for Linear");
+    }
+
+  }
+
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaStreamWaitEvent(main_stream, m->handle.peft_fwd_done, 0));
+  }
+
+}
+
 template <typename DT>
 void inference_kernel(LinearMeta const *m,
+                      BatchConfig const *bc,
                       void const *input_ptr,
                       void *output_ptr,
                       void const *weight_ptr,
                       void const *bias_ptr,
                       int in_dim,
                       int out_dim,
-                      int batch_size,
                       ffStream_t stream) {
+  if (m->peft_support_mode == SPATIAL_SHARING) {
+    inference_kernel_spatial_sharing<DT>(m, bc, input_ptr, output_ptr, weight_ptr, bias_ptr, in_dim, out_dim, stream);
+    return;
+  }
   // additional processing for uploading weights
   if (m->offload) {
     // Note that we update weight_ptr when uploading weight
@@ -370,7 +563,7 @@ void inference_kernel(LinearMeta const *m,
                          CUBLAS_OP_T,
                          CUBLAS_OP_N,
                          out_dim,
-                         batch_size,
+                         bc->num_active_tokens(),
                          in_dim,
                          &alpha,
                          m->offload ? m->weight_ptr : weight_ptr,
@@ -390,19 +583,19 @@ void inference_kernel(LinearMeta const *m,
   if (bias_ptr != NULL) {
     // fuse bias and relu
     if (m->activation == AC_MODE_RELU) {
-      int parallelism = out_dim * batch_size;
+      int parallelism = out_dim * bc->num_active_tokens();
       AddBiasWithReLU<<<GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, stream>>>(
           static_cast<DT *>(output_ptr),
           static_cast<DT const *>(bias_ptr),
           out_dim,
-          batch_size);
+          bc->num_active_tokens());
       return;
     }
     checkCUDA(cublasGemmEx(m->handle.blas,
                            CUBLAS_OP_T,
                            CUBLAS_OP_N,
                            out_dim,
-                           batch_size,
+                           bc->num_active_tokens(),
                            1,
                            &alpha,
                            bias_ptr,
@@ -428,7 +621,7 @@ void inference_kernel(LinearMeta const *m,
                                       m->outputTensor,
                                       output_ptr));
   } else if (m->activation == AC_MODE_GELU) {
-    size_t elements = (size_t)out_dim * (size_t)batch_size;
+    size_t elements = (size_t)out_dim * (size_t)bc->num_active_tokens();
     constexpr float B = 0.7978845608028654f;   // sqrt(2.0/M_PI)
     constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI)
     gelu_forward_kernel<<<GET_BLOCKS(elements), CUDA_NUM_THREADS, 0, stream>>>(
@@ -477,8 +670,8 @@ void peft_bwd_kernel(LinearMeta const *m,
                      int in_dim,
                      int out_dim,
                      ffStream_t stream) {
-  checkCUDA(cublasSetStream(m->handle.peft_blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.peft_dnn, stream));
+  checkCUDA(cublasSetStream(m->handle.peft_bwd_blas, stream));
+  checkCUDNN(cudnnSetStream(m->handle.peft_bwd_dnn, stream));
 
   assert(
       bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id));
@@ -527,7 +720,7 @@ void peft_bwd_kernel(LinearMeta const *m,
   }
 
   if (input_grad_ptr != NULL) {
-    checkCUDA(cublasGemmEx(m->handle.peft_blas,
+    checkCUDA(cublasGemmEx(m->handle.peft_bwd_blas,
                            CUBLAS_OP_N,
                            CUBLAS_OP_N,
                            in_dim,
diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu
index 7336a0947..ea23578d2 100644
--- a/src/ops/kernels/lora_linear_kernels.cu
+++ b/src/ops/kernels/lora_linear_kernels.cu
@@ -170,6 +170,203 @@ bool lora_applies_to_this_layer(LoraLinearMeta *m,
 
 namespace Internal {
 
+template <typename DT>
+void inference_kernel_spatial_sharing(LoraLinearMeta *m,
+                                      BatchConfig const *bc,
+                                      DT const *input_ptr,
+                                      DT *output_ptr,
+                                      int in_dim,
+                                      int out_dim,
+                                      ffStream_t main_stream) {
+  // launch finetuning fwd tokens kernel if there are any finetuning fwd tokens
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_can_start, main_stream)); 
+    checkCUDA(cudaStreamWaitEvent(m->handle.peft_fwd_stream, m->handle.peft_fwd_can_start, 0));
+
+    checkCUDA(cublasSetStream(m->handle.peft_fwd_blas, m->handle.peft_fwd_stream));
+    checkCUDNN(cudnnSetStream(m->handle.peft_fwd_dnn, m->handle.peft_fwd_stream));
+    cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]);
+    cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]);
+    cudaDataType_t lr_actv_type = output_type;
+    assert(input_type == output_type);
+    cudaDataType_t weight_type = output_type;
+    cudaDataType_t compute_type = output_type;
+
+    int i = bc->finetuning_request_index();
+    std::string peft_model_config_str =
+        std::string(bc->requestsInfo[i].peft_model_config_str);
+    LoraLinearConfig lora_config =
+        LoraLinearConfig::deserialize_from_json_string(peft_model_config_str);
+    if (lora_applies_to_this_layer(m, lora_config)) {
+      // std::cout << "Lora layer activated!" << std::endl;
+      // std::cout << "Lora Config: " << peft_model_config_str << std::endl;
+      assert(lora_config.trainable == bc->requestsInfo[i].finetuning_request &&
+            "Trainable flag mismatch");
+      int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
+      // assert(num_peft_tokens == bc->num_finetuning_fwd_tokens());
+      // int max_peft_tokens = bc->requestsInfo[i].max_length;
+      int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
+      LoraLinearWeight weight = m->peft_memory_manager->get_peft(
+          bc->requestsInfo[i].peft_model_id, lora_config);
+      void *intermediate_result_ptr = (bc->requestsInfo[i].finetuning_request)
+                                          ? weight.low_rank_activation
+                                          : m->handle.workSpace;
+      if (bc->requestsInfo[i].finetuning_request) {
+        checkCUDA(cudaMemcpyAsync(weight.input_activation,
+                                  input_ptr + first_token_offset * in_dim,
+                                  data_type_size(m->input_type[0]) *
+                                      num_peft_tokens * in_dim,
+                                  cudaMemcpyDeviceToDevice,
+                                  m->handle.peft_fwd_stream));
+      } else {
+        // use workspace to save intermediate result
+        assert(m->handle.workSpaceSize >= data_type_size(m->input_type[1]) *
+                                              num_peft_tokens * lora_config.rank);
+      }
+      DT alpha = 1.0f, beta = 0.0f;
+      // buffer = weight_first * input
+      // [rank, num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens]
+      checkCUDA(cublasGemmEx(m->handle.peft_fwd_blas,
+                            CUBLAS_OP_T,
+                            CUBLAS_OP_N,
+                            lora_config.rank,
+                            num_peft_tokens,
+                            in_dim,
+                            &alpha,
+                            weight.w0_ptr,
+                            weight_type,
+                            in_dim,
+                            input_ptr + first_token_offset * in_dim,
+                            input_type,
+                            in_dim,
+                            &beta,
+                            intermediate_result_ptr,
+                            lr_actv_type,
+                            lora_config.rank,
+                            compute_type,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+      // output = weight_second * buffer
+      // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens]
+      // Note that we use alpha in both places since we do
+      // an in-place update for LoraLinear
+      DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank);
+      checkCUDA(cublasGemmEx(m->handle.peft_fwd_blas,
+                            CUBLAS_OP_T,
+                            CUBLAS_OP_N,
+                            out_dim,
+                            num_peft_tokens,
+                            lora_config.rank,
+                            &scaling_constant,
+                            weight.w1_ptr,
+                            weight_type,
+                            lora_config.rank,
+                            intermediate_result_ptr,
+                            lr_actv_type,
+                            lora_config.rank,
+                            &alpha,
+                            output_ptr + first_token_offset * out_dim,
+                            output_type,
+                            out_dim,
+                            compute_type,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    }
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_done, m->handle.peft_fwd_stream));
+  }
+
+  // launch inference kernel if there are inference tokens
+  if (bc->num_inference_tokens() > 0) {
+    
+    checkCUDA(cublasSetStream(m->handle.blas, main_stream));
+    checkCUDNN(cudnnSetStream(m->handle.dnn, main_stream));
+    cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]);
+    cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]);
+    cudaDataType_t lr_actv_type = output_type;
+    assert(input_type == output_type);
+    cudaDataType_t weight_type = output_type;
+    cudaDataType_t compute_type = output_type;
+
+    for (int i = 0; i < bc->max_requests_per_batch(); i++) {
+      if (bc->request_completed[i] ||
+          bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID ||
+          bc->requestsInfo[i].finetuning_request) {
+        continue;
+      }
+      std::string peft_model_config_str =
+          std::string(bc->requestsInfo[i].peft_model_config_str);
+      LoraLinearConfig lora_config =
+          LoraLinearConfig::deserialize_from_json_string(peft_model_config_str);
+      if (!lora_applies_to_this_layer(m, lora_config)) {
+        continue;
+      }
+      // std::cout << "Lora layer activated!" << std::endl;
+      // std::cout << "Lora Config: " << peft_model_config_str << std::endl;
+      assert(!lora_config.trainable && "Trainable flag mismatch");
+      int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
+      // assert(num_peft_tokens == bc->num_finetuning_fwd_tokens());
+      // int max_peft_tokens = bc->requestsInfo[i].max_length;
+      int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
+      LoraLinearWeight weight = m->peft_memory_manager->get_peft(
+          bc->requestsInfo[i].peft_model_id, lora_config);
+      void *intermediate_result_ptr = m->handle.workSpace;
+      
+      // use workspace to save intermediate result
+      assert(m->handle.workSpaceSize >= data_type_size(m->input_type[1]) *
+                                            num_peft_tokens * lora_config.rank);
+      
+      DT alpha = 1.0f, beta = 0.0f;
+      // buffer = weight_first * input
+      // [rank, num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens]
+      checkCUDA(cublasGemmEx(m->handle.blas,
+                            CUBLAS_OP_T,
+                            CUBLAS_OP_N,
+                            lora_config.rank,
+                            num_peft_tokens,
+                            in_dim,
+                            &alpha,
+                            weight.w0_ptr,
+                            weight_type,
+                            in_dim,
+                            input_ptr + first_token_offset * in_dim,
+                            input_type,
+                            in_dim,
+                            &beta,
+                            intermediate_result_ptr,
+                            lr_actv_type,
+                            lora_config.rank,
+                            compute_type,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+      // output = weight_second * buffer
+      // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens]
+      // Note that we use alpha in both places since we do
+      // an in-place update for LoraLinear
+      DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank);
+      checkCUDA(cublasGemmEx(m->handle.blas,
+                            CUBLAS_OP_T,
+                            CUBLAS_OP_N,
+                            out_dim,
+                            num_peft_tokens,
+                            lora_config.rank,
+                            &scaling_constant,
+                            weight.w1_ptr,
+                            weight_type,
+                            lora_config.rank,
+                            intermediate_result_ptr,
+                            lr_actv_type,
+                            lora_config.rank,
+                            &alpha,
+                            output_ptr + first_token_offset * out_dim,
+                            output_type,
+                            out_dim,
+                            compute_type,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    }
+  }
+
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaStreamWaitEvent(main_stream, m->handle.peft_fwd_done, 0));
+  }
+}
+
 template <typename DT>
 void inference_kernel(LoraLinearMeta *m,
                       BatchConfig const *bc,
@@ -313,8 +510,8 @@ void peft_bwd_kernel(Context ctx,
                      int in_dim,
                      int out_dim,
                      ffStream_t stream) {
-  checkCUDA(cublasSetStream(m->handle.peft_blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.peft_dnn, stream));
+  checkCUDA(cublasSetStream(m->handle.peft_bwd_blas, stream));
+  checkCUDNN(cudnnSetStream(m->handle.peft_bwd_dnn, stream));
   cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]);
   cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]);
   assert(input_type == output_type);
@@ -363,7 +560,7 @@ void peft_bwd_kernel(Context ctx,
           weight.low_rank_activation, {lora_config.rank, num_peft_tokens});
       torch::save(tensor, filename);
     }
-    checkCUDA(cublasGemmEx(m->handle.peft_blas,
+    checkCUDA(cublasGemmEx(m->handle.peft_bwd_blas,
                            CUBLAS_OP_N,
                            CUBLAS_OP_T,
                            lora_config.rank,
@@ -388,7 +585,7 @@ void peft_bwd_kernel(Context ctx,
   // low_rank_activation
   {
     DT alpha = 1.0f, beta = 0.0f;
-    checkCUDA(cublasGemmEx(m->handle.peft_blas,
+    checkCUDA(cublasGemmEx(m->handle.peft_bwd_blas,
                            CUBLAS_OP_N,
                            CUBLAS_OP_N,
                            lora_config.rank,
@@ -415,7 +612,7 @@ void peft_bwd_kernel(Context ctx,
     DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero)
                   ? 0.0f
                   : 1.0f;
-    checkCUDA(cublasGemmEx(m->handle.peft_blas,
+    checkCUDA(cublasGemmEx(m->handle.peft_bwd_blas,
                            CUBLAS_OP_N,
                            CUBLAS_OP_T,
                            in_dim,
@@ -440,7 +637,7 @@ void peft_bwd_kernel(Context ctx,
   if (input_grad_ptr != nullptr) {
     DT alpha = 1.0f;
     DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f;
-    checkCUDA(cublasGemmEx(m->handle.peft_blas,
+    checkCUDA(cublasGemmEx(m->handle.peft_bwd_blas,
                            CUBLAS_OP_N,
                            CUBLAS_OP_N,
                            in_dim,
diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 3550e49b5..2492fb6de 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -149,6 +149,55 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t data_dim,
   }
 }
 
+template <typename T>
+void inference_kernel_spatial_sharing(ResidualRMSNormMeta const *m,
+                                      BatchConfig const *bc,
+                                      T const *input1_ptr,
+                                      T const *input2_ptr,
+                                      T const *weight_ptr,
+                                      T *residual_output_ptr,
+                                      T *output_ptr,
+                                      cudaStream_t main_stream) {
+  // launch finetuning fwd tokens kernel if there are any finetuning fwd tokens
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_can_start, main_stream)); 
+    checkCUDA(cudaStreamWaitEvent(m->handle.peft_fwd_stream, m->handle.peft_fwd_can_start, 0));
+
+    ResidualRMSNormFusedForwardKernel<T>
+      <<<bc->num_finetuning_fwd_tokens(), std::min(CUDA_NUM_THREADS, m->in_dim), 0, m->handle.peft_fwd_stream>>>(
+          m->in_dim,
+          m->eps,
+          input1_ptr + m->in_dim * bc->num_inference_tokens(),
+          input2_ptr + m->in_dim * bc->num_inference_tokens(),
+          residual_output_ptr + m->in_dim * bc->num_inference_tokens(),
+          static_cast<float *>(m->rms_ptr) + bc->requestsInfo[bc->finetuning_request_index()].first_token_depth_in_request,
+          0 /*first_ft_token_idx*/, 
+          weight_ptr,
+          output_ptr + m->in_dim * bc->num_inference_tokens());
+
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_done, m->handle.peft_fwd_stream));
+  }
+
+  // launch inference kernel if there are inference tokens
+  if (bc->num_inference_tokens() > 0) {
+    ResidualRMSNormFusedForwardKernel<T>
+      <<<bc->num_inference_tokens(), std::min(CUDA_NUM_THREADS, m->in_dim), 0, main_stream>>>(
+          m->in_dim,
+          m->eps,
+          input1_ptr,
+          input2_ptr,
+          residual_output_ptr,
+          nullptr /*rms_ptr*/,
+          bc->num_inference_tokens() /*first_ft_token_idx*/, 
+          weight_ptr,
+          output_ptr);
+  }
+
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaStreamWaitEvent(main_stream, m->handle.peft_fwd_done, 0));
+  }
+}
+
 template <typename T>
 void inference_kernel(ResidualRMSNormMeta const *m,
                       BatchConfig const *bc,
@@ -158,7 +207,10 @@ void inference_kernel(ResidualRMSNormMeta const *m,
                       T *residual_output_ptr,
                       T *output_ptr,
                       cudaStream_t stream) {
-
+  if (m->peft_support_mode == SPATIAL_SHARING) {
+    inference_kernel_spatial_sharing(m, bc, input1_ptr, input2_ptr, weight_ptr, residual_output_ptr, output_ptr, stream);
+    return;
+  }
   int num_tokens = bc->num_active_tokens();
   int data_dim = m->in_dim;
   if (num_tokens <= 0) {
diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu
index b3b8bcbb2..b50de4bb1 100644
--- a/src/ops/kernels/rms_norm_kernels.cu
+++ b/src/ops/kernels/rms_norm_kernels.cu
@@ -141,6 +141,49 @@ __global__ void RMSNormFusedForwardKernel(int64_t data_dim,
   }
 }
 
+template <typename T>
+void inference_kernel_spatial_sharing(RMSNormMeta const *m,
+                                      BatchConfig const *bc,
+                                      T const *input_ptr,
+                                      T const *weight_ptr,
+                                      T *output_ptr,
+                                      cudaStream_t main_stream) {
+  // launch finetuning fwd tokens kernel if there are any finetuning fwd tokens
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_can_start, main_stream)); 
+    checkCUDA(cudaStreamWaitEvent(m->handle.peft_fwd_stream, m->handle.peft_fwd_can_start, 0));
+
+    RMSNormFusedForwardKernel<T>
+        <<<bc->num_finetuning_fwd_tokens(), std::min(CUDA_NUM_THREADS, m->in_dim), 0, m->handle.peft_fwd_stream>>>(
+            m->in_dim,
+            m->eps,
+            input_ptr + m->in_dim * bc->num_inference_tokens(),
+            static_cast<float *>(m->rms_ptr) + bc->requestsInfo[bc->finetuning_request_index()].first_token_depth_in_request,
+            0 /*first_ft_token_idx*/, 
+            weight_ptr,
+            output_ptr + m->in_dim * bc->num_inference_tokens());
+
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_done, m->handle.peft_fwd_stream));
+  }
+
+  // launch inference kernel if there are inference tokens
+  if (bc->num_inference_tokens() > 0) {
+    RMSNormFusedForwardKernel<T>
+        <<<bc->num_inference_tokens(), std::min(CUDA_NUM_THREADS, m->in_dim), 0, main_stream>>>(
+            m->in_dim,
+            m->eps,
+            input_ptr,
+            nullptr /*rms_ptr*/,
+            bc->num_inference_tokens() /*first_ft_token_idx*/, 
+            weight_ptr,
+            output_ptr);
+  }
+
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaStreamWaitEvent(main_stream, m->handle.peft_fwd_done, 0));
+  }
+}
+
 template <typename T>
 void inference_kernel(RMSNormMeta const *m,
                       BatchConfig const *bc,
@@ -148,7 +191,10 @@ void inference_kernel(RMSNormMeta const *m,
                       T const *weight_ptr,
                       T *output_ptr,
                       cudaStream_t stream) {
-
+  if (m->peft_support_mode == SPATIAL_SHARING) {
+    inference_kernel_spatial_sharing(m, bc, input_ptr, weight_ptr, output_ptr, stream);
+    return;
+  }
   int num_tokens = bc->num_active_tokens();
   int data_dim = m->in_dim;
   if (num_tokens <= 0) {
diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu
index 436b85b05..95d0e6340 100644
--- a/src/ops/kernels/softmax.cu
+++ b/src/ops/kernels/softmax.cu
@@ -33,6 +33,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler,
   checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
   checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax(
       outputTensor, input_domain, softmax->data_type));
+  checkCUDNN(cudnnCreateTensorDescriptor(&outputTensorPeftFwd));
+  checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax(
+      outputTensorPeftFwd, input_domain, softmax->data_type));
   dim = softmax->dim;
   profiling = softmax->profiling;
   inference_debugging = softmax->inference_debugging;
@@ -255,6 +258,70 @@ void backward_kernel(SoftmaxMeta const *m,
                             stream));
 }
 
+template <typename DT>
+void inference_kernel_spatial_sharing(SoftmaxMeta const *m,
+                                    BatchConfig const *bc,
+                                    DT const *input_ptr,
+                                    DT *output_ptr,
+                                    int num_classes,
+                                    cudaStream_t main_stream) {
+  // launch finetuning fwd tokens kernel if there are any finetuning fwd tokens
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_can_start, main_stream)); 
+    checkCUDA(cudaStreamWaitEvent(m->handle.peft_fwd_stream, m->handle.peft_fwd_can_start, 0));
+
+    checkCUDNN(cudnnSetStream(m->handle.peft_fwd_dnn, m->handle.peft_fwd_stream));
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
+    checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensorPeftFwd,
+                                          CUDNN_TENSOR_NCHW,
+                                          cudnn_data_type,
+                                          bc->num_finetuning_fwd_tokens(),
+                                          num_classes,
+                                          1,
+                                          1));
+    checkCUDNN(cudnnSoftmaxForward(m->handle.peft_fwd_dnn,
+                                  CUDNN_SOFTMAX_ACCURATE,
+                                  CUDNN_SOFTMAX_MODE_CHANNEL,
+                                  &alpha,
+                                  m->outputTensorPeftFwd,
+                                  input_ptr + num_classes * bc->num_inference_tokens(),
+                                  &beta,
+                                  m->outputTensorPeftFwd,
+                                  output_ptr + num_classes * bc->num_inference_tokens()));
+
+    checkCUDA(cudaEventRecord(m->handle.peft_fwd_done, m->handle.peft_fwd_stream));
+  }
+
+  // launch inference kernel if there are inference tokens
+  if (bc->num_inference_tokens() > 0) {
+
+    checkCUDNN(cudnnSetStream(m->handle.dnn, main_stream));
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
+    checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor,
+                                          CUDNN_TENSOR_NCHW,
+                                          cudnn_data_type,
+                                          bc->num_inference_tokens(),
+                                          num_classes,
+                                          1,
+                                          1));
+    checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
+                                  CUDNN_SOFTMAX_ACCURATE,
+                                  CUDNN_SOFTMAX_MODE_CHANNEL,
+                                  &alpha,
+                                  m->outputTensor,
+                                  input_ptr,
+                                  &beta,
+                                  m->outputTensor,
+                                  output_ptr));
+  }
+
+  if (bc->num_finetuning_fwd_tokens() > 0) {
+    checkCUDA(cudaStreamWaitEvent(main_stream, m->handle.peft_fwd_done, 0));
+  }
+}
+
 template <typename DT>
 void inference_kernel(SoftmaxMeta const *m,
                       BatchConfig const *bc,
@@ -262,8 +329,11 @@ void inference_kernel(SoftmaxMeta const *m,
                       DT *output_ptr,
                       int num_classes,
                       cudaStream_t stream) {
+  if (m->peft_support_mode == SPATIAL_SHARING) {
+    inference_kernel_spatial_sharing(m, bc, input_ptr, output_ptr, num_classes, stream);
+    return;
+  }
   checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-
   float alpha = 1.0f, beta = 0.0f;
   cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor,
diff --git a/src/ops/linear.cc b/src/ops/linear.cc
index 7d596ac55..c1a87c06b 100644
--- a/src/ops/linear.cc
+++ b/src/ops/linear.cc
@@ -623,7 +623,7 @@ void Linear::inference_task(Task const *task,
   assert((weight.domain.hi()[1] - weight.domain.lo()[1] + 1) == out_dim);
   assert(weight.domain.get_volume() == in_dim * out_dim);
 
-  int batch_size = bc->num_active_tokens();
+  // int batch_size = bc->num_active_tokens();
   GenericTensorAccessorR bias;
   if (m->use_bias &&
       !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) {
@@ -642,8 +642,7 @@ void Linear::inference_task(Task const *task,
                            weight.ptr,
                            bias.ptr,
                            in_dim,
-                           out_dim,
-                           batch_size);
+                           out_dim);
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/runtime/model.cu b/src/runtime/model.cu
index 44b3c577e..6082b00e2 100644
--- a/src/runtime/model.cu
+++ b/src/runtime/model.cu
@@ -93,6 +93,8 @@ FFHandler
   handle.allowTensorOpMathConversion = info->allowTensorOpMathConversion;
 
   checkCUDA(cudaStreamCreate(&handle.peft_fwd_stream));
+  checkCUDA(cudaEventCreate(&handle.peft_fwd_can_start));
+  checkCUDA(cudaEventCreate(&handle.peft_fwd_done));
 
   // flashinfer
   handle.incr_attention_metadata = new AttentionMetaData();
@@ -105,12 +107,18 @@ FFHandler
     checkCUDA(cublasSetMathMode(handle.blas, CUBLAS_TENSOR_OP_MATH));
   }
   checkCUDNN(cudnnCreate(&handle.dnn));
-  // cublas/dnn handles for peft stream
-  checkCUDA(cublasCreate(&handle.peft_blas));
+  // cublas/dnn handles for peft bwd stream
+  checkCUDA(cublasCreate(&handle.peft_bwd_blas));
   if (handle.allowTensorOpMathConversion) {
-    checkCUDA(cublasSetMathMode(handle.peft_blas, CUBLAS_TENSOR_OP_MATH));
+    checkCUDA(cublasSetMathMode(handle.peft_bwd_blas, CUBLAS_TENSOR_OP_MATH));
   }
-  checkCUDNN(cudnnCreate(&handle.peft_dnn));
+  checkCUDNN(cudnnCreate(&handle.peft_bwd_dnn));
+  // cublas/dnn handles for peft fwd stream (should only be used for SPATIAL SHARING)
+  checkCUDA(cublasCreate(&handle.peft_fwd_blas));
+  if (handle.allowTensorOpMathConversion) {
+    checkCUDA(cublasSetMathMode(handle.peft_fwd_blas, CUBLAS_TENSOR_OP_MATH));
+  }
+  checkCUDNN(cudnnCreate(&handle.peft_fwd_dnn));
 
   // #ifdef FF_USE_NCCL
   //   checkNCCL(ncclCommInitRank(&handle.nccl, info->allRanks, info->ncclId,

From d16f2663cf25e74469ed2266a6765598f5d61db1 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Mon, 14 Apr 2025 13:38:01 -0700
Subject: [PATCH 08/17] finish implementing baselines

---
 include/flexflow/request_manager.h |  12 +++
 src/runtime/request_manager.cc     | 155 +++++++++++++++++++++++++----
 2 files changed, 145 insertions(+), 22 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 8ffe35858..596916377 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -168,6 +168,11 @@ class RequestManager {
     SERVING = 1002,
     TERMINATED = 1003,
   };
+  enum PeftTemporalSharingState {
+    INFERENCE = 0,
+    FINETUNING_FWD = 1,
+    FINETUNING_BWD = 2,
+  };
   using TokenId = BatchConfig::TokenId;
 
   RequestManager();
@@ -191,6 +196,7 @@ class RequestManager {
 
   void push_spec_infer_tree_width(int tree_width);
   void set_peft_support_mode(PeftSupportMode peft_support_mode_);
+  void update_peft_temporal_sharing_state(void);
   void set_inference_finished(bool finished = true);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
@@ -289,6 +295,9 @@ class RequestManager {
   BatchConfig prepare_next_bwd_batch(BatchConfig &new_bc);
   BatchConfig prepare_next_fwd_batch(BatchConfig const &old_bc,
                                      InferenceResult const &result);
+  void add_inference_work_if_needed(BatchConfig &new_bc,
+                                    BatchConfig const &old_bc);
+  void check_new_bc(BatchConfig const &new_bc);
   BatchConfigFuture
       prepare_next_batch(BatchConfigFuture const &old_bc,
                          InferenceResultFuture const &result,
@@ -408,6 +417,9 @@ class RequestManager {
   int max_concurrent_adapters = 0;
   // peft benchmarking
   PeftSupportMode peft_support_mode = PEFT_DISABLED;
+  PeftTemporalSharingState peft_temporal_sharing_state =
+      PeftTemporalSharingState::INFERENCE;
+  BatchConfig ts_saved_old_batch;
   bool inference_finished = false;
   int num_transformer_layers = 0;
   int num_layers_per_finetuning_step = 0;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a77559aa2..052f1ed1d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -345,6 +345,11 @@ void RequestManager::set_max_fwd_finetuning_tokens_per_batch(
 int RequestManager::get_max_fwd_finetuning_tokens_per_batch() {
   // assert(max_fwd_finetuning_tokens_per_batch > 0 &&
   // max_fwd_finetuning_tokens_per_batch <= max_tokens_per_batch);
+  if (peft_support_mode == SPATIAL_SHARING || peft_support_mode == TEMPORAL_SHARING) {
+    assert(max_fwd_finetuning_tokens_per_batch == BatchConfig::MAX_NUM_TOKENS);
+  } else {
+    assert(max_fwd_finetuning_tokens_per_batch < BatchConfig::MAX_NUM_TOKENS);
+  }
   return max_fwd_finetuning_tokens_per_batch;
 }
 
@@ -754,6 +759,51 @@ BatchConfigFuture RequestManager::prepare_next_batch(
   return runtime->execute_task(ctx, launcher);
 }
 
+void RequestManager::check_new_bc(BatchConfig const &new_bc) {
+  assert(new_bc.num_inference_tokens() <= max_tokens_per_batch && max_tokens_per_batch <= BatchConfig::MAX_NUM_TOKENS);
+  assert(new_bc.num_active_tokens() <= BatchConfig::MAX_NUM_TOKENS);
+  assert(new_bc.num_finetuning_fwd_tokens() <= max_fwd_finetuning_tokens_per_batch);
+  if (new_bc.num_finetuning_fwd_tokens() > 0) {
+    assert(new_bc.num_finetuning_bwd_tokens() == 0);
+  }
+
+  switch(peft_support_mode) {
+    case SPATIAL_SHARING: {
+      break;
+    } 
+    case TEMPORAL_SHARING: {
+      if (peft_temporal_sharing_state == INFERENCE) {
+        assert(new_bc.num_finetuning_fwd_tokens() == 0);
+        assert(new_bc.num_finetuning_bwd_tokens() == 0);
+      } else if (peft_temporal_sharing_state == FINETUNING_FWD) {
+        assert(new_bc.num_inference_tokens() == 0);
+        assert(new_bc.num_finetuning_bwd_tokens() == 0);
+      } else if (peft_temporal_sharing_state == FINETUNING_BWD) {
+        assert(new_bc.num_inference_tokens() == 0);
+        assert(new_bc.num_finetuning_fwd_tokens() == 0);
+      }
+      break;
+    } 
+    case COSERVING: {
+      assert(new_bc.num_active_tokens() <= max_tokens_per_batch);
+      break;
+    } 
+    case PEFT_INFERENCE_ONLY: {
+      assert(new_bc.num_finetuning_bwd_tokens() == 0);
+      assert(new_bc.num_active_tokens() <= max_tokens_per_batch);
+      break;
+    } 
+    case PEFT_DISABLED: {
+      assert(new_bc.num_active_tokens() <= max_tokens_per_batch);
+      assert(new_bc.num_finetuning_fwd_tokens() == 0 && new_bc.num_finetuning_bwd_tokens() == 0);
+      break;
+    }
+    default: {
+      assert(false && "Invalid PEFT support mode");
+    }
+  }
+}
+
 BatchConfig RequestManager::prepare_next_batch_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
@@ -772,9 +822,23 @@ BatchConfig RequestManager::prepare_next_batch_task(
   rm->process_work_from_old_batch(*old_bc, result);
   BatchConfig new_bc = rm->prepare_next_fwd_batch(*old_bc, result);
   new_bc = rm->prepare_next_bwd_batch(new_bc);
+  rm->check_new_bc(new_bc);
   return new_bc;
 }
 
+void RequestManager::update_peft_temporal_sharing_state(void) {
+  assert(peft_support_mode == TEMPORAL_SHARING);
+  if (peft_temporal_sharing_state == INFERENCE) {
+    peft_temporal_sharing_state = FINETUNING_FWD;
+  } else if (peft_temporal_sharing_state == FINETUNING_FWD) {
+    peft_temporal_sharing_state = FINETUNING_BWD;
+  } else if (peft_temporal_sharing_state == FINETUNING_BWD) {  
+    peft_temporal_sharing_state = INFERENCE;
+  } else {
+    assert(false && "Invalid temporal sharing state");
+  }
+}
+
 bool RequestManager::is_eos_token(int token_id) {
   for (int eos_token : eos_token_ids) {
     if (token_id == eos_token) {
@@ -1333,8 +1397,10 @@ void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) {
   assert(peft_finetuning_enabled(peft_support_mode) && "PEFT finetuning is not enabled");
   assert(!pending_peft_request_queue.empty() &&
          "Trying to add a new finetuning request when there are none");
-  assert(new_bc.num_tokens < get_max_tokens_per_batch() &&
-         "Trying to add a new finetuning request when the batch is full");
+  if (peft_support_mode != TEMPORAL_SHARING && peft_support_mode != SPATIAL_SHARING) {
+    assert(new_bc.num_tokens < get_max_tokens_per_batch() &&
+          "Trying to add a new finetuning request when the batch is full");
+  }
   int inference_batch_size =
       BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   assert(new_bc.request_completed[inference_batch_size] &&
@@ -1356,6 +1422,12 @@ void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) {
   int batch_capacity_left =
       std::min(get_max_fwd_finetuning_tokens_per_batch(),
                get_max_tokens_per_batch() - new_bc.num_active_tokens());
+  if (peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING) {
+    assert(get_max_fwd_finetuning_tokens_per_batch() == BatchConfig::MAX_NUM_TOKENS);
+    batch_capacity_left =
+      std::min(get_max_fwd_finetuning_tokens_per_batch(),
+               BatchConfig::MAX_NUM_TOKENS - new_bc.num_active_tokens());
+  }
   int num_peft_tokens =
       std::min(num_tokens_left_in_dataset_entry, batch_capacity_left);
   assert(num_peft_tokens > 0 && "No tokens left to add to the batch");
@@ -1404,8 +1476,10 @@ void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) {
   assert(peft_finetuning_enabled(peft_support_mode) && "PEFT finetuning is not enabled");
   assert(!pending_peft_request_queue.empty() &&
          "Trying to add a new finetuning request when there are none");
-  assert(new_bc.num_tokens <= get_max_tokens_per_batch() &&
-         "Trying to add a new finetuning request when the batch is full");
+  if (peft_support_mode != TEMPORAL_SHARING && peft_support_mode != SPATIAL_SHARING) {
+    assert(new_bc.num_tokens <= get_max_tokens_per_batch() &&
+          "Trying to add a new finetuning request when the batch is full");
+  }
   int inference_batch_size =
       BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   assert(new_bc.request_completed[inference_batch_size] &&
@@ -1443,7 +1517,12 @@ void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) {
   new_bc.requestsInfo[inference_batch_size].finetuning_request = true;
   new_bc.requestsInfo[inference_batch_size].finetuning_backward_phase = true;
 
-  if (get_num_layers_per_finetuning_step() == 0) {
+  int num_layers_per_finetuning_step = get_num_layers_per_finetuning_step();
+  if (peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING) {
+    num_layers_per_finetuning_step = get_num_transformer_layers();
+  }
+
+  if (num_layers_per_finetuning_step == 0) {
     new_bc.requestsInfo[inference_batch_size].peft_bwd_last_layer =
         new_bc.requestsInfo[inference_batch_size].peft_bwd_first_layer =
             INT_MAX;
@@ -1455,7 +1534,7 @@ void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) {
     new_bc.requestsInfo[inference_batch_size].peft_bwd_first_layer =
         std::max(0,
                  new_bc.requestsInfo[inference_batch_size].peft_bwd_last_layer -
-                     get_num_layers_per_finetuning_step() + 1); // inclusive
+                     num_layers_per_finetuning_step + 1); // inclusive
   }
 
   // if (new_bc.requestsInfo[inference_batch_size].peft_bwd_first_layer < 0) {
@@ -1692,6 +1771,9 @@ void RequestManager::process_work_from_old_batch(
   if (peft_finetuning_enabled(peft_support_mode)) {
     process_finetuning_req_fwd_progress(old_bc, result);
     process_finetuning_req_bwd_progress(old_bc);
+    if (peft_support_mode == TEMPORAL_SHARING) {
+      update_peft_temporal_sharing_state();
+    }
   }
 }
 
@@ -1711,20 +1793,9 @@ BatchConfig RequestManager::prepare_next_bwd_batch(BatchConfig &new_bc) {
   return new_bc;
 }
 
-BatchConfig
-    RequestManager::prepare_next_fwd_batch(BatchConfig const &old_bc,
-                                           InferenceResult const &result) {
-  // printf("\nEntering prepare_next_fwd_batch\n");
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
-
-  if (verbose) {
-    std::cout << "\n############### prepare_next_fwd_batch ###############\n";
-    std::cout << "old_bc: " << old_bc << std::endl;
-    std::cout << "result: " << result << std::endl;
-  }
-
-  // Step 1: Create new batch config
-  BatchConfig new_bc;
+void RequestManager::add_inference_work_if_needed(BatchConfig &new_bc,
+                                                  BatchConfig const &old_bc) {
+    
   // params
   int num_active_req = -1;
   // when finetuning is enabled, the last entry in the batch cannot be used for
@@ -1769,11 +1840,51 @@ BatchConfig
       }
     }
   }
+}
+
+BatchConfig
+    RequestManager::prepare_next_fwd_batch(BatchConfig const &old_bc,
+                                           InferenceResult const &result) {
+  // printf("\nEntering prepare_next_fwd_batch\n");
+  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+
+  if (verbose) {
+    std::cout << "\n############### prepare_next_fwd_batch ###############\n";
+    std::cout << "old_bc: " << old_bc << std::endl;
+    std::cout << "result: " << result << std::endl;
+  }
+
+  // Step 1: Create new batch config
+  BatchConfig new_bc;
+
+  if (peft_support_mode != TEMPORAL_SHARING) {
+    add_inference_work_if_needed(new_bc, old_bc);
+  } else {
+    // old_bc is only allowed to have inference tokens if we just finished a INFERENCE phase
+    if (old_bc.num_inference_tokens() > 0) {
+      assert(peft_temporal_sharing_state == FINETUNING_FWD &&
+             "Old batch should not have inference tokens");
+    }
+    if (peft_temporal_sharing_state == INFERENCE) {
+      add_inference_work_if_needed(new_bc, ts_saved_old_batch);
+    } else if (peft_temporal_sharing_state == FINETUNING_FWD) {
+      // if we just finished a finetuning fwd phase, we need to save the old batch for later
+      ts_saved_old_batch = old_bc;
+    }
+  }
 
   // Step 4: add finetuning fwd tokens, if there is additional space
+  int slots_available_for_peft_fwd = 0;
+  if (peft_support_mode == COSERVING) {
+    slots_available_for_peft_fwd = get_max_tokens_per_batch() - new_bc.num_tokens;
+  } else if (peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING) {
+    slots_available_for_peft_fwd = BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens;
+  }
+  assert(slots_available_for_peft_fwd >= 0);
   if (finetuning_fwd_work_available() &&
-      new_bc.num_tokens < get_max_tokens_per_batch() &&
-      get_max_fwd_finetuning_tokens_per_batch() > 0) {
+      get_max_fwd_finetuning_tokens_per_batch() > 0 &&
+      (peft_support_mode != TEMPORAL_SHARING || peft_temporal_sharing_state == FINETUNING_FWD) &&
+      slots_available_for_peft_fwd > 0) {
     add_finetuning_req_fwd_batch(new_bc);
   }
 

From 801cb67039a76893f3b2ce6f07fbe4d618328531 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Mon, 14 Apr 2025 20:05:46 -0700
Subject: [PATCH 09/17] add overhead test

---
 inference/flexllm/CMakeLists.txt   |  73 +++++
 inference/flexllm/overhead_test.cc | 479 +++++++++++++++++++++++++++++
 tests/peft_test.sh                 |   8 +-
 3 files changed, 556 insertions(+), 4 deletions(-)
 create mode 100644 inference/flexllm/overhead_test.cc

diff --git a/inference/flexllm/CMakeLists.txt b/inference/flexllm/CMakeLists.txt
index e31169bbf..85b68fcd5 100644
--- a/inference/flexllm/CMakeLists.txt
+++ b/inference/flexllm/CMakeLists.txt
@@ -71,3 +71,76 @@ set(BIN_DEST "bin")
 install(TARGETS ${project_target1} DESTINATION ${BIN_DEST})
 install(PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/peft_train" DESTINATION ${BIN_DEST})
 install(PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/gdb_peft_train" DESTINATION ${BIN_DEST})
+
+
+
+
+# Overhead test
+set(project_target2 overhead_test_unwrapped)
+set(CPU_SRC2
+  ${FLEXFLOW_CPP_DRV_SRC}
+  overhead_test.cc
+  ../models/llama.cc
+  ../models/opt.cc
+  ../models/falcon.cc
+  ../models/starcoder.cc
+  ../models/mpt.cc)
+
+if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
+  cuda_add_executable(${project_target2} ${CPU_SRC2})
+  if (FF_GPU_BACKEND STREQUAL "hip_cuda")
+    target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_NVIDIA__)
+  endif()
+elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
+  set_source_files_properties(${CPU_SRC2} PROPERTIES LANGUAGE HIP)
+  hip_add_executable(${project_target2} ${CPU_SRC2})
+  if (FF_HIP_ARCH STREQUAL "")
+    message(FATAL_ERROR "FF_HIP_ARCH is empty!")
+  endif()
+  set_property(TARGET ${project_target2} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}")
+  target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_AMD__)
+else()
+  message(FATAL_ERROR "Compilation of ${project_target2} for ${FF_GPU_BACKEND} backend not yet supported")
+endif()
+
+target_include_directories(${project_target2} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR})
+target_include_directories(${project_target2} PRIVATE ${CMAKE_SOURCE_DIR}/inference)
+target_link_libraries(${project_target2} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES})
+
+
+set(TARGET_PATH "${project_target2}")
+
+# Configure the normal execution wrapper.
+# Here, LAUNCHER is simply "exec" so that it runs the executable normally.
+set(LAUNCHER "exec")
+configure_file(
+  "${CMAKE_CURRENT_SOURCE_DIR}/../inference_wrapper.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/overhead_test"
+  @ONLY
+)
+
+file(CHMOD "${CMAKE_CURRENT_BINARY_DIR}/overhead_test"
+     PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE
+     GROUP_READ GROUP_EXECUTE
+     WORLD_READ WORLD_EXECUTE
+)
+
+# Configure the debugging launcher wrapper.
+# Here, LAUNCHER is set to "gdb --args" so that it runs under gdb.
+set(LAUNCHER "gdb -ex run --args")
+configure_file(
+  "${CMAKE_CURRENT_SOURCE_DIR}/../inference_wrapper.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/gdb_overhead_test"
+  @ONLY
+)
+file(CHMOD "${CMAKE_CURRENT_BINARY_DIR}/gdb_overhead_test"
+     PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE
+     GROUP_READ GROUP_EXECUTE
+     WORLD_READ WORLD_EXECUTE
+)
+
+
+set(BIN_DEST "bin")
+install(TARGETS ${project_target2} DESTINATION ${BIN_DEST})
+install(PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/overhead_test" DESTINATION ${BIN_DEST})
+install(PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/gdb_overhead_test" DESTINATION ${BIN_DEST})
diff --git a/inference/flexllm/overhead_test.cc b/inference/flexllm/overhead_test.cc
new file mode 100644
index 000000000..38fc38f25
--- /dev/null
+++ b/inference/flexllm/overhead_test.cc
@@ -0,0 +1,479 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/inference.h"
+#include "flexflow/request_manager.h"
+#include "models/falcon.h"
+#include "models/llama.h"
+#include "models/mpt.h"
+#include "models/opt.h"
+#include "models/starcoder.h"
+#include <wordexp.h>
+
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+#include <nlohmann/json.hpp>
+
+using namespace FlexFlow;
+using namespace Legion;
+using json = nlohmann::json;
+
+Legion::Logger log_app("llama");
+
+struct FilePaths {
+  std::string cache_folder_path;
+  std::string output_file_path;
+  std::string profiling_folder_path;
+};
+
+void parse_input_args(char **argv,
+                      int argc,
+                      FilePaths &paths,
+                      std::string &llm_model_name,
+                      std::string &peft_model_name,
+                      bool &use_full_precision,
+                      bool &verbose,
+                      int &max_requests_per_batch,
+                      int &max_tokens_per_batch,
+                      int &max_sequence_length,
+                      int &num_kv_cache_slots,
+                      std::vector<int> &num_layers_per_finetuning_step,
+                      std::vector<int> &max_fwd_finetuning_tokens) {
+  for (int i = 1; i < argc; i++) {
+    // llm model type
+    if (!strcmp(argv[i], "-llm-model")) {
+      llm_model_name = std::string(argv[++i]);
+      for (char &c : llm_model_name) {
+        c = std::tolower(c);
+      }
+      continue;
+    }
+    if (!strcmp(argv[i], "-peft-model")) {
+      peft_model_name = std::string(argv[++i]);
+      for (char &c : peft_model_name) {
+        c = std::tolower(c);
+      }
+      continue;
+    }
+    // cache folder
+    if (!strcmp(argv[i], "-cache-folder")) {
+      paths.cache_folder_path = std::string(argv[++i]);
+      continue;
+    }
+
+    // output file
+    if (!strcmp(argv[i], "-output-file")) {
+      paths.output_file_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "-profiling-folder")) {
+      paths.profiling_folder_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--use-full-precision")) {
+      use_full_precision = true;
+      continue;
+    }
+    // verbose logging to stdout
+    if (!strcmp(argv[i], "--verbose")) {
+      verbose = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-requests-per-batch")) {
+      max_requests_per_batch = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-tokens-per-batch")) {
+      max_tokens_per_batch = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-sequence-length")) {
+      max_sequence_length = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--num-kv-cache-slots")) {
+      num_kv_cache_slots = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--num-layers-per-finetuning-step")) {
+      std::string layers_str = std::string(argv[++i]);
+      std::stringstream ss(layers_str);
+      std::string item;
+      while (std::getline(ss, item, ',')) {
+        num_layers_per_finetuning_step.push_back(std::stoi(item));
+      }
+      // std::cout << "ARG num_layers_per_finetuning_step: ";
+      // for (int num_layers : num_layers_per_finetuning_step) {
+      //   std::cout << num_layers << " ";
+      // }
+      // std::cout << std::endl;
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-fwd-finetuning-tokens")) {
+      std::string tokens_str = std::string(argv[++i]);
+      std::stringstream ss(tokens_str);
+      std::string item;
+      while (std::getline(ss, item, ',')) {
+        max_fwd_finetuning_tokens.push_back(std::stoi(item));
+      }
+      // std::cout << "ARG max_fwd_finetuning_tokens: ";
+      // for (int num_tokens : max_fwd_finetuning_tokens) {
+      //   std::cout << num_tokens << " ";
+      // }
+      // std::cout << std::endl;
+      continue;
+    }
+  }
+  if (paths.cache_folder_path.empty()) {
+    char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
+    paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path)
+                                            : std::string("~/.cache/flexflow");
+  }
+  // Expand ~ to the home directory if needed
+  wordexp_t p;
+  wordexp(paths.cache_folder_path.c_str(), &p, 0);
+  paths.cache_folder_path = p.we_wordv[0];
+  wordfree(&p);
+}
+
+std::vector<Request> make_warmup_requests(int num_inf_request,
+                                          int num_finetuning_steps,
+                                          PEFTModelID *peft_model_id) {
+  std::vector<Request> warmup_requests;
+
+  for (int i = 0; i < num_inf_request; i++) {
+    Request inference_req;
+    inference_req.benchmarking_tokens = 512;
+    inference_req.max_new_tokens = 30;
+    inference_req.warmup = true;
+    warmup_requests.push_back(inference_req);
+  }
+  Request finetuning_req;
+  finetuning_req.req_type = RequestType::REQ_FINETUNING;
+  finetuning_req.benchmarking_tokens = 4096;
+  finetuning_req.add_special_tokens = false;
+  finetuning_req.max_length = 4096;
+  finetuning_req.warmup = true;
+  finetuning_req.peft_model_id =
+      (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
+  finetuning_req.peft_finetuning_info.max_training_epochs = num_finetuning_steps;
+  warmup_requests.push_back(finetuning_req);
+  return warmup_requests;
+}
+
+std::vector<Request> make_requests(int max_requests_per_batch,
+                                   int finetuning_entry_size,
+                                   int max_fwd_tokens_per_batch,
+                                   int tot_llm_layers,
+                                   int bwd_layers_per_step,
+                                   PEFTModelID *peft_model_id) {
+  if (bwd_layers_per_step == 0) {
+    bwd_layers_per_step = tot_llm_layers;
+  }
+  std::vector<Request> requests;
+  int target_num_steps = 10;
+  if (max_fwd_tokens_per_batch > 0) {
+    target_num_steps +=
+        10*((finetuning_entry_size + max_fwd_tokens_per_batch - 1) /
+            max_fwd_tokens_per_batch +
+        (tot_llm_layers + bwd_layers_per_step - 1) / bwd_layers_per_step);
+  }
+  for (int i = 0; i < max_requests_per_batch; i++) {
+    Request inference_req;
+    inference_req.benchmarking_tokens = 1;
+    inference_req.add_special_tokens = false;
+    inference_req.max_new_tokens = target_num_steps;
+    inference_req.warmup = false;
+    inference_req.ignore_eos = true;
+    requests.push_back(inference_req);
+  }
+  if (max_fwd_tokens_per_batch > 0) {
+    Request finetuning_req;
+    finetuning_req.req_type = RequestType::REQ_FINETUNING;
+    finetuning_req.add_special_tokens = false;
+    finetuning_req.benchmarking_tokens = finetuning_entry_size;
+    finetuning_req.max_length = finetuning_entry_size;
+    finetuning_req.peft_model_id =
+        (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
+    finetuning_req.peft_finetuning_info.max_training_epochs = 10;
+    finetuning_req.warmup = false;
+    requests.push_back(finetuning_req);
+  }
+  return requests;
+}
+
+void FlexFlow::top_level_task(Task const *task,
+                              std::vector<PhysicalRegion> const &regions,
+                              Context ctx,
+                              Runtime *runtime) {
+  FFConfig ffconfig;
+  if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) {
+    assert(false && "Doesn't support quantization in non-offload mode");
+  }
+  FilePaths file_paths;
+  std::string llm_model_name, peft_model_name;
+  bool use_full_precision = false;
+  bool verbose = false;
+  bool do_sample = false;
+  ffconfig.peft_support_mode = COSERVING;
+  float temperature = 0.0f;
+  float topp = 0.0f;
+  int max_requests_per_batch = 256;
+  int max_tokens_per_batch = 256;
+  int max_sequence_length = 8192;
+  int num_kv_cache_slots = 240000;
+  int rank = 16;
+  std::vector<int> num_layers_per_finetuning_step;
+  std::vector<int> max_fwd_finetuning_tokens;
+
+  InputArgs const &command_args = HighLevelRuntime::get_input_args();
+  char **argv = command_args.argv;
+  int argc = command_args.argc;
+  parse_input_args(argv,
+                   argc,
+                   file_paths,
+                   llm_model_name,
+                   peft_model_name,
+                   use_full_precision,
+                   verbose,
+                   max_requests_per_batch,
+                   max_tokens_per_batch,
+                   max_sequence_length,
+                    num_kv_cache_slots,
+                   num_layers_per_finetuning_step,
+                   max_fwd_finetuning_tokens);
+  // std::cout << "max_fwd_finetuning_tokens: ";
+  // for (int num_tokens : max_fwd_finetuning_tokens) {
+  //   std::cout << num_tokens << " ";
+  // }
+  // std::cout << std::endl;
+  // std::cout << "num_layers_per_finetuning_step: ";
+  // for (int num_layers : num_layers_per_finetuning_step) {
+  //   std::cout << num_layers << " ";
+  // }
+  // std::cout << std::endl;
+
+  assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
+             ffconfig.pipeline_parallelism_degree ==
+         ffconfig.numNodes * ffconfig.workersPerNode);
+
+  std::string config_filepath = join_path(
+      {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"});
+  std::string tokenizer_filepath =
+      join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name});
+  std::string weights_filepath =
+      join_path({file_paths.cache_folder_path,
+                 "weights",
+                 llm_model_name,
+                 use_full_precision ? "full-precision" : "half-precision"});
+  std::ifstream config_file_handle(config_filepath);
+  if (!config_file_handle.good()) {
+    std::cout << "Model config file " << config_filepath << " not found."
+              << std::endl;
+    assert(false);
+  }
+
+  json model_config = json::parse(config_file_handle,
+                                  /*parser_callback_t */ nullptr,
+                                  /*allow_exceptions */ true,
+                                  /*ignore_comments */ true);
+  ModelType model_type = ModelType::UNKNOWN;
+  auto architectures = model_config["architectures"];
+  for (auto const &str : architectures) {
+    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" ||
+        str == "Qwen2ForCausalLM" || str == "MistralForCausalLM") {
+      model_type = ModelType::LLAMA;
+      break;
+    } else if (str == "OPTForCausalLM") {
+      model_type = ModelType::OPT;
+      break;
+    } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") {
+      model_type = ModelType::FALCON;
+      break;
+    } else if (str == "GPTBigCodeForCausalLM") {
+      model_type = ModelType::STARCODER;
+      break;
+    } else if (str == "MPTForCausalLM") {
+      model_type = ModelType::MPT;
+      break;
+    }
+  }
+  int bos_token_id = model_config.find("bos_token_id") == model_config.end()
+                         ? -1
+                         : (int)model_config.at("bos_token_id");
+  // parse eos token id, which can be either a single integer or an array of
+  // integers. Convert to std::vector<int>
+  std::vector<int> eos_token_ids;
+  if (model_config.find("eos_token_id") != model_config.end()) {
+    if (model_config["eos_token_id"].is_array()) {
+      for (auto &eos_token_id : model_config["eos_token_id"]) {
+        eos_token_ids.push_back(eos_token_id);
+      }
+    } else {
+      eos_token_ids.push_back(model_config["eos_token_id"]);
+    }
+  } else {
+    eos_token_ids.push_back(-1);
+  }
+
+  assert(model_type != ModelType::UNKNOWN &&
+         "Invalid LLM model type passed (or no type was passed).");
+
+  // load PEFT config
+  LoraOptimizerConfig *optim_config = new LoraSGDOptimizerConfig(0.001f);
+  std::vector<std::string> target_modules = {"down_proj"};
+  LoraLinearConfig peft_config_finetuning(file_paths.cache_folder_path,
+                                          peft_model_name,
+                                          true /*trainable*/,
+                                          optim_config,
+                                          true /*init_lora_weights*/,
+                                          llm_model_name,
+                                          use_full_precision ? "fp32" : "fp16",
+                                          rank,
+                                          (float)rank,
+                                          0.0f,
+                                          target_modules);
+
+  GenerationConfig generationConfig(do_sample, temperature, topp);
+  RequestManager *rm = RequestManager::get_request_manager();
+  rm->set_verbose(verbose);
+  rm->set_max_requests_per_batch(max_requests_per_batch +
+                                 1); // add one slot for finetuning if needed
+  rm->set_max_concurrent_adapters(1);
+  rm->set_max_tokens_per_batch(max_tokens_per_batch);
+  rm->set_max_sequence_length(max_sequence_length);
+  rm->register_tokenizer(
+      model_type, bos_token_id, eos_token_ids, tokenizer_filepath);
+  rm->register_output_filepath(file_paths.output_file_path);
+  rm->set_peft_support_mode(ffconfig.peft_support_mode);
+  rm->set_max_lora_rank(rank);
+
+  FFModel model(ffconfig, ffconfig.cpu_offload);
+  model.set_num_kv_cache_pages(
+      compute_num_kv_cache_pages_needed(num_kv_cache_slots, 1, false));
+  if (model_type == ModelType::LLAMA) {
+    LLAMA::create_llama_model(model,
+                              config_filepath,
+                              weights_filepath,
+                              INC_DECODING_MODE,
+                              generationConfig,
+                              use_full_precision);
+  } else if (model_type == ModelType::OPT) {
+    OPT::create_opt_model(model,
+                          config_filepath,
+                          weights_filepath,
+                          INC_DECODING_MODE,
+                          use_full_precision);
+  } else if (model_type == ModelType::FALCON) {
+    FALCON::create_falcon_model(model,
+                                config_filepath,
+                                weights_filepath,
+                                INC_DECODING_MODE,
+                                use_full_precision);
+  } else if (model_type == ModelType::STARCODER) {
+    STARCODER::create_starcoder_model(model,
+                                      config_filepath,
+                                      weights_filepath,
+                                      INC_DECODING_MODE,
+                                      generationConfig,
+                                      use_full_precision);
+  } else if (model_type == ModelType::MPT) {
+    MPT::create_mpt_model(model,
+                          config_filepath,
+                          weights_filepath,
+                          INC_DECODING_MODE,
+                          generationConfig,
+                          use_full_precision);
+  } else {
+    assert(false && "unknow model type");
+  }
+  int tot_num_layers_in_model = model.current_transformer_layer_id + 1;
+  rm->set_num_transformer_layers(tot_num_layers_in_model);
+  // if (num_layers_per_finetuning_step > 0) {
+  //   rm->set_num_layers_per_finetuning_step(num_layers_per_finetuning_step);
+  // }
+
+  // Start background server
+  rm->start_background_server(&model);
+
+  PEFTModelID *peft_model_id_finetuning =
+      model.register_peft_adapter(peft_config_finetuning);
+
+  // Run workload
+  {
+    std::cout << "----------warmup started--------------" << std::endl;
+    std::vector<Request> warmup_requests =
+        make_warmup_requests(10, 1000, peft_model_id_finetuning);
+    std::vector<GenerationResult> warmup_result =
+        model.generate(warmup_requests);
+    rm->set_inference_finished(false); // reset inference finished flag
+    std::cout << "----------warmup finished--------------" << std::endl
+              << std::endl
+              << std::endl;
+
+    for (int max_fwd_tokens_per_batch : max_fwd_finetuning_tokens) {
+      rm->set_max_fwd_finetuning_tokens_per_batch(max_fwd_tokens_per_batch);
+      for (int num_bwd_layers_per_step : num_layers_per_finetuning_step) {
+        rm->set_num_layers_per_finetuning_step(num_bwd_layers_per_step);
+        std::cout << "Benchmarking overhead of " << max_fwd_tokens_per_batch
+                  << " fwd tokens and " << num_bwd_layers_per_step
+                  << " bwd layers per step."
+                  << " Run idx: " << rm->run_idx << std::endl;
+        std::vector<Request> requests = make_requests(max_requests_per_batch,
+                                                      4096,
+                                                      max_fwd_tokens_per_batch,
+                                                      tot_num_layers_in_model,
+                                                      num_bwd_layers_per_step,
+                                                      peft_model_id_finetuning);
+        std::vector<GenerationResult> result = model.generate(requests);
+        std::cout << "----------inference finished--------------" << std::endl
+                  << std::endl
+                  << std::endl;
+      }
+    }
+  }
+
+  // terminate the request manager by stopping the background thread
+  rm->terminate_background_server();
+
+  // Execution fence
+  {
+    Future future = runtime->issue_execution_fence(ctx);
+    future.get_void_result();
+  }
+  std::string dataset_name = "overhead_test";
+  std::cout << "Saving profiling info..." << std::endl;
+  rm->save_profiling_info_to_csv(file_paths.profiling_folder_path,
+                                 dataset_name,
+                                 llm_model_name,
+                                 ffconfig.tensor_parallelism_degree,
+                                 max_requests_per_batch,
+                                 max_tokens_per_batch,
+                                 num_kv_cache_slots,
+                                 0.0, // arrival rate
+                                 10); // num_warmup_requests
+
+  if (peft_model_id_finetuning != nullptr) {
+    free(peft_model_id_finetuning);
+  }
+
+  std::cout << "----------inference finished--------------" << std::endl;
+
+  // free tokenizer space in memory
+}
+
+void FlexFlow::register_custom_tasks() {}
\ No newline at end of file
diff --git a/tests/peft_test.sh b/tests/peft_test.sh
index b30fbc80b..296097802 100755
--- a/tests/peft_test.sh
+++ b/tests/peft_test.sh
@@ -43,13 +43,13 @@ mkdir -p ./inference/output
 export LEGION_BACKTRACE=1
 
 # Download test model
-python ./inference/utils/download_peft_model.py "${MODEL_NAME}"
+# python ./inference/utils/download_peft_model.py "${MODEL_NAME}"
 
 if [ "$FULL_PRECISION" = "true" ]; then full_precision_flag="--use-full-precision"; else full_precision_flag=""; fi
 if [ "$FUSION" = "true" ]; then fusion_flag="--fusion"; else fusion_flag=""; fi
 
 # Run PEFT in Huggingface to get ground truth tensors
-eval python ./tests/peft/hf_finetune.py --peft-model-id "${MODEL_NAME}" --save-peft-tensors "${full_precision_flag}" -lr "${LEARNING_RATE}"
+# eval python ./tests/peft/hf_finetune.py --peft-model-id "${MODEL_NAME}" --save-peft-tensors "${full_precision_flag}" -lr "${LEARNING_RATE}"
 
 # Python test
 echo "Python test"
@@ -83,9 +83,9 @@ json_config=$(cat <<-END
 END
 )
 echo "$json_config" > /tmp/peft_config.json
-python ./inference/python/ff_peft.py -config-file /tmp/peft_config.json
+# python ./inference/python/ff_peft.py -config-file /tmp/peft_config.json
 # Check alignment
-python ./tests/peft/peft_alignment_test.py -m "${MODEL_NAME}" -tp "${TP_DEGREE}" -lr "${LEARNING_RATE}"
+# python ./tests/peft/peft_alignment_test.py -m "${MODEL_NAME}" -tp "${TP_DEGREE}" -lr "${LEARNING_RATE}"
 
 # C++ test
 echo "C++ test"

From 53ed02e23bfde29d69afa69576f702ba04694f01 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 15 Apr 2025 01:41:44 -0700
Subject: [PATCH 10/17] fixes

---
 inference/flexllm/overhead_test.cc      |  5 +++--
 src/ops/inc_multihead_self_attention.cu | 17 ++++++++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/inference/flexllm/overhead_test.cc b/inference/flexllm/overhead_test.cc
index 38fc38f25..19a58f77f 100644
--- a/inference/flexllm/overhead_test.cc
+++ b/inference/flexllm/overhead_test.cc
@@ -188,7 +188,7 @@ std::vector<Request> make_requests(int max_requests_per_batch,
   int target_num_steps = 10;
   if (max_fwd_tokens_per_batch > 0) {
     target_num_steps +=
-        10*((finetuning_entry_size + max_fwd_tokens_per_batch - 1) /
+        5*((finetuning_entry_size + max_fwd_tokens_per_batch - 1) /
             max_fwd_tokens_per_batch +
         (tot_llm_layers + bwd_layers_per_step - 1) / bwd_layers_per_step);
   }
@@ -209,7 +209,8 @@ std::vector<Request> make_requests(int max_requests_per_batch,
     finetuning_req.max_length = finetuning_entry_size;
     finetuning_req.peft_model_id =
         (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
-    finetuning_req.peft_finetuning_info.max_training_epochs = 10;
+    finetuning_req.peft_finetuning_info.max_training_epochs = 5;
+    finetuning_req.peft_finetuning_info.num_logging_steps = 10;
     finetuning_req.warmup = false;
     requests.push_back(finetuning_req);
   }
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index c4794cf47..c9e0d19ea 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1087,7 +1087,22 @@ std::vector<at::Tensor> _wrapper_mha_bwd_1(
   flash::set_params_alibi(params, alibi_slopes_, batch_size, num_heads);
 
   if (seqlen_q > 0) {
-    launch(params, stream);
+    try {
+      // code that might raise error
+      launch(params, stream);
+    } catch (const std::exception &e) {
+      fprintf(stderr, "Caught in FlashAttention backward kernel.\n");
+      std::cerr << e.what() << std::endl;
+      // throw; // optional rethrow
+      // assert(false);
+    } catch(const c10::Error& e) {
+        // Print the error to the terminal.
+        fprintf(stderr, "Caught in FlashAttention backward kernel.\n");
+        std::cerr << e.what() << std::endl;
+        // assert(false);
+    }
+
+    
   } else {
     // If seqlen_q == 0, then we have an empty tensor. We need to set the output
     // to 0.

From b55ad7121b7cf9fcf968f4a7b38032e07976af73 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 15 Apr 2025 12:12:30 -0700
Subject: [PATCH 11/17] updates

---
 inference/flexllm/peft_train.cc      |  2 +-
 src/ops/argmax.cu                    | 14 +++++++-------
 src/ops/kernels/embedding_kernels.cu |  2 +-
 src/runtime/inference_manager.cc     | 11 +++++++++++
 src/runtime/request_manager.cc       | 18 +++++++++++++++---
 src/runtime/request_manager.cu       | 10 +++++-----
 6 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/inference/flexllm/peft_train.cc b/inference/flexllm/peft_train.cc
index c2573e5b3..fcbe9ba91 100644
--- a/inference/flexllm/peft_train.cc
+++ b/inference/flexllm/peft_train.cc
@@ -254,7 +254,7 @@ std::vector<Request> load_trace(nlohmann::ordered_json prompt_json,
     } else {
       inference_req.prompt = text;
     }
-    inference_req.max_new_tokens = response_length;
+    inference_req.max_new_tokens = max(response_length, 2);
     inference_req.ignore_eos = true;
     inference_req.arrival_time_us = arrival_time_us;
     requests.push_back(inference_req);
diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu
index a1a508459..d21186619 100644
--- a/src/ops/argmax.cu
+++ b/src/ops/argmax.cu
@@ -71,7 +71,7 @@ __global__ void argmaxKernel(T const *__restrict__ input,
   int thread_idx = -1;
   for (int i = threadIdx.x; i < vocab_size; i += BLOCK_SIZE) {
     float val = toFloat(col_ptr[i]);
-    if (val > thread_max) {
+    if (val > thread_max || (val == thread_max && thread_idx == -1)) {
       thread_max = val;
       thread_idx = i;
     }
@@ -133,12 +133,14 @@ void inference_kernel_spatial_sharing(ArgMaxMeta const *m,
                                       BatchConfig const *bc,
                                       DT const *input_ptr,
                                       int *indices_ptr,
-                                      float *prob_ptr,
-                                      int *parent,
                                       int const num_classes,
                                       float *loss,
                                       cudaStream_t main_stream) {
   assert(!m->beam_search);
+  assert(input_ptr != nullptr);
+  assert(indices_ptr != nullptr);
+  assert(loss != nullptr);
+  assert(bc != nullptr);
   
   // launch finetuning fwd tokens kernel if there are any finetuning fwd tokens
   if (bc->num_finetuning_fwd_tokens() > 0) {
@@ -149,7 +151,7 @@ void inference_kernel_spatial_sharing(ArgMaxMeta const *m,
                       num_classes, 
                       bc->num_finetuning_fwd_tokens(), 
                       indices_ptr + bc->num_inference_tokens(), 
-                      prob_ptr + bc->num_inference_tokens(), 
+                      nullptr, 
                       m->handle.peft_fwd_stream);
 
     // print_tensor(indices_ptr, batch_size, "indices_ptr: ");
@@ -194,7 +196,7 @@ void inference_kernel_spatial_sharing(ArgMaxMeta const *m,
   // launch inference kernel if there are inference tokens
   if (bc->num_inference_tokens() > 0) {
     launchArgmaxKernel(
-      input_ptr, num_classes, bc->num_inference_tokens(), indices_ptr, prob_ptr, main_stream);
+      input_ptr, num_classes, bc->num_inference_tokens(), indices_ptr, nullptr, main_stream);
   }
 
   if (bc->num_finetuning_fwd_tokens() > 0) {
@@ -218,8 +220,6 @@ void ArgMax::inference_kernel(ArgMaxMeta const *m,
                                      bc,
                                      input_ptr,
                                      indices_ptr,
-                                     prob_ptr,
-                                     parent,
                                      num_classes,
                                      loss,
                                      stream);
diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu
index 93ac7ee49..432c732c1 100644
--- a/src/ops/kernels/embedding_kernels.cu
+++ b/src/ops/kernels/embedding_kernels.cu
@@ -211,7 +211,7 @@ void forward_kernel_spatial_sharing(EmbeddingMeta const *m,
   
   // launch inference kernel if there are inference tokens
   if (bc->num_inference_tokens() > 0) {
-    int parallelism = bc->num_finetuning_fwd_tokens() * out_dim;
+    int parallelism = bc->num_inference_tokens() * out_dim;
     embed_forward_no_aggr<TI, TD>
       <<<GET_BLOCKS(parallelism), CUDA_NUM_THREADS, 0, main_stream>>>(
           input_ptr, output_ptr, weight_ptr, out_dim, bc->num_inference_tokens());
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 20ec7d4f3..14e0d2be8 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -302,6 +302,17 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) {
     }
     assert(model->check_operators_integrity(old_operators, &tensor_buffer));
     fprintf(stderr, "%zu operators after fusion...\n", model->operators.size());
+    for (size_t i = 0; i < model->operators.size(); i++) {
+      Op *op = model->operators[i];
+      if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) {
+        continue;
+      }
+      fprintf(stderr,
+          "operator[%zu]: type(%s) guid(%lu)\n",
+          i,
+          get_operator_type_name(model->operators[i]->op_type).c_str(),
+          model->operators[i]->op_guid);
+    }
   }
 
   // print optimized graph
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 052f1ed1d..4213cfaea 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -338,7 +338,9 @@ void RequestManager::set_max_fwd_finetuning_tokens_per_batch(
     int max_num_tokens) {
   max_fwd_finetuning_tokens_per_batch = max_num_tokens;
   assert(max_fwd_finetuning_tokens_per_batch <= BatchConfig::MAX_NUM_TOKENS);
-  assert(max_fwd_finetuning_tokens_per_batch <= max_tokens_per_batch);
+  if (peft_support_mode == COSERVING) {
+    assert(max_fwd_finetuning_tokens_per_batch <= max_tokens_per_batch);
+  }
   // assert(max_fwd_finetuning_tokens_per_batch > 0);
 }
 
@@ -386,6 +388,9 @@ void RequestManager::push_spec_infer_tree_width(int tree_width) {
 
 void RequestManager::set_peft_support_mode(PeftSupportMode peft_support_mode_) {
   peft_support_mode = peft_support_mode_;
+  if (peft_support_mode == SPATIAL_SHARING || peft_support_mode == TEMPORAL_SHARING) {
+    set_max_fwd_finetuning_tokens_per_batch(BatchConfig::MAX_NUM_TOKENS);
+  }
 }
 
 void RequestManager::set_inference_finished(bool finished) {
@@ -1007,8 +1012,15 @@ void RequestManager::process_inf_req_progress(BatchConfig const &old_fwd_bc,
       // This is a decoding token
       assert(old_fwd_bc.tokensInfo[i].abs_depth_in_request + 1 ==
              request.tokens.size());
-      assert(result.token_ids[i] >= 0);
-      request.tokens.push_back(result.token_ids[i]);
+      if (result.token_ids[i] >= 0) {
+        request.tokens.push_back(result.token_ids[i]);
+      } else {
+        // Log the error and use a placeholder token
+        std::cerr << "Error: Encountered negative token ID: " << result.token_ids[i] << std::endl;
+        std::cerr << "Token index: " << i << ", Request GUID: " << request.guid << std::endl;
+        // std::cerr << "Batch Config: " << old_fwd_bc << std::endl;
+        request.tokens.push_back(15); // placeholder token
+      }
       if (!profiling_requests[guid].first_token_time_set) {
         profiling_requests[guid].first_token_time =
             Realm::Clock::current_time_in_microseconds();
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 6644293fa..b3f70adf8 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -45,7 +45,7 @@ void RequestManager::load_tokens_task(
 
   // Extreme long prompts are not supported, only load up to
   // BatchConfig::max_tokens_per_batch() as prompt
-  if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() &&
+  if (batch_config->num_inference_tokens() > BatchConfig::max_tokens_per_batch() &&
       batch_config->get_mode() == INC_DECODING_MODE) {
     printf("Warning: too many tokens in prompt, only load up to %d tokens\n",
            BatchConfig::max_tokens_per_batch());
@@ -62,7 +62,7 @@ void RequestManager::load_tokens_task(
     //     std::cout << "Unable to open file: " << filename << std::endl;
     // }
 
-  } else if (batch_config->num_tokens >
+  } else if (batch_config->num_inference_tokens() >
                  BatchConfig::max_verify_tokens_per_batch() &&
              batch_config->get_mode() != INC_DECODING_MODE) {
     printf("Warning: Speculative decoding. too many tokens in prompt, only "
@@ -71,19 +71,19 @@ void RequestManager::load_tokens_task(
     printf("Got: %d tokens\n", batch_config->num_tokens);
   }
 
-  for (int i = 0; i < batch_config->num_tokens; i++) {
+  for (int i = 0; i < batch_config->num_active_tokens(); i++) {
     dram_copy[i] = batch_config->tokensInfo[i].token_id;
   }
   TokenId *fb_ptr = helperGetTensorPointerWO<TokenId>(
       regions[0], task->regions[0], FID_DATA, ctx, runtime);
   Domain domain = runtime->get_index_space_domain(
       ctx, task->regions[0].region.get_index_space());
-  assert(batch_config->num_tokens <= domain.get_volume());
+  assert(batch_config->num_active_tokens() <= domain.get_volume());
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   checkCUDA(cudaMemcpyAsync(fb_ptr,
                             dram_copy,
-                            sizeof(TokenId) * batch_config->num_tokens,
+                            sizeof(TokenId) * batch_config->num_active_tokens(),
                             cudaMemcpyHostToDevice,
                             stream));
 }

From 579200ad85e9c08308ca9428734fadb785329501 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 15 Apr 2025 13:09:27 -0700
Subject: [PATCH 12/17] update

---
 inference/flexllm/peft_train.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/inference/flexllm/peft_train.cc b/inference/flexllm/peft_train.cc
index fcbe9ba91..a2ff53b16 100644
--- a/inference/flexllm/peft_train.cc
+++ b/inference/flexllm/peft_train.cc
@@ -523,6 +523,10 @@ void FlexFlow::top_level_task(Task const *task,
     std::vector<Request> inference_requests;
     if (!file_paths.prompt_file_path.empty()) {
       inference_requests = load_requests(file_paths.prompt_file_path, 128);
+      // cap number of inference requests to 2048 for spatial sharing, otherwise it will be too time consuming
+      if (ffconfig.peft_support_mode == TEMPORAL_SHARING && inference_requests.size() > 2048) {
+        inference_requests.resize(2048);
+      }
     }
 
     // Add fine-tuning request

From e9620321ec66350b1c7c4a183dfc1684d33bf416 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 15 Apr 2025 19:02:50 -0700
Subject: [PATCH 13/17] update

---
 inference/flexllm/peft_train.cc | 13 +++++--------
 src/ops/argmax.cu               | 24 +++++++++++++-----------
 src/ops/kernels/softmax.cu      | 31 ++++++++++++++++++++++++-------
 3 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/inference/flexllm/peft_train.cc b/inference/flexllm/peft_train.cc
index a2ff53b16..bee76e998 100644
--- a/inference/flexllm/peft_train.cc
+++ b/inference/flexllm/peft_train.cc
@@ -524,7 +524,7 @@ void FlexFlow::top_level_task(Task const *task,
     if (!file_paths.prompt_file_path.empty()) {
       inference_requests = load_requests(file_paths.prompt_file_path, 128);
       // cap number of inference requests to 2048 for spatial sharing, otherwise it will be too time consuming
-      if (ffconfig.peft_support_mode == TEMPORAL_SHARING && inference_requests.size() > 2048) {
+      if ((ffconfig.peft_support_mode == TEMPORAL_SHARING || ffconfig.peft_support_mode == SPATIAL_SHARING) && inference_requests.size() > 2048) {
         inference_requests.resize(2048);
       }
     }
@@ -566,13 +566,10 @@ void FlexFlow::top_level_task(Task const *task,
   if (!file_paths.profiling_folder_path.empty()) {
     std::cout << "Saving profiling info..." << std::endl;
     std::string dataset_name;
-    // set dataset name to "wildchat" if the prompt file path contains
-    // "wildchat"
-    if (file_paths.prompt_file_path.find("wildchat") != std::string::npos) {
-      dataset_name = "wildchat";
-    } else if (file_paths.prompt_file_path.find("sharegpt") !=
-               std::string::npos) {
-      dataset_name = "sharegpt";
+    if (!file_paths.prompt_file_path.empty()) {
+      // Extract just the filename from the path without extension
+      std::filesystem::path p(file_paths.prompt_file_path);
+      dataset_name = p.filename().stem().string();
     } else {
       dataset_name = "unknown";
     }
diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu
index d21186619..58ae0c6aa 100644
--- a/src/ops/argmax.cu
+++ b/src/ops/argmax.cu
@@ -68,11 +68,11 @@ __global__ void argmaxKernel(T const *__restrict__ input,
 
   // Each thread processes a subset of the column.
   float thread_max = -FLT_MAX;
-  int thread_idx = -1;
+  int thread_idx = 0;
   for (int i = threadIdx.x; i < vocab_size; i += BLOCK_SIZE) {
     float val = toFloat(col_ptr[i]);
-    if (val > thread_max || (val == thread_max && thread_idx == -1)) {
-      thread_max = val;
+    if (val > thread_max || isnan(val)) {
+      thread_max = isnan(val) ? -FLT_MAX : val; // Handle NaN values
       thread_idx = i;
     }
   }
@@ -147,12 +147,12 @@ void inference_kernel_spatial_sharing(ArgMaxMeta const *m,
     checkCUDA(cudaEventRecord(m->handle.peft_fwd_can_start, main_stream)); 
     checkCUDA(cudaStreamWaitEvent(m->handle.peft_fwd_stream, m->handle.peft_fwd_can_start, 0));
 
-    launchArgmaxKernel(input_ptr + num_classes * bc->num_inference_tokens(), 
-                      num_classes, 
-                      bc->num_finetuning_fwd_tokens(), 
-                      indices_ptr + bc->num_inference_tokens(), 
-                      nullptr, 
-                      m->handle.peft_fwd_stream);
+    // launchArgmaxKernel(input_ptr + num_classes * bc->num_inference_tokens(), 
+    //                   num_classes, 
+    //                   bc->num_finetuning_fwd_tokens(), 
+    //                   indices_ptr + bc->num_inference_tokens(), 
+    //                   nullptr, 
+    //                   m->handle.peft_fwd_stream);
 
     // print_tensor(indices_ptr, batch_size, "indices_ptr: ");
 
@@ -231,8 +231,10 @@ void ArgMax::inference_kernel(ArgMaxMeta const *m,
     checkCUDA(cudaMemsetAsync(parent, 0, bc->num_active_tokens() * sizeof(int), stream));
   }
 
-  launchArgmaxKernel(
-      input_ptr, num_classes, bc->num_active_tokens(), indices_ptr, prob_ptr, stream);
+  if (bc->num_inference_tokens() > 0) {
+    launchArgmaxKernel(
+        input_ptr, num_classes, bc->num_inference_tokens(), indices_ptr, prob_ptr, stream);
+  }
 
   // print_tensor(indices_ptr, batch_size, "indices_ptr: ");
 
diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu
index 95d0e6340..caa6827f0 100644
--- a/src/ops/kernels/softmax.cu
+++ b/src/ops/kernels/softmax.cu
@@ -147,6 +147,9 @@ void inference_kernel_wrapper(SoftmaxMeta *m,
     cudaEventCreate(&t_end);
     cudaEventRecord(t_start, stream);
   }
+  if (bc->num_active_tokens() <= 0) {
+    return;
+  }
   int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1;
   if (m->output_type[0] == DT_FLOAT) {
     Internal::inference_kernel(m,
@@ -336,13 +339,27 @@ void inference_kernel(SoftmaxMeta const *m,
   checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
   float alpha = 1.0f, beta = 0.0f;
   cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
-  checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor,
-                                        CUDNN_TENSOR_NCHW,
-                                        cudnn_data_type,
-                                        bc->num_active_tokens(),
-                                        num_classes,
-                                        1,
-                                        1));
+  // fprintf(stderr, "Error in cudnnSetTensor4dDescriptor: %s\n", e.what());
+  // printf("num_active_tokens: %d, num_classes: %d\n",
+  //         bc->num_active_tokens(), num_classes);
+  // printf("input_ptr: %p, output_ptr: %p\n", input_ptr, output_ptr);
+  // std::cerr << "bc: " << *bc << std::endl;
+  // try {
+    checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor,
+                                          CUDNN_TENSOR_NCHW,
+                                          cudnn_data_type,
+                                          bc->num_active_tokens(),
+                                          num_classes,
+                                          1,
+                                          1));
+  // } catch (const std::exception &e) {
+  //   fprintf(stderr, "Error in cudnnSetTensor4dDescriptor: %s\n", e.what());
+  //   fprintf(stderr, "num_active_tokens: %d, num_classes: %d\n",
+  //           bc->num_active_tokens(), num_classes);
+  //   fprintf(stderr, "input_ptr: %p, output_ptr: %p\n", input_ptr, output_ptr);
+  //   std::cerr << "bc: " << *bc << std::endl;
+  //   assert(false);
+  // }
   checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
                                  CUDNN_SOFTMAX_ACCURATE,
                                  CUDNN_SOFTMAX_MODE_CHANNEL,

From 051333f3a39348f020520c350f41185d9d1e1057 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Wed, 16 Apr 2025 03:56:32 -0700
Subject: [PATCH 14/17] update

---
 inference/incr_decoding/incr_decoding.cc |  2 +-
 src/ops/sigmoid_silu_multi.cu            | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 09c79ce51..b816d7a5a 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -211,7 +211,7 @@ std::vector<Request> load_trace(nlohmann::ordered_json prompt_json,
       inference_req.prompt = text;
     }
     inference_req.arrival_time_us = arrival_time;
-    inference_req.max_new_tokens = response_length;
+    inference_req.max_new_tokens = max(response_length, 2);
     arrival_time += interarrival_time;
     requests.push_back(inference_req);
   }
diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu
index 6fb13f9b8..55d244fa6 100644
--- a/src/ops/sigmoid_silu_multi.cu
+++ b/src/ops/sigmoid_silu_multi.cu
@@ -99,9 +99,10 @@ void SigmoidSiluMulti::inference_kernel_wrapper(
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
-  int num_elements = input1.domain.get_volume();
-  assert(input2.domain.get_volume() == num_elements);
-  assert(output.domain.get_volume() == num_elements);
+  int num_elements = bc->num_active_tokens() *
+                     (input1.domain.hi()[0] - input1.domain.lo()[0] + 1);
+  // assert(input2.domain.get_volume() == num_elements);
+  // assert(output.domain.get_volume() == num_elements);
 
   cudaEvent_t t_start, t_end;
   if (m->profiling) {
@@ -162,7 +163,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper(
     SigmoidSiluMultiKernel<<<GET_BLOCKS(num_elements),
                              min(CUDA_NUM_THREADS, num_elements),
                              0,
-                             stream>>>(input1.domain.get_volume(),
+                             stream>>>(num_elements,
                                        input1.get_float_ptr(),
                                        input2.get_float_ptr(),
                                        output.get_float_ptr());
@@ -170,7 +171,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper(
     SigmoidSiluMultiKernel<<<GET_BLOCKS(num_elements),
                              min(CUDA_NUM_THREADS, num_elements),
                              0,
-                             stream>>>(input1.domain.get_volume(),
+                             stream>>>(num_elements,
                                        input1.get_half_ptr(),
                                        input2.get_half_ptr(),
                                        output.get_half_ptr());

From 787a4ba2bb2d1627076f90b194b97c468aa833e9 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Thu, 17 Apr 2025 16:19:24 -0700
Subject: [PATCH 15/17] update

---
 inference/flexllm/peft_train.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/inference/flexllm/peft_train.cc b/inference/flexllm/peft_train.cc
index bee76e998..907b730bd 100644
--- a/inference/flexllm/peft_train.cc
+++ b/inference/flexllm/peft_train.cc
@@ -524,8 +524,10 @@ void FlexFlow::top_level_task(Task const *task,
     if (!file_paths.prompt_file_path.empty()) {
       inference_requests = load_requests(file_paths.prompt_file_path, 128);
       // cap number of inference requests to 2048 for spatial sharing, otherwise it will be too time consuming
-      if ((ffconfig.peft_support_mode == TEMPORAL_SHARING || ffconfig.peft_support_mode == SPATIAL_SHARING) && inference_requests.size() > 2048) {
-        inference_requests.resize(2048);
+      if (ffconfig.peft_support_mode == SPATIAL_SHARING && inference_requests.size() > 1024) {
+        inference_requests.resize(1024);
+      } else if (ffconfig.peft_support_mode == TEMPORAL_SHARING && inference_requests.size() > 1024) {
+        inference_requests.resize(1024);
       }
     }
 

From 761d1d1f305dcf61da71917c0d2bacc2e16d4ae6 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Wed, 23 Apr 2025 15:44:51 -0700
Subject: [PATCH 16/17] new spatial and temporal sharing baselines

---
 include/flexflow/ffconst.h                   | 10 ++++
 include/flexflow/request_manager.h           |  1 +
 inference/flexllm/peft_train.cc              | 12 +++--
 inference/peft/peft.cc                       |  6 +++
 python/flexflow/type.py                      |  3 ++
 src/ops/argmax.cu                            |  2 +-
 src/ops/kernels/embedding_kernels.cu         |  2 +-
 src/ops/kernels/linear_kernels.cu            |  2 +-
 src/ops/kernels/residual_rms_norm_kernels.cu |  2 +-
 src/ops/kernels/rms_norm_kernels.cu          |  2 +-
 src/ops/kernels/softmax.cu                   |  2 +-
 src/runtime/ffconst_utils.cc                 | 20 ++++---
 src/runtime/request_manager.cc               | 55 +++++++++++++-------
 13 files changed, 82 insertions(+), 37 deletions(-)

diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h
index 18e19cfa9..7925a18a6 100644
--- a/include/flexflow/ffconst.h
+++ b/include/flexflow/ffconst.h
@@ -86,10 +86,20 @@ enum RequestType {
 
 enum PeftSupportMode {
   PEFT_DISABLED = 5001,
+  // no finetuning supported
   PEFT_INFERENCE_ONLY = 5002,
+  // finetuning fwd limited by max tokens per batch, bwd layers limited to 1
   COSERVING = 5003,
+  // finetuning fwd/bwd unlimited, alternating inference and finetuning batches
   TEMPORAL_SHARING = 5004,
+  // finetuning fwd/bwd unlimited, inference and finetuning work in the same batch (different kernels)
   SPATIAL_SHARING = 5005,
+  // finetuning fwd limited by max tokens per batch, bwd layers limited to 1. Inference and finetuning work in the same batch (different kernels)
+  SPATIAL_SHARING_LIMITED = 5006,
+  // finetuning fwd limited by max tokens per batch, bwd layers limited to 1. Alternating inference and finetuning batches
+  TEMPORAL_SHARING_LIMITED = 5007,
+  // finetuning fwd/bwd unlimited, inference and finetuning work in separate Legion tasks
+  SPATIAL_SHARING_SEPARATE_TASKS = 5008,
 };
 
 // This is consistent with TASO's OpType
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 596916377..31f913d5c 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -419,6 +419,7 @@ class RequestManager {
   PeftSupportMode peft_support_mode = PEFT_DISABLED;
   PeftTemporalSharingState peft_temporal_sharing_state =
       PeftTemporalSharingState::INFERENCE;
+  int peft_temporal_sharing_inf_step = 0;
   BatchConfig ts_saved_old_batch;
   bool inference_finished = false;
   int num_transformer_layers = 0;
diff --git a/inference/flexllm/peft_train.cc b/inference/flexllm/peft_train.cc
index 907b730bd..84ecf16e1 100644
--- a/inference/flexllm/peft_train.cc
+++ b/inference/flexllm/peft_train.cc
@@ -177,6 +177,12 @@ void parse_input_args(char **argv,
         peft_support_mode = TEMPORAL_SHARING;
       } else if (mode == "spatial_sharing" || mode == "spatial-sharing") {
         peft_support_mode = SPATIAL_SHARING;
+      } else if (mode == "spatial_sharing_limited" || mode == "spatial-sharing-limited") {
+        peft_support_mode = SPATIAL_SHARING_LIMITED;
+      } else if (mode == "temporal_sharing_limited" || mode == "temporal-sharing-limited") {
+        peft_support_mode = TEMPORAL_SHARING_LIMITED;
+      } else if (mode == "spatial_sharing_separate_tasks" || mode == "spatial-sharing-separate-tasks") {
+        peft_support_mode = SPATIAL_SHARING_SEPARATE_TASKS;
       } else {
         std::cerr << "Unknown peft support mode: " << mode << std::endl;
         assert(false && "Invalid peft support mode");
@@ -524,10 +530,8 @@ void FlexFlow::top_level_task(Task const *task,
     if (!file_paths.prompt_file_path.empty()) {
       inference_requests = load_requests(file_paths.prompt_file_path, 128);
       // cap number of inference requests to 2048 for spatial sharing, otherwise it will be too time consuming
-      if (ffconfig.peft_support_mode == SPATIAL_SHARING && inference_requests.size() > 1024) {
-        inference_requests.resize(1024);
-      } else if (ffconfig.peft_support_mode == TEMPORAL_SHARING && inference_requests.size() > 1024) {
-        inference_requests.resize(1024);
+      if ((ffconfig.peft_support_mode == SPATIAL_SHARING || ffconfig.peft_support_mode == TEMPORAL_SHARING || ffconfig.peft_support_mode == SPATIAL_SHARING_LIMITED || ffconfig.peft_support_mode == TEMPORAL_SHARING_LIMITED || ffconfig.peft_support_mode == SPATIAL_SHARING_SEPARATE_TASKS) && inference_requests.size() > 2048) {
+        inference_requests.resize(2048);
       }
     }
 
diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc
index 6ad8d628e..567818acd 100644
--- a/inference/peft/peft.cc
+++ b/inference/peft/peft.cc
@@ -81,6 +81,12 @@ void parse_input_args(char **argv,
         peft_support_mode = TEMPORAL_SHARING;
       } else if (mode == "spatial_sharing" || mode == "spatial-sharing") {
         peft_support_mode = SPATIAL_SHARING;
+      } else if (mode == "spatial_sharing_limited" || mode == "spatial-sharing-limited") {
+        peft_support_mode = SPATIAL_SHARING_LIMITED;
+      } else if (mode == "temporal_sharing_limited" || mode == "temporal-sharing-limited") {
+        peft_support_mode = TEMPORAL_SHARING_LIMITED;
+      } else if (mode == "spatial_sharing_separate_tasks" || mode == "spatial-sharing-separate-tasks") {
+        peft_support_mode = SPATIAL_SHARING_SEPARATE_TASKS;
       } else {
         std::cerr << "Unknown peft support mode: " << mode << std::endl;
         assert(false && "Invalid peft support mode");
diff --git a/python/flexflow/type.py b/python/flexflow/type.py
index 0db7ce39c..092a85a40 100644
--- a/python/flexflow/type.py
+++ b/python/flexflow/type.py
@@ -169,6 +169,9 @@ class PeftSupportMode(Enum):
     COSERVING = 5003
     TEMPORAL_SHARING = 5004
     SPATIAL_SHARING = 5005
+    SPATIAL_SHARING_LIMITED = 5006
+    TEMPORAL_SHARING_LIMITED = 5007
+    SPATIAL_SHARING_SEPARATE_TASKS = 5008
     def __str__(self):
         return self.name
 
diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu
index 58ae0c6aa..c28f529b2 100644
--- a/src/ops/argmax.cu
+++ b/src/ops/argmax.cu
@@ -215,7 +215,7 @@ void ArgMax::inference_kernel(ArgMaxMeta const *m,
                             int const num_classes,
                             float *loss,
                             cudaStream_t stream) {
-  if (m->peft_support_mode == SPATIAL_SHARING) {
+  if (m->peft_support_mode == SPATIAL_SHARING || m->peft_support_mode == SPATIAL_SHARING_LIMITED) {
     inference_kernel_spatial_sharing(m,
                                      bc,
                                      input_ptr,
diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu
index 432c732c1..07b317735 100644
--- a/src/ops/kernels/embedding_kernels.cu
+++ b/src/ops/kernels/embedding_kernels.cu
@@ -235,7 +235,7 @@ void forward_kernel(EmbeddingMeta const *m,
                     // AggrMode aggr,
                     // int outputSize,
                     cudaStream_t stream) {
-  if (m->peft_support_mode == SPATIAL_SHARING) {
+  if (m->peft_support_mode == SPATIAL_SHARING || m->peft_support_mode == SPATIAL_SHARING_LIMITED) {
     forward_kernel_spatial_sharing<TI, TD>(m, bc, input_ptr, output_ptr, weight_ptr, in_dim, out_dim, stream);
     return;
   }
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index e8fd7fb81..a77c56584 100644
--- a/src/ops/kernels/linear_kernels.cu
+++ b/src/ops/kernels/linear_kernels.cu
@@ -505,7 +505,7 @@ void inference_kernel(LinearMeta const *m,
                       int in_dim,
                       int out_dim,
                       ffStream_t stream) {
-  if (m->peft_support_mode == SPATIAL_SHARING) {
+  if (m->peft_support_mode == SPATIAL_SHARING || m->peft_support_mode == SPATIAL_SHARING_LIMITED) {
     inference_kernel_spatial_sharing<DT>(m, bc, input_ptr, output_ptr, weight_ptr, bias_ptr, in_dim, out_dim, stream);
     return;
   }
diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 2492fb6de..6925d4a14 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -207,7 +207,7 @@ void inference_kernel(ResidualRMSNormMeta const *m,
                       T *residual_output_ptr,
                       T *output_ptr,
                       cudaStream_t stream) {
-  if (m->peft_support_mode == SPATIAL_SHARING) {
+  if (m->peft_support_mode == SPATIAL_SHARING || m->peft_support_mode == SPATIAL_SHARING_LIMITED) {
     inference_kernel_spatial_sharing(m, bc, input1_ptr, input2_ptr, weight_ptr, residual_output_ptr, output_ptr, stream);
     return;
   }
diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu
index b50de4bb1..5e610752d 100644
--- a/src/ops/kernels/rms_norm_kernels.cu
+++ b/src/ops/kernels/rms_norm_kernels.cu
@@ -191,7 +191,7 @@ void inference_kernel(RMSNormMeta const *m,
                       T const *weight_ptr,
                       T *output_ptr,
                       cudaStream_t stream) {
-  if (m->peft_support_mode == SPATIAL_SHARING) {
+  if (m->peft_support_mode == SPATIAL_SHARING || m->peft_support_mode == SPATIAL_SHARING_LIMITED) {
     inference_kernel_spatial_sharing(m, bc, input_ptr, weight_ptr, output_ptr, stream);
     return;
   }
diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu
index caa6827f0..e278bfb7f 100644
--- a/src/ops/kernels/softmax.cu
+++ b/src/ops/kernels/softmax.cu
@@ -332,7 +332,7 @@ void inference_kernel(SoftmaxMeta const *m,
                       DT *output_ptr,
                       int num_classes,
                       cudaStream_t stream) {
-  if (m->peft_support_mode == SPATIAL_SHARING) {
+  if (m->peft_support_mode == SPATIAL_SHARING || m->peft_support_mode == SPATIAL_SHARING_LIMITED) {
     inference_kernel_spatial_sharing(m, bc, input_ptr, output_ptr, num_classes, stream);
     return;
   }
diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc
index 613d2b06d..9ecd5bd98 100644
--- a/src/runtime/ffconst_utils.cc
+++ b/src/runtime/ffconst_utils.cc
@@ -250,18 +250,24 @@ std::ostream &operator<<(std::ostream &s, OperatorType op_type) {
 
 const char* peftSupportModeToString(const PeftSupportMode mode) {
   switch(mode) {
-    case PEFT_DISABLED:         return "PEFT_DISABLED";
-    case PEFT_INFERENCE_ONLY:     return "PEFT_INFERENCE_ONLY";
-    case COSERVING:               return "COSERVING";
-    case TEMPORAL_SHARING:        return "TEMPORAL_SHARING";
-    case SPATIAL_SHARING:         return "SPATIAL_SHARING";
-    default:                      return "UNKNOWN";
+    case PEFT_DISABLED:                   return "PEFT_DISABLED";
+    case PEFT_INFERENCE_ONLY:             return "PEFT_INFERENCE_ONLY";
+    case COSERVING:                       return "COSERVING";
+    case TEMPORAL_SHARING:                return "TEMPORAL_SHARING";
+    case SPATIAL_SHARING:                 return "SPATIAL_SHARING";
+    case SPATIAL_SHARING_LIMITED:         return "SPATIAL_SHARING_LIMITED";
+    case TEMPORAL_SHARING_LIMITED:        return "TEMPORAL_SHARING_LIMITED";
+    case SPATIAL_SHARING_SEPARATE_TASKS:  return "SPATIAL_SHARING_SEPARATE_TASKS";
+    default:                              return "UNKNOWN";
   }
 }
 bool peft_finetuning_enabled(const PeftSupportMode peft_support_mode) {
   return peft_support_mode == COSERVING ||
          peft_support_mode == TEMPORAL_SHARING ||
-         peft_support_mode == SPATIAL_SHARING;
+         peft_support_mode == SPATIAL_SHARING ||
+         peft_support_mode == SPATIAL_SHARING_LIMITED ||
+         peft_support_mode == TEMPORAL_SHARING_LIMITED ||
+         peft_support_mode == SPATIAL_SHARING_SEPARATE_TASKS;
 }
 bool peft_enabled(const PeftSupportMode peft_support_mode) {
   return peft_support_mode != PEFT_DISABLED;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4213cfaea..82950d508 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -347,7 +347,7 @@ void RequestManager::set_max_fwd_finetuning_tokens_per_batch(
 int RequestManager::get_max_fwd_finetuning_tokens_per_batch() {
   // assert(max_fwd_finetuning_tokens_per_batch > 0 &&
   // max_fwd_finetuning_tokens_per_batch <= max_tokens_per_batch);
-  if (peft_support_mode == SPATIAL_SHARING || peft_support_mode == TEMPORAL_SHARING) {
+  if (peft_support_mode == SPATIAL_SHARING || peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING_SEPARATE_TASKS) {
     assert(max_fwd_finetuning_tokens_per_batch == BatchConfig::MAX_NUM_TOKENS);
   } else {
     assert(max_fwd_finetuning_tokens_per_batch < BatchConfig::MAX_NUM_TOKENS);
@@ -388,7 +388,7 @@ void RequestManager::push_spec_infer_tree_width(int tree_width) {
 
 void RequestManager::set_peft_support_mode(PeftSupportMode peft_support_mode_) {
   peft_support_mode = peft_support_mode_;
-  if (peft_support_mode == SPATIAL_SHARING || peft_support_mode == TEMPORAL_SHARING) {
+  if (peft_support_mode == SPATIAL_SHARING || peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING_SEPARATE_TASKS) {
     set_max_fwd_finetuning_tokens_per_batch(BatchConfig::MAX_NUM_TOKENS);
   }
 }
@@ -773,7 +773,8 @@ void RequestManager::check_new_bc(BatchConfig const &new_bc) {
   }
 
   switch(peft_support_mode) {
-    case SPATIAL_SHARING: {
+    case SPATIAL_SHARING: 
+    case SPATIAL_SHARING_SEPARATE_TASKS: {
       break;
     } 
     case TEMPORAL_SHARING: {
@@ -788,7 +789,9 @@ void RequestManager::check_new_bc(BatchConfig const &new_bc) {
         assert(new_bc.num_finetuning_fwd_tokens() == 0);
       }
       break;
-    } 
+    }
+    case SPATIAL_SHARING_LIMITED:
+    case TEMPORAL_SHARING_LIMITED:
     case COSERVING: {
       assert(new_bc.num_active_tokens() <= max_tokens_per_batch);
       break;
@@ -832,13 +835,19 @@ BatchConfig RequestManager::prepare_next_batch_task(
 }
 
 void RequestManager::update_peft_temporal_sharing_state(void) {
-  assert(peft_support_mode == TEMPORAL_SHARING);
+  assert(peft_support_mode == TEMPORAL_SHARING || 
+         peft_support_mode == TEMPORAL_SHARING_LIMITED);
   if (peft_temporal_sharing_state == INFERENCE) {
-    peft_temporal_sharing_state = FINETUNING_FWD;
+    peft_temporal_sharing_inf_step++;
+    if (peft_temporal_sharing_inf_step >= 10 || peft_support_mode == TEMPORAL_SHARING) {
+      peft_temporal_sharing_inf_step = 0;
+      peft_temporal_sharing_state = FINETUNING_FWD;
+    }
   } else if (peft_temporal_sharing_state == FINETUNING_FWD) {
     peft_temporal_sharing_state = FINETUNING_BWD;
   } else if (peft_temporal_sharing_state == FINETUNING_BWD) {  
     peft_temporal_sharing_state = INFERENCE;
+    peft_temporal_sharing_inf_step = 0;
   } else {
     assert(false && "Invalid temporal sharing state");
   }
@@ -1409,7 +1418,7 @@ void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) {
   assert(peft_finetuning_enabled(peft_support_mode) && "PEFT finetuning is not enabled");
   assert(!pending_peft_request_queue.empty() &&
          "Trying to add a new finetuning request when there are none");
-  if (peft_support_mode != TEMPORAL_SHARING && peft_support_mode != SPATIAL_SHARING) {
+  if (peft_support_mode != TEMPORAL_SHARING && peft_support_mode != SPATIAL_SHARING && peft_support_mode != SPATIAL_SHARING_SEPARATE_TASKS) {
     assert(new_bc.num_tokens < get_max_tokens_per_batch() &&
           "Trying to add a new finetuning request when the batch is full");
   }
@@ -1434,7 +1443,7 @@ void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) {
   int batch_capacity_left =
       std::min(get_max_fwd_finetuning_tokens_per_batch(),
                get_max_tokens_per_batch() - new_bc.num_active_tokens());
-  if (peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING) {
+  if (peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING || peft_support_mode == SPATIAL_SHARING_SEPARATE_TASKS) {
     assert(get_max_fwd_finetuning_tokens_per_batch() == BatchConfig::MAX_NUM_TOKENS);
     batch_capacity_left =
       std::min(get_max_fwd_finetuning_tokens_per_batch(),
@@ -1488,7 +1497,7 @@ void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) {
   assert(peft_finetuning_enabled(peft_support_mode) && "PEFT finetuning is not enabled");
   assert(!pending_peft_request_queue.empty() &&
          "Trying to add a new finetuning request when there are none");
-  if (peft_support_mode != TEMPORAL_SHARING && peft_support_mode != SPATIAL_SHARING) {
+  if (peft_support_mode != TEMPORAL_SHARING && peft_support_mode != SPATIAL_SHARING && peft_support_mode != SPATIAL_SHARING_SEPARATE_TASKS) {
     assert(new_bc.num_tokens <= get_max_tokens_per_batch() &&
           "Trying to add a new finetuning request when the batch is full");
   }
@@ -1530,7 +1539,7 @@ void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) {
   new_bc.requestsInfo[inference_batch_size].finetuning_backward_phase = true;
 
   int num_layers_per_finetuning_step = get_num_layers_per_finetuning_step();
-  if (peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING) {
+  if (peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING || peft_support_mode == SPATIAL_SHARING_SEPARATE_TASKS) {
     num_layers_per_finetuning_step = get_num_transformer_layers();
   }
 
@@ -1783,7 +1792,8 @@ void RequestManager::process_work_from_old_batch(
   if (peft_finetuning_enabled(peft_support_mode)) {
     process_finetuning_req_fwd_progress(old_bc, result);
     process_finetuning_req_bwd_progress(old_bc);
-    if (peft_support_mode == TEMPORAL_SHARING) {
+    if (peft_support_mode == TEMPORAL_SHARING ||
+        peft_support_mode == TEMPORAL_SHARING_LIMITED) {
       update_peft_temporal_sharing_state();
     }
   }
@@ -1869,27 +1879,32 @@ BatchConfig
   // Step 1: Create new batch config
   BatchConfig new_bc;
 
-  if (peft_support_mode != TEMPORAL_SHARING) {
+  if (peft_support_mode != TEMPORAL_SHARING && 
+      peft_support_mode != TEMPORAL_SHARING_LIMITED) {
     add_inference_work_if_needed(new_bc, old_bc);
   } else {
     // old_bc is only allowed to have inference tokens if we just finished a INFERENCE phase
-    if (old_bc.num_inference_tokens() > 0) {
-      assert(peft_temporal_sharing_state == FINETUNING_FWD &&
-             "Old batch should not have inference tokens");
-    }
+    // if (old_bc.num_inference_tokens() > 0) {
+    //   // assert(peft_temporal_sharing_state == FINETUNING_FWD &&
+    //   //        "Old batch should not have inference tokens");
+    // }
     if (peft_temporal_sharing_state == INFERENCE) {
-      add_inference_work_if_needed(new_bc, ts_saved_old_batch);
+      if (peft_temporal_sharing_inf_step == 0) {
+        add_inference_work_if_needed(new_bc, ts_saved_old_batch);
+      } else {
+        add_inference_work_if_needed(new_bc, old_bc);
+      }
     } else if (peft_temporal_sharing_state == FINETUNING_FWD) {
-      // if we just finished a finetuning fwd phase, we need to save the old batch for later
+      // if we just finished the inference phase, we need to save the old batch for later
       ts_saved_old_batch = old_bc;
     }
   }
 
   // Step 4: add finetuning fwd tokens, if there is additional space
   int slots_available_for_peft_fwd = 0;
-  if (peft_support_mode == COSERVING) {
+  if (peft_support_mode == COSERVING || peft_support_mode == TEMPORAL_SHARING_LIMITED || peft_support_mode == SPATIAL_SHARING_LIMITED) {
     slots_available_for_peft_fwd = get_max_tokens_per_batch() - new_bc.num_tokens;
-  } else if (peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING) {
+  } else if (peft_support_mode == TEMPORAL_SHARING || peft_support_mode == SPATIAL_SHARING || peft_support_mode == SPATIAL_SHARING_SEPARATE_TASKS) {
     slots_available_for_peft_fwd = BatchConfig::MAX_NUM_TOKENS - new_bc.num_tokens;
   }
   assert(slots_available_for_peft_fwd >= 0);

From 37250a4c3dcd57307dd17c38504ddf77ebbf16b5 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Thu, 8 May 2025 12:14:56 -0700
Subject: [PATCH 17/17] backup

---
 include/flexflow/request_manager.h |  4 ++
 inference/flexllm/peft_train.cc    |  8 +++
 src/ops/kernels/softmax.cu         |  2 +-
 src/runtime/request_manager.cc     | 84 ++++++++++++++++++++++++------
 4 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 31f913d5c..d8e98a060 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -215,6 +215,9 @@ class RequestManager {
   int get_num_transformer_layers();
   void set_num_layers_per_finetuning_step(int num_layers_per_finetuning_step);
   int get_num_layers_per_finetuning_step();
+  void set_temporal_sharing_frequency(int temporal_sharing_frequency);
+  int get_temporal_sharing_frequency();
+
   void initBitMask(BatchConfig::BitMask &bitmask, int initLength);
   void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength);
   void appendBitMask(BatchConfig::BitMask &bitmask,
@@ -424,6 +427,7 @@ class RequestManager {
   bool inference_finished = false;
   int num_transformer_layers = 0;
   int num_layers_per_finetuning_step = 0;
+  int temporal_sharing_frequency = 10;
 
   // tree width in each speculative step, if not specified 1
   std::vector<int> spec_infer_tree_width;
diff --git a/inference/flexllm/peft_train.cc b/inference/flexllm/peft_train.cc
index 84ecf16e1..2c33596a8 100644
--- a/inference/flexllm/peft_train.cc
+++ b/inference/flexllm/peft_train.cc
@@ -58,6 +58,7 @@ void parse_input_args(char **argv,
                       int &gradient_accumulation_steps,
                       int &num_logging_steps,
                       int &num_layers_per_finetuning_step,
+                      int &temporal_sharing_frequency,
                       bool &run_warmup) {
   for (int i = 1; i < argc; i++) {
     // llm model type
@@ -189,6 +190,10 @@ void parse_input_args(char **argv,
       }
       continue;
     }
+    if (!strcmp(argv[i], "--temporal-sharing-frequency")) {
+      temporal_sharing_frequency = std::stoi(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -329,6 +334,7 @@ void FlexFlow::top_level_task(Task const *task,
   int gradient_accumulation_steps = 8;
   int num_logging_steps = 10;
   int num_layers_per_finetuning_step = -1;
+  int temporal_sharing_frequency = 10;
   bool run_warmup = false;
   int num_kv_cache_slots = -1;
   int rank = 16;
@@ -356,6 +362,7 @@ void FlexFlow::top_level_task(Task const *task,
                    gradient_accumulation_steps,
                    num_logging_steps,
                    num_layers_per_finetuning_step,
+                   temporal_sharing_frequency,
                    run_warmup);
   assert(peft_finetuning_enabled(ffconfig.peft_support_mode) &&
          "Cannot train LORA adapter if finetuning is not enabled");
@@ -461,6 +468,7 @@ void FlexFlow::top_level_task(Task const *task,
       model_type, bos_token_id, eos_token_ids, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);
   rm->set_peft_support_mode(ffconfig.peft_support_mode);
+  rm->set_temporal_sharing_frequency(temporal_sharing_frequency);
   rm->set_max_lora_rank(rank);
 
   FFModel model(ffconfig, ffconfig.cpu_offload);
diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu
index e278bfb7f..545290765 100644
--- a/src/ops/kernels/softmax.cu
+++ b/src/ops/kernels/softmax.cu
@@ -342,7 +342,7 @@ void inference_kernel(SoftmaxMeta const *m,
   // fprintf(stderr, "Error in cudnnSetTensor4dDescriptor: %s\n", e.what());
   // printf("num_active_tokens: %d, num_classes: %d\n",
   //         bc->num_active_tokens(), num_classes);
-  // printf("input_ptr: %p, output_ptr: %p\n", input_ptr, output_ptr);
+  // printf("input_ptr: %p, output_ptr: %p, m->outputTensor: %p\n", input_ptr, output_ptr, m->outputTensor);
   // std::cerr << "bc: " << *bc << std::endl;
   // try {
     checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 82950d508..fc687c04c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -555,6 +555,15 @@ int RequestManager::get_num_layers_per_finetuning_step() {
   return num_layers_per_finetuning_step;
 }
 
+void RequestManager::set_temporal_sharing_frequency(
+    int temporal_sharing_frequency_) {
+  temporal_sharing_frequency = temporal_sharing_frequency_;
+}
+
+int RequestManager::get_temporal_sharing_frequency() {
+  return temporal_sharing_frequency;
+}
+
 PEFTModelID *
     FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) {
   assert(peft_enabled(config.peft_support_mode) &&
@@ -831,15 +840,19 @@ BatchConfig RequestManager::prepare_next_batch_task(
   BatchConfig new_bc = rm->prepare_next_fwd_batch(*old_bc, result);
   new_bc = rm->prepare_next_bwd_batch(new_bc);
   rm->check_new_bc(new_bc);
+  // if (rm->inference_finished) {
+  //   printf("prepare_next_batch_task finished, returning new batch\n");
+  // }
   return new_bc;
 }
 
 void RequestManager::update_peft_temporal_sharing_state(void) {
+  // int old_state = peft_temporal_sharing_state;
   assert(peft_support_mode == TEMPORAL_SHARING || 
          peft_support_mode == TEMPORAL_SHARING_LIMITED);
   if (peft_temporal_sharing_state == INFERENCE) {
     peft_temporal_sharing_inf_step++;
-    if (peft_temporal_sharing_inf_step >= 10 || peft_support_mode == TEMPORAL_SHARING) {
+    if (peft_temporal_sharing_inf_step >= get_temporal_sharing_frequency()) {
       peft_temporal_sharing_inf_step = 0;
       peft_temporal_sharing_state = FINETUNING_FWD;
     }
@@ -851,6 +864,9 @@ void RequestManager::update_peft_temporal_sharing_state(void) {
   } else {
     assert(false && "Invalid temporal sharing state");
   }
+  // if (inference_finished) {
+  //   printf("PEFT temporal sharing state changed from %d to %d with inference_finished=true\n", old_state, peft_temporal_sharing_state);
+  // }
 }
 
 bool RequestManager::is_eos_token(int token_id) {
@@ -1002,6 +1018,10 @@ void RequestManager::record_decoding_req_profiling_info(
 void RequestManager::process_inf_req_progress(BatchConfig const &old_fwd_bc,
                                               InferenceResult const &result) {
   // printf("Entering process_inf_req_progress\n");
+  if (inference_finished) {
+    assert(old_fwd_bc.num_inference_tokens() == 0);
+    return;
+  }
   for (int i = 0; i < old_fwd_bc.num_active_tokens(); i++) {
     size_t guid =
         old_fwd_bc.requestsInfo[old_fwd_bc.tokensInfo[i].request_index]
@@ -1364,30 +1384,30 @@ void RequestManager::handle_completed_finetuning_req(
     BatchConfig const &old_finetuning_bc) {
   // printf("Entering handle_completed_finetuning_req\n");
   if (!inference_finished) {
-    assert(
-        old_finetuning_bc.num_finetuning_bwd_requests() == 1 &&
-        "Number of active peft bwd requests in a finetuning batch should be 1");
+    assert(old_finetuning_bc.num_finetuning_bwd_requests() == 1 && "Finetuning request can only be finalized after a bwd pass");
+    assert(old_finetuning_bc.num_finetuning_fwd_requests() == 0 && "Finetuning request can only be finalized after a bwd pass");
+    int inference_batch_size =
+      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
+    assert(!old_finetuning_bc.request_completed[inference_batch_size] &&
+          "Finetuning request not found in new batch");
+    assert(pending_peft_request_queue.size() == 1 &&
+           "Finetuning request queue should only have one request");
   } else {
-    assert(old_finetuning_bc.num_finetuning_fwd_requests() +
-                   old_finetuning_bc.num_finetuning_bwd_requests() ==
-               1 &&
-           "Number of active peft requests should be 1");
+    if (pending_peft_request_queue.empty()) {
+      assert(old_finetuning_bc.num_finetuning_bwd_requests() == 0 && old_finetuning_bc.num_finetuning_fwd_requests() == 0);
+      return;
+    } 
   }
 
-  int inference_batch_size =
-      BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
-  assert(!old_finetuning_bc.request_completed[inference_batch_size] &&
-         "Finetuning request not found in new batch");
-
   // sync metadata with all_requests
   Request &pq_request = pending_peft_request_queue.front();
   Request &request = all_requests[pq_request.guid];
   assert(request.req_type == RequestType::REQ_FINETUNING &&
          "Found misplaced inference request");
   assert(request.guid == pq_request.guid && "Request GUID mismatch");
-  assert(old_finetuning_bc.requestsInfo[inference_batch_size].request_guid ==
-             pq_request.guid &&
-         "Request GUID mismatch");
+  // assert(old_finetuning_bc.requestsInfo[inference_batch_size].request_guid ==
+  //            pq_request.guid &&
+  //        "Request GUID mismatch");
   request.status = Request::COMPLETED;
   request.peft_finetuning_info = pq_request.peft_finetuning_info;
   // remove from pending peft queue
@@ -1610,7 +1630,15 @@ void RequestManager::process_finetuning_req_fwd_progress(
                  old_bc.num_finetuning_bwd_requests() <=
              1 &&
          "More than 1 finetuning request in the batch");
+  // if (inference_finished) {
+  //   printf("Running process_finetuning_req_fwd_progress with inference_finished=true. Num finetuning fwd tokens: %i\n",
+  //        old_bc.num_finetuning_fwd_tokens());
+  // }
   if (old_bc.num_finetuning_fwd_requests() == 0) {
+    if (inference_finished) {
+      // complete finetuning request if there is one in the pending queue
+      handle_completed_finetuning_req(old_bc);
+    }
     return;
   }
   int inference_batch_size =
@@ -1672,6 +1700,10 @@ void RequestManager::process_finetuning_req_bwd_progress(
                  old_bc.num_finetuning_bwd_requests() <=
              1 &&
          "More than 1 finetuning request in the batch");
+  // if (inference_finished) {
+  //   printf("Running process_finetuning_req_bwd_progress with inference_finished=true. Num finetuning bwd tokens: %i\n",
+  //        old_bc.num_finetuning_bwd_tokens());
+  // }
   if (old_bc.num_finetuning_bwd_requests() == 0) {
     return;
   }
@@ -1804,6 +1836,8 @@ BatchConfig RequestManager::prepare_next_bwd_batch(BatchConfig &new_bc) {
   const std::lock_guard<std::mutex> lock(request_queue_mutex);
 
   if (finetuning_bwd_work_available()) {
+    assert(!inference_finished &&
+           "Trying to add finetuning bwd request to next batch when inference_finished=true");
     add_finetuning_req_bwd_batch(new_bc);
   }
 
@@ -1826,6 +1860,14 @@ void RequestManager::add_inference_work_if_needed(BatchConfig &new_bc,
       BatchConfig::max_requests_per_batch() - (int)peft_finetuning_enabled(peft_support_mode);
   int num_concurrent_inf_adapters = 0;
 
+  if (inference_finished) {
+    assert(old_bc.num_inference_tokens() == 0 &&
+           "Old batch should not have inference tokens when inference_finished=true");
+    assert(pending_infr_request_queue.empty() &&
+           "Pending inference request queue should be empty when inference_finished=true");
+    return;
+  }
+
   // Step 2: evict any requests that will not fit in the kv cache
   evict_requests_if_needed(old_bc, inference_batch_size);
 
@@ -1890,8 +1932,14 @@ BatchConfig
     // }
     if (peft_temporal_sharing_state == INFERENCE) {
       if (peft_temporal_sharing_inf_step == 0) {
+        // if (inference_finished) {
+        //   printf("Add inference work if needed from saved old batch with inference_finished=true\n");
+        // }
         add_inference_work_if_needed(new_bc, ts_saved_old_batch);
       } else {
+        // if (inference_finished) {
+        //   printf("Add inference work if needed from immediately previous batch with inference_finished=true\n");
+        // }
         add_inference_work_if_needed(new_bc, old_bc);
       }
     } else if (peft_temporal_sharing_state == FINETUNING_FWD) {
@@ -1912,6 +1960,7 @@ BatchConfig
       get_max_fwd_finetuning_tokens_per_batch() > 0 &&
       (peft_support_mode != TEMPORAL_SHARING || peft_temporal_sharing_state == FINETUNING_FWD) &&
       slots_available_for_peft_fwd > 0) {
+    assert(!inference_finished && "Attempting to add finetuning work to new batch when inference is finished");
     add_finetuning_req_fwd_batch(new_bc);
   }
 
@@ -3948,13 +3997,16 @@ std::vector<GenerationResult>
     results.push_back(rm->get_generation_result(inf_guids[i]));
   }
   if (inf_guids.size() > 0) {
+    // printf("Inference workload finished. Stopping finetuning\n");
     rm->set_inference_finished();
   }
   // block until all PEFT requests have been processed (or get interrupted at
   // the end of the inference workload)
+  // printf("Waiting for PEFT workload to finish\n");
   for (int i = 0; i < peft_guids.size(); i++) {
     results.push_back(rm->get_generation_result(peft_guids[i]));
   }
+  // printf("PEFT workload finished\n");
   rm->save_output_to_json();
   return results;
 }