From a97f3c347a823f76465034822252125fe8271af8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 4 Oct 2024 04:49:04 +0000 Subject: [PATCH 1/8] first commit --- CMakeLists.txt | 10 ++++++++++ src/runtime/request_manager.cc | 1 + 2 files changed, 11 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e24e1e54b..c0c75f8686 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -296,6 +296,16 @@ if(NOT BUILD_LEGION_ONLY) endif() set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) + # SuffixDecoding + include(FetchContent) + FetchContent_Declare( + suffix_decoding + GIT_REPOSITORY git@github.com:Snowflake-Labs/suffix-tree-decoding.git + GIT_TAG main # or a specific tag/commit hash + ) + FetchContent_MakeAvailable(suffix_decoding) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${suffix_decoding_SOURCE_DIR}/src) + list(APPEND FLEXFLOW_SRC ${suffix_decoding_SOURCE_DIR}/src/suffix_decoding.cc) add_library(substitution_loader SHARED ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 31a32dd3c8..24fdc2e572 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -18,6 +18,7 @@ #include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" +#include "suffix_decoding.h" #include #include #include From 673184d41008e075fcaeb031595fefd30f2f79c9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 7 Oct 2024 03:50:26 +0000 Subject: [PATCH 2/8] update --- src/runtime/request_manager.cc | 71 ++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 24fdc2e572..d49a20a2f6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -3002,6 +3002,77 @@ void RequestManager::serve_spec_infer(FFModel *llm) { } } +/*static*/ +void RequestManager::serve_suffix_decoding(FFModel *llm) { + Context ctx = llm->config.lg_ctx; + Runtime *runtime = llm->config.lg_hlr; + InferenceManager *im = InferenceManager::get_inference_manager(); + { + // Compile the llm + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights(llm); + // init operators + im->init_operators_inference(llm); + } + + std::queue> + batch_pipeline; + // Legion futures for inc_decoding and spec_infer + TreeVerifyBatchConfigFuture last_tree_bcf; + InferenceResultFuture last_tree_irf; + { + // Initialize futures for spec infer + TreeVerifyBatchConfig tree_bc; + InferenceResult tree_ir; + last_tree_bcf = Future::from_value(tree_bc); + last_tree_irf = Future::from_value(tree_ir); + } + batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf)); + + while (!is_background_server_terminated()) { + + if (batch_pipeline.size() >= 4) { + // Block here to avoid launching too many batches + auto const &batch = batch_pipeline.front(); + batch.second.get_void_result(); + } + // deque finished batches + while (batch_pipeline.size() > 1) { + auto const &batch = batch_pipeline.front(); + if (batch.second.is_ready()) { + batch_pipeline.pop(); + } else { + break; + } + } + + runtime->begin_trace(ctx, 12347 /*trace_id*/); + auto const &next_batch = batch_pipeline.back(); + + BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(next_batch.first, next_batch.second, 0, ctx, runtime); + FutureMap fm = 
im->inference(ssm, 0, beam_bcf); + assert(fm.get_future_map_domain().get_volume() == 1); + BeamInferenceResultFuture beam_irf = fm.get_future(0); + beam_bcf = prepare_next_batch_beam(beam_bcf, beam_irf, ctx, runtime); + std::vector beam_bcf_vec(1); + beam_bcf_vec[0] = beam_bcf; + // Token Tree Verification + { + TreeVerifyBatchConfigFuture tree_bcf = prepare_next_batch_verify(beam_bcf, ctx, runtime); + FutureMap fm = im->inference(llm, 0, tree_bcf); + assert(fm.get_future_map_domain().get_volume() == 1); + InferenceResultFuture tree_irf = fm.get_future(0); + batch_pipeline.push(std::make_pair(tree_bcf, tree_irf)); + last_tree_bcf = tree_bcf; + last_tree_irf = tree_irf; + } + runtime->end_trace(ctx, 12347 /*trace_id*/); + } +} + void RequestManager::trigger_request_completion_future( RequestGuid const &guid) { const std::lock_guard lock(request_to_promise_mutex); From 4a3d1bd811b8494b8ac900f19bfc8a3136d44399 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 7 Oct 2024 04:03:52 +0000 Subject: [PATCH 3/8] update --- src/runtime/request_manager.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d49a20a2f6..f13277ddd1 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -3053,7 +3053,7 @@ void RequestManager::serve_suffix_decoding(FFModel *llm) { auto const &next_batch = batch_pipeline.back(); BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(next_batch.first, next_batch.second, 0, ctx, runtime); - FutureMap fm = im->inference(ssm, 0, beam_bcf); + FutureMap fm = im->suffix_decode(llm, 0, beam_bcf); assert(fm.get_future_map_domain().get_volume() == 1); BeamInferenceResultFuture beam_irf = fm.get_future(0); beam_bcf = prepare_next_batch_beam(beam_bcf, beam_irf, ctx, runtime); @@ -3061,7 +3061,7 @@ void RequestManager::serve_suffix_decoding(FFModel *llm) { beam_bcf_vec[0] = beam_bcf; // Token Tree Verification { - TreeVerifyBatchConfigFuture tree_bcf = prepare_next_batch_verify(beam_bcf, ctx, runtime); + TreeVerifyBatchConfigFuture tree_bcf = prepare_next_batch_verify(beam_bcf_vec, ctx, runtime); FutureMap fm = im->inference(llm, 0, tree_bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture tree_irf = fm.get_future(0); From d2caba835bfab2fabcb06648e59808c629d17ef6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 7 Oct 2024 19:40:17 +0000 Subject: [PATCH 4/8] update --- include/flexflow/model.h | 1 + include/flexflow/request_manager.h | 19 +++ src/mapper/mapper.cc | 1 + src/runtime/model.cc | 21 +++ src/runtime/request_manager.cc | 257 ++++++++++++++++++++++++----- 5 files changed, 259 insertions(+), 40 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4ad735ef7d..8fcee5e2f6 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -277,6 +277,7 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_INIT_TASK_ID, RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, + RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f0fab957ee..45731efe33 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -19,6 +19,7 @@ #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/utils/file_loader.h" +#include "suffix_decoding.h" #include #include #include @@ -164,6 
+165,7 @@ class RequestManager { void serve_incr_decoding(FFModel *model); void serve_spec_infer(FFModel *model); + void serve_suffix_decoding(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); RequestGuid register_new_request(Request const &request_); RequestGuid register_new_peft_request(Request const &request_); @@ -210,6 +212,15 @@ class RequestManager { Legion::Context ctx, Legion::Runtime *runtime); + TreeVerifyBatchConfig + prepare_next_batch_suffix_decode(TreeVerifyBatchConfig const &old_bc, + InferenceResult const &result); + TreeVerifyBatchConfigFuture prepare_next_batch_suffix_decode( + TreeVerifyBatchConfigFuture const &old_bc, + InferenceResultFuture const &result, + Legion::Context ctx, + Legion::Runtime *runtime); + void store_beam_metadata(BeamSearchBatchConfig const &old_bc, BeamInferenceResult const &result); void update_beam_metadata(BeamSearchBatchConfig &new_bc, @@ -280,6 +291,12 @@ class RequestManager { Legion::Context ctx, Legion::Runtime *runtime); + static TreeVerifyBatchConfig prepare_next_batch_suffix_decode_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + private: // configuration parameters int max_requests_per_batch; @@ -295,6 +312,8 @@ class RequestManager { // tree width in each speculative step, if not specified 1 std::vector spec_infer_tree_width; + SuffixTree *suffix_tree; + // private fields std::unique_ptr tokenizer_; bool verbose; diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d7b9a5e99d..d321aeb583 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -284,6 +284,7 @@ void FFMapper::select_task_options(const MapperContext ctx, (task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) || (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID) || + (task.task_id == RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID) || (task.task_id == RM_BACKGROUND_SERVING_TASK_ID)) { output.initial_proc = all_cpus[0]; return; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 52f1dd2220..4e8aebe2aa 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4822,6 +4822,27 @@ void register_flexflow_internal_tasks(Runtime *runtime, RequestManager::prepare_next_batch_verify_task>(registrar); } } + // RequestManager prepare_next_batch_suffix_decode + { + TaskVariantRegistrar registrar( + RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID, + "RequestManager Prepare Next Batch (Suffix Decode)"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + TreeVerifyBatchConfig, + RequestManager::prepare_next_batch_suffix_decode_task>( + registrar, "RequestManager Prepare Next Batch (Suffix Decode) Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant< + TreeVerifyBatchConfig, + RequestManager::prepare_next_batch_suffix_decode_task>(registrar); + } + } // RequestManager background serving task { TaskVariantRegistrar registrar(RM_BACKGROUND_SERVING_TASK_ID, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index f13277ddd1..90d060c97f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -18,7 +18,6 @@ #include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" -#include "suffix_decoding.h" #include #include 
#include @@ -97,6 +96,124 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { bool RequestManager::inference_finished = false; +DailSqlTrace load_trace_dail_sql(std::string const &trace_filepath) { + std::filesystem::path cwd = std::filesystem::current_path(); + std::ifstream file(trace_filepath); + assert(file.good() && "File does not exist or cannot be opened"); + + nlohmann::json data; + try { + file >> data; + } catch (nlohmann::json::parse_error &e) { + std::cerr << "JSON parse error: " << e.what() << std::endl; + assert(false); + } + std::cout << "finished loading json file: " << trace_filepath << std::endl; + + DailSqlTrace trace; + for (auto const &question : data["questions"]) { + trace.prompts.push_back(question["prompt"]); + trace.responses.push_back(question["response"]); + } + + assert(trace.prompts.size() == trace.responses.size()); + return trace; +} + +std::string replaceSlashes(std::string str) { + size_t pos = 0; + while ((pos = str.find('/', pos)) != std::string::npos) { + str.replace(pos, 1, "--"); + pos += 2; + } + return str; +} + +std::string getHFHome() { + if (char const *env_p = std::getenv("HF_HOME")) { + return std::string(env_p); + } else { + std::filesystem::path home = std::filesystem::path(getenv("HOME")); + return (home / ".cache" / "huggingface").string(); + } +} + +std::string get_tokenizer_base_folder(std::string const &model_name) { + // Replace '/' with '--' in the model name + std::string model_name_without_slashes = model_name; + model_name_without_slashes = replaceSlashes(model_name_without_slashes); + + // Construct the base path + std::string hf_home = getHFHome(); + + std::string base_path = + hf_home + "/hub/models--" + model_name_without_slashes + "/snapshots/"; + + // Find the first subfolder in the snapshots directory + std::string first_folder_name; + for (auto const &entry : fs::directory_iterator(base_path)) { + if (fs::is_directory(entry)) { + first_folder_name = entry.path().filename().string(); + break; + } + } + return base_path + first_folder_name; +} + +std::string get_tokenizer_path(std::string const &model_name) { + std::string base_folder = get_tokenizer_base_folder(model_name); + + if (fs::exists(base_folder + "/tokenizer.model")) { + return base_folder + "/tokenizer.model"; + } else if (fs::exists(base_folder + "/tokenizer.json")) { + return base_folder + "/tokenizer.json"; + } else { + assert(false); + } +} + +int get_bos_token_id(std::string const &model_name) { + std::string base_folder = get_tokenizer_base_folder(model_name); + std::string filename = base_folder + "/config.json"; + if (!fs::exists(filename)) { + assert(false && "config.json not found"); + return -1; + } + // Read the JSON file + std::ifstream file(filename); + if (!file.is_open()) { + assert(false && "Unable to open file"); + return -1; + } + + // Parse JSON + nlohmann::json j; + file >> j; + + // Get the bos_token_id value + if (j.contains("bos_token_id")) { + return j["bos_token_id"].get(); + } else { + assert(false && "bos_token_id not found in JSON"); + return -1; + } + return -1; +} + +auto get_tokenizer(std::string const &model_name) { + std::string tokenizer_path = get_tokenizer_path(model_name); + // if the tokenizer_path ends with the ".json" extension: + if (tokenizer_path.find("tokenizer.json") != std::string::npos) { + auto blob = LoadBytesFromFile(tokenizer_path); + return Tokenizer::FromBlobJSON(blob); + } else if (tokenizer_path.find("tokenizer.model") != std::string::npos) { + auto blob = LoadBytesFromFile(tokenizer_path); + 
return Tokenizer::FromBlobSentencePiece(blob); + } else { + assert(false); + } +} + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -110,6 +227,30 @@ RequestManager::RequestManager() max_tokens_per_batch = -1; max_spec_tree_token_num = -1; max_sequence_length = -1; + + std::string model_name = "meta-llama/Meta-Llama-3-70B"; + DailSqlTrace dail_sql_trace = + load_trace_dail_sql("/usr/suffix-tree-decoding/trace/spider.json"); + int num_prompts = dail_sql_trace.prompts.size(); + int num_responses = dail_sql_trace.responses.size(); + assert(num_prompts == num_responses); + auto tokenizer = get_tokenizer(model_name); + int bos_token_id = get_bos_token_id(model_name); + int train_size = num_prompts / 2; + std::cout << "Number of prompts: " << num_prompts << std::endl; + std::cout << "Train size: " << train_size << std::endl; + std::vector> training_dataset; + for (int i = 0; i < train_size; i++) { + std::string text = dail_sql_trace.prompts[i] + dail_sql_trace.responses[i]; + std::vector encoded = tokenizer->Encode(text); + encoded.insert(encoded.begin(), bos_token_id); + training_dataset.push_back(encoded); + } + suffix_tree = new SuffixTree(50); + for (auto const &text : training_dataset) { + suffix_tree->insert(text, suffix_tree->query_guid); + suffix_tree->query_guid++; + } } void RequestManager::set_max_requests_per_batch(int max_num_requests) { @@ -1721,6 +1862,38 @@ BeamSearchBatchConfig return new_bc; } +TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_suffix_decode( + TreeVerifyBatchConfigFuture const &old_bc, + InferenceResultFuture const &result, + Context ctx, + Runtime *runtime) { + + RequestManager *rm = this; + TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_SUFFIX_DECODE_TASK_ID, + TaskArgument(&rm, sizeof(RequestManager *))); + launcher.add_future(old_bc); + launcher.add_future(result); + return runtime->execute_task(ctx, launcher); +} + +TreeVerifyBatchConfig RequestManager::prepare_next_batch_suffix_decode_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + RequestManager *rm = *((RequestManager **)task->args); + TreeVerifyBatchConfig const &bc = + Future(task->futures[0]).get_result(); + InferenceResult const &result = + Future(task->futures[1]).get_result(); + return rm->prepare_next_batch_suffix_decode(bc, result); +} + +TreeVerifyBatchConfig RequestManager::prepare_next_batch_suffix_decode( + TreeVerifyBatchConfig const &old_bc, InferenceResult const &result) { + const std::lock_guard lock(request_queue_mutex); +} + /***** Verify Phase *****/ TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify( @@ -2802,8 +2975,15 @@ void RequestManager::background_serving_task( "###PEFT DEBUGGING### Updated models' configuration."); if (rm->get_num_ssms() == 0) { - // No SSMs: perform incremental decoding - rm->serve_incr_decoding(llm); + + char const *env_var = std::getenv("FF_SUFFIX_DECODING"); + + if (env_var != nullptr && std::string(env_var) == "1") { + rm->serve_suffix_decoding(llm); + } else { + // No SSMs: perform incremental decoding + rm->serve_incr_decoding(llm); + } } else { // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); @@ -3004,33 +3184,41 @@ void RequestManager::serve_spec_infer(FFModel *llm) { /*static*/ void RequestManager::serve_suffix_decoding(FFModel *llm) { + + // Check if the model object exists + if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object 
does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." << std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; + // Compile the llm InferenceManager *im = InferenceManager::get_inference_manager(); + im->compile_model_and_allocate_buffer(llm); + assert(im->model_weights_loaders.find(llm) != + im->model_weights_loaders.end()); + // Load model weights + im->model_weights_loaders[llm]->load_weights(llm); + // init operators + im->init_operators_inference(llm); + // Legion futures for inc_decoding and spec_infer + TreeVerifyBatchConfigFuture last_bcf; + InferenceResultFuture last_irf; { - // Compile the llm - im->compile_model_and_allocate_buffer(llm); - assert(im->model_weights_loaders.find(llm) != - im->model_weights_loaders.end()); - // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); - // init operators - im->init_operators_inference(llm); + // Initialize futures for incr decoding + TreeVerifyBatchConfig bc; + InferenceResult ir; + last_bcf = Future::from_value(bc); + last_irf = Future::from_value(ir); } std::queue> batch_pipeline; - // Legion futures for inc_decoding and spec_infer - TreeVerifyBatchConfigFuture last_tree_bcf; - InferenceResultFuture last_tree_irf; - { - // Initialize futures for spec infer - TreeVerifyBatchConfig tree_bc; - InferenceResult tree_ir; - last_tree_bcf = Future::from_value(tree_bc); - last_tree_irf = Future::from_value(tree_ir); - } - batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf)); + { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); } while (!is_background_server_terminated()) { @@ -3048,27 +3236,16 @@ void RequestManager::serve_suffix_decoding(FFModel *llm) { break; } } - runtime->begin_trace(ctx, 12347 /*trace_id*/); auto const &next_batch = batch_pipeline.back(); - - BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(next_batch.first, next_batch.second, 0, ctx, runtime); - FutureMap fm = im->suffix_decode(llm, 0, beam_bcf); + TreeVerifyBatchConfigFuture bcf = prepare_next_batch_suffix_decode( + next_batch.first, next_batch.second, ctx, runtime); + FutureMap fm = im->inference(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); - BeamInferenceResultFuture beam_irf = fm.get_future(0); - beam_bcf = prepare_next_batch_beam(beam_bcf, beam_irf, ctx, runtime); - std::vector beam_bcf_vec(1); - beam_bcf_vec[0] = beam_bcf; - // Token Tree Verification - { - TreeVerifyBatchConfigFuture tree_bcf = prepare_next_batch_verify(beam_bcf_vec, ctx, runtime); - FutureMap fm = im->inference(llm, 0, tree_bcf); - assert(fm.get_future_map_domain().get_volume() == 1); - InferenceResultFuture tree_irf = fm.get_future(0); - batch_pipeline.push(std::make_pair(tree_bcf, tree_irf)); - last_tree_bcf = tree_bcf; - last_tree_irf = tree_irf; - } + InferenceResultFuture irf = fm.get_future(0); + batch_pipeline.push(std::make_pair(bcf, irf)); + last_bcf = bcf; + last_irf = irf; runtime->end_trace(ctx, 12347 /*trace_id*/); } } From 0194ef221463705c053519792d4d4131641ea26b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Oct 2024 05:43:25 +0000 Subject: [PATCH 5/8] update --- CMakeLists.txt | 1 + inference/suffix_decoding/CMakeLists.txt | 38 +++ inference/suffix_decoding/Makefile | 37 +++ inference/suffix_decoding/suffix_decoding.cc | 129 ++++++++ inference/suffix_decoding/utils.cc | 311 +++++++++++++++++++ inference/suffix_decoding/utils.h | 65 ++++ 
src/ops/inc_multihead_self_attention.cu | 2 +- 7 files changed, 582 insertions(+), 1 deletion(-) create mode 100644 inference/suffix_decoding/CMakeLists.txt create mode 100644 inference/suffix_decoding/Makefile create mode 100644 inference/suffix_decoding/suffix_decoding.cc create mode 100644 inference/suffix_decoding/utils.cc create mode 100644 inference/suffix_decoding/utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c0c75f8686..d0a1c3a4fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -544,6 +544,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_INFERENCE) add_subdirectory(inference/spec_infer) + add_subdirectory(inference/suffix_decoding) add_subdirectory(inference/incr_decoding) add_subdirectory(inference/peft) endif() diff --git a/inference/suffix_decoding/CMakeLists.txt b/inference/suffix_decoding/CMakeLists.txt new file mode 100644 index 0000000000..c865ae04e6 --- /dev/null +++ b/inference/suffix_decoding/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_SpecInfer) +set(project_target suffix_decoding) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + suffix_decoding.cc + utils.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/inference/suffix_decoding/Makefile b/inference/suffix_decoding/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/suffix_decoding/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/suffix_decoding/suffix_decoding.cc b/inference/suffix_decoding/suffix_decoding.cc new file mode 100644 index 0000000000..d2b4aa17c7 --- /dev/null +++ b/inference/suffix_decoding/suffix_decoding.cc @@ -0,0 +1,129 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suffix_decoding/utils.h" + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + + + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + FilePaths file_paths; + ModelMeta model_metadata; + bool use_full_precision = false; + bool verbose = false; + int max_requests_per_batch = 16; + int max_tokens_per_batch = 256; + int max_sequence_length = 1024; + int max_spec_tree_token_num = 23; + int expansion_degree = 3; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + model_metadata.model_names, + use_full_precision, + verbose, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + expansion_degree); + + get_model_meta(file_paths, model_metadata, use_full_precision); + + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + GenerationConfig generationConfig; + InferenceManager *im = InferenceManager::get_inference_manager(); + RequestManager *rm = RequestManager::get_request_manager(); + init_request_manager(rm, + model_metadata, + file_paths, + max_requests_per_batch, + max_tokens_per_batch, + max_spec_tree_token_num, + max_sequence_length, + expansion_degree); + + // Create LLM model + FFModel tree_model(ffconfig, ffconfig.cpu_offload); + init_llm(tree_model, model_metadata, generationConfig, use_full_precision); + + // Create SSM models + int num_ssms = model_metadata.ssm_model_types.size(); + std::vector ssm_models; + FFConfig bm_config = ffconfig; + bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree = + 
bm_config.pipeline_parallelism_degree = 1; + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel beam_model(bm_config); + ssm_models.push_back(beam_model); + } + init_ssms(rm, ssm_models, num_ssms, model_metadata, generationConfig, use_full_precision); + + rm->start_background_server(&tree_model); + + // Register requests from prompt file + int total_num_requests = 0; + { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + std::vector requests; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_length = 128; + requests.push_back(inference_req); + total_num_requests++; + } + tree_model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/suffix_decoding/utils.cc b/inference/suffix_decoding/utils.cc new file mode 100644 index 0000000000..0f20c1e5ff --- /dev/null +++ b/inference/suffix_decoding/utils.cc @@ -0,0 +1,311 @@ +#include "suffix_decoding/utils.h" + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + ModelNames &model_names, + bool &use_full_precision, + bool &verbose, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &expansion_degree) { + for (int i = 1; i < argc; i++) { + // llm model name + if (!strcmp(argv[i], "-llm-model")) { + model_names.llm_model_name = std::string(argv[++i]); + for (char &c : model_names.llm_model_name) { + c = std::tolower(c); + } + continue; + } + // ssm models names + if (!strcmp(argv[i], "-ssm-model")) { + std::string ssm_model_name = std::string(argv[++i]); + for (char &c : ssm_model_name) { + c = std::tolower(c); + } + model_names.ssm_model_names.push_back(ssm_model_name); + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--expansion-degree")) { + expansion_degree = std::stoi(argv[++i]); + continue; + } + } + if 
(paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void get_model_meta(FilePaths &file_paths, + ModelMeta &model_metadata, + bool use_full_precision) { + if (model_metadata.model_names.llm_model_name.empty() || + model_metadata.model_names.ssm_model_names.size() == 0) { + assert(false && "SpecInfer needs at least one LLM and one SSM for " + "speculative inference"); + } + model_metadata.llm_model_config_path = + join_path({file_paths.cache_folder_path, + "configs", + model_metadata.model_names.llm_model_name, + "config.json"}); + model_metadata.llm_tokenizer_path = + join_path({file_paths.cache_folder_path, + "tokenizers", + model_metadata.model_names.llm_model_name}); + model_metadata.llm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + model_metadata.model_names.llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path); + if (!llm_config_file_handle.good()) { + std::cout << "LLM Model config file " + << model_metadata.llm_model_config_path << " not found." + << std::endl; + assert(false); + } + json llm_model_config = json::parse(llm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + model_metadata.llm_model_type = ModelType::UNKNOWN; + auto architectures = llm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_metadata.llm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_metadata.llm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_metadata.llm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + model_metadata.llm_model_type = ModelType::MPT; + break; + } + } + model_metadata.bos_token_id = + llm_model_config.find("bos_token_id") == llm_model_config.end() + ? -1 + : (int)llm_model_config.at("bos_token_id"); + model_metadata.eos_token_id = + llm_model_config.find("eos_token_id") == llm_model_config.end() + ? -1 + : (int)llm_model_config.at("eos_token_id"); + + for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { + std::string ssm_config_path = join_path({file_paths.cache_folder_path, + "configs", + ssm_model_name, + "config.json"}); + std::string ssm_tokenizer_path = + join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name}); + std::string ssm_weights_path = + join_path({file_paths.cache_folder_path, + "weights", + ssm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + + std::ifstream ssm_config_file_handle(ssm_config_path); + if (!ssm_config_file_handle.good()) { + std::cout << "SSM Model config file " << ssm_config_path << " not found." 
+ << std::endl; + assert(false); + } + json ssm_model_config = json::parse(ssm_config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + ModelType ssm_model_type = ModelType::UNKNOWN; + auto architectures = ssm_model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + ssm_model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + ssm_model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM") { + ssm_model_type = ModelType::FALCON; + break; + } else if (str == "MPTForCausalLM") { + ssm_model_type = ModelType::MPT; + break; + } + } + int ssm_bos_id = + ssm_model_config.find("bos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("bos_token_id"); + int ssm_eos_id = + ssm_model_config.find("eos_token_id") == ssm_model_config.end() + ? -1 + : (int)ssm_model_config.at("eos_token_id"); + if (ssm_bos_id != model_metadata.bos_token_id || + ssm_eos_id != model_metadata.eos_token_id) { + printf("Warning: bos/eos token id mismatch between LLM and one of the " + "SSMs!\n"); + } + model_metadata.ssm_model_types.push_back(ssm_model_type); + model_metadata.ssm_model_config_paths.push_back(ssm_config_path); + model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); + } + + assert(model_metadata.llm_model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + for (auto mt : model_metadata.ssm_model_types) { + if (mt == ModelType::UNKNOWN) { + assert(false && "One of the SSM model types passed is invalid."); + } + } +} + +void init_request_manager(RequestManager *rm, ModelMeta &model_metadata, + FilePaths &file_paths, int max_requests_per_batch, + int max_tokens_per_batch, int max_spec_tree_token_num, int max_sequence_length, + int expansion_degree) { + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_spec_tree_token_num(max_spec_tree_token_num); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer(model_metadata.llm_model_type, + model_metadata.bos_token_id, + model_metadata.eos_token_id, + model_metadata.llm_tokenizer_path); + rm->register_output_filepath(file_paths.output_file_path); + + // first decoding step: 3 results + if (expansion_degree != -1) { + rm->push_spec_infer_tree_width(1); + rm->push_spec_infer_tree_width(1); + rm->push_spec_infer_tree_width(expansion_degree); + } +} + +void init_llm(FFModel &tree_model, ModelMeta &model_metadata, + GenerationConfig &generationConfig, bool use_full_precision) { + if (model_metadata.llm_model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + generationConfig, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::OPT) { + OPT::create_opt_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::FALCON) { + FALCON::create_falcon_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + TREE_VERIFY_MODE, + use_full_precision); + } else if (model_metadata.llm_model_type == ModelType::MPT) { + MPT::create_mpt_model(tree_model, + model_metadata.llm_model_config_path, + model_metadata.llm_weights_path, + 
TREE_VERIFY_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "Invalid LLM model type passed (or no type was passed)."); + } +} + +void init_ssms(RequestManager *rm, std::vector &ssm_models, int num_ssms, + ModelMeta &model_metadata, GenerationConfig &generationConfig, + bool use_full_precision) { + for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { + FFModel &beam_model = ssm_models[ssm_id]; + if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) { + LLAMA::create_llama_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + generationConfig, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) { + OPT::create_opt_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) { + FALCON::create_falcon_model( + beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + use_full_precision); + } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) { + MPT::create_mpt_model(beam_model, + model_metadata.ssm_model_config_paths[ssm_id], + model_metadata.ssm_model_weights_paths[ssm_id], + BEAM_SEARCH_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "Invalid SSM model type passed."); + } + + rm->register_ssm_model(&beam_model); + } +} \ No newline at end of file diff --git a/inference/suffix_decoding/utils.h b/inference/suffix_decoding/utils.h new file mode 100644 index 0000000000..f4f1f90216 --- /dev/null +++ b/inference/suffix_decoding/utils.h @@ -0,0 +1,65 @@ +#include "flexflow/inference.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include +#include +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +struct ModelNames { + std::string llm_model_name; + std::vector ssm_model_names; +}; + +struct ModelMeta { + ModelNames model_names; + + ModelType llm_model_type; + std::string llm_tokenizer_path; + std::string llm_weights_path; + std::string llm_model_config_path; + + int bos_token_id, eos_token_id; + + std::vector ssm_model_types; + std::vector ssm_model_config_paths; + std::vector ssm_model_weights_paths; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + ModelNames &model_names, + bool &use_full_precision, + bool &verbose, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &expansion_degree); + +void get_model_meta(FilePaths &file_paths, + ModelMeta &model_metadata, + bool use_full_precision); + +void init_request_manager(RequestManager *rm, ModelMeta &model_metadata, + FilePaths &file_paths, int max_requests_per_batch, + int max_tokens_per_batch, int max_spec_tree_token_num, + int max_sequence_length, int expansion_degree); + +void init_llm(FFModel &tree_model, ModelMeta &model_metadata, + GenerationConfig &generationConfig, bool use_full_precision); + +void init_ssms(RequestManager *rm, std::vector &ssm_models, int num_ssms, + ModelMeta &model_metadata, GenerationConfig &generationConfig, + bool use_full_precision); \ No newline at end of file diff --git 
a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 2802dd41b6..454926bcdb 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -126,7 +126,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { From e5d05d7b45f3248d70f25ad1f75cdd24d7fea2f7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Oct 2024 06:52:03 +0000 Subject: [PATCH 6/8] update --- inference/suffix_decoding/suffix_decoding.cc | 57 +++++++++++++----- inference/suffix_decoding/utils.cc | 63 ++++++++++++++++++++ inference/suffix_decoding/utils.h | 7 ++- 3 files changed, 110 insertions(+), 17 deletions(-) diff --git a/inference/suffix_decoding/suffix_decoding.cc b/inference/suffix_decoding/suffix_decoding.cc index d2b4aa17c7..49dd819b4a 100644 --- a/inference/suffix_decoding/suffix_decoding.cc +++ b/inference/suffix_decoding/suffix_decoding.cc @@ -22,6 +22,8 @@ using json = nlohmann::json; Legion::Logger log_app("llama"); +void process_partition(RequestManager *rm, std::string input_filename) { +} void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, @@ -30,13 +32,14 @@ void FlexFlow::top_level_task(Task const *task, FFConfig ffconfig; FilePaths file_paths; ModelMeta model_metadata; + std::string partition_name; bool use_full_precision = false; bool verbose = false; int max_requests_per_batch = 16; int max_tokens_per_batch = 256; int max_sequence_length = 1024; int max_spec_tree_token_num = 23; - int expansion_degree = 3; + int expansion_degree = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -45,6 +48,7 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, model_metadata.model_names, + partition_name, use_full_precision, verbose, max_requests_per_batch, @@ -57,6 +61,10 @@ void FlexFlow::top_level_task(Task const *task, assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); + + json trace = load_trace(file_paths.prompt_file_path); + json training_entries = get_training_entries(trace, partition_name); + json eval_entries = get_eval_entries(trace, partition_name); GenerationConfig generationConfig; InferenceManager *im = InferenceManager::get_inference_manager(); @@ -88,30 +96,47 @@ void FlexFlow::top_level_task(Task const *task, rm->start_background_server(&tree_model); - // Register requests from prompt file int total_num_requests = 0; { - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - std::vector requests; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + for (auto entry: eval_entries) { + std::string prompt = entry["prompt"]; + int response_length = entry["response_length"]; + // 
printf("Prompt[%d]: %s\n", total_num_requests, prompt.c_str()); // Add inference request Request inference_req; - inference_req.prompt = text; - inference_req.max_length = 128; + inference_req.prompt = prompt; + inference_req.max_new_tokens = response_length; requests.push_back(inference_req); total_num_requests++; } tree_model.generate(requests); - } + } + + // Register requests from prompt file + // int total_num_requests = 0; + // { + // using json = nlohmann::json; + // std::ifstream file_handle(file_paths.prompt_file_path); + // assert(file_handle.good() && "Prompt file does not exist."); + // json prompt_json = json::parse(file_handle, + // /*parser_callback_t */ nullptr, + // /*allow_exceptions */ true, + // /*ignore_comments */ true); + + // std::vector requests; + // for (auto &prompt : prompt_json) { + // std::string text = prompt.get(); + // printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // // Add inference request + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_length = 128; + // requests.push_back(inference_req); + // total_num_requests++; + // } + // tree_model.generate(requests); + // } // terminate the request manager by stopping the background thread rm->terminate_background_server(); diff --git a/inference/suffix_decoding/utils.cc b/inference/suffix_decoding/utils.cc index 0f20c1e5ff..f71aac2c26 100644 --- a/inference/suffix_decoding/utils.cc +++ b/inference/suffix_decoding/utils.cc @@ -8,6 +8,7 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, ModelNames &model_names, + std::string &partition_name, bool &use_full_precision, bool &verbose, int &max_requests_per_batch, @@ -37,6 +38,11 @@ void parse_input_args(char **argv, paths.cache_folder_path = std::string(argv[++i]); continue; } + // cache folder + if (!strcmp(argv[i], "-partition-name")) { + partition_name = std::string(argv[++i]); + continue; + } // prompts if (!strcmp(argv[i], "-prompt")) { paths.prompt_file_path = std::string(argv[++i]); @@ -308,4 +314,61 @@ void init_ssms(RequestManager *rm, std::vector &ssm_models, int num_ssm rm->register_ssm_model(&beam_model); } +} + +json load_trace(std::string input_filename) { + std::cout << "Loading input file: " << input_filename << std::endl; + std::ifstream file(input_filename); + if (!file.is_open()) { + std::cerr << "Error opening file: " << input_filename << std::endl; + return nullptr; + } + + try { + json data = json::parse(file); + + // Print metadata + const auto& metadata = data["metadata"]; + std::cout << "Metadata:" << std::endl; + std::cout << "Average entries per partition: " << metadata["avg_entries_per_partition"] << std::endl; + std::cout << "Max prompt length: " << metadata["max_prompt_length"] << std::endl; + std::cout << "Min prompt length: " << metadata["min_prompt_length"] << std::endl; + std::cout << "Avg prompt length: " << metadata["avg_prompt_length"] << std::endl; + std::cout << "Max response length: " << metadata["max_response_length"] << std::endl; + std::cout << "Min response length: " << metadata["min_response_length"] << std::endl; + std::cout << "Avg response length: " << metadata["avg_response_length"] << std::endl; + // Print list of partition names + const auto& partitions = data["partitions"]; + std::cout << "Partitions:" << std::endl; + int counter = 0; + for (const auto& partition : partitions) { + std::cout << counter++ << ". 
" << partition["name"] << std::endl; + } + } + catch (json::parse_error& e) { + std::cerr << "JSON parse error: " << e.what() << std::endl; + return nullptr; + } +} + +json get_training_entries(json data, std::string partition_name) { + const auto& partitions = data["partitions"]; + for (const auto& partition : partitions) { + if (partition["name"] == partition_name) { + return partition["training_entries"]; + } + } + std::cerr << "Partition not found: " << partition_name << std::endl; + return 1; +} + +json get_eval_entries(json data, std::string partition_name) { + const auto& partitions = data["partitions"]; + for (const auto& partition : partitions) { + if (partition["name"] == partition_name) { + return partition["eval_entries"]; + } + } + std::cerr << "Partition not found: " << partition_name << std::endl; + return 1; } \ No newline at end of file diff --git a/inference/suffix_decoding/utils.h b/inference/suffix_decoding/utils.h index f4f1f90216..7b56c06952 100644 --- a/inference/suffix_decoding/utils.h +++ b/inference/suffix_decoding/utils.h @@ -41,6 +41,7 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, ModelNames &model_names, + std::string &partition_name, bool &use_full_precision, bool &verbose, int &max_requests_per_batch, @@ -62,4 +63,8 @@ void init_llm(FFModel &tree_model, ModelMeta &model_metadata, void init_ssms(RequestManager *rm, std::vector &ssm_models, int num_ssms, ModelMeta &model_metadata, GenerationConfig &generationConfig, - bool use_full_precision); \ No newline at end of file + bool use_full_precision); + +json load_trace(std::string filename); +json get_training_entries(json data, std::string partition_name); +json get_eval_entries(json data, std::string partition_name); \ No newline at end of file From 6f4249fd618b535d31df907c821294ef28d8e8c7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Oct 2024 06:59:09 +0000 Subject: [PATCH 7/8] add script --- run_suffix_decoding.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 run_suffix_decoding.sh diff --git a/run_suffix_decoding.sh b/run_suffix_decoding.sh new file mode 100644 index 0000000000..e124f5e4c0 --- /dev/null +++ b/run_suffix_decoding.sh @@ -0,0 +1,20 @@ +#! 
/usr/bin/env bash +set -e +set -x + +# Cd into parent directory of folder holding this script +cd "${BASH_SOURCE[0]%/*}/build" + +# Download models +python ../inference/utils/download_hf_model.py --half-precision-only meta-llama/Meta-Llama-3-8B Felladrin/Llama-160M-Chat-v1 + +./suffix_decoding/suffix_decoding \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ + -ll:fsize 20000 -ll:zsize 30000 \ + -llm-model meta-llama/Meta-Llama-3-8B \ + -ssm-model Felladrin/Llama-160M-Chat-v1 \ + -partition-name "" \ + -prompt ../../suffix_decoding/trace/spider_v2.json \ + -output-file ../inference/output/spider_v2.out + From 08d15178720fe002a9c2516cc6b19d8cf303b6b3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 11 Oct 2024 08:15:33 +0000 Subject: [PATCH 8/8] fixes --- inference/suffix_decoding/utils.cc | 7 +-- run_suffix_decoding.sh | 5 +- src/c/flexflow_c.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cc | 1 + src/runtime/request_manager.cc | 48 ++++++++++---------- 5 files changed, 34 insertions(+), 29 deletions(-) mode change 100644 => 100755 run_suffix_decoding.sh diff --git a/inference/suffix_decoding/utils.cc b/inference/suffix_decoding/utils.cc index f71aac2c26..1c8fd930ff 100644 --- a/inference/suffix_decoding/utils.cc +++ b/inference/suffix_decoding/utils.cc @@ -342,8 +342,9 @@ json load_trace(std::string input_filename) { std::cout << "Partitions:" << std::endl; int counter = 0; for (const auto& partition : partitions) { - std::cout << counter++ << ". " << partition["name"] << std::endl; + std::cout << counter++ << ". " << partition["partition_name"] << std::endl; } + return data; } catch (json::parse_error& e) { std::cerr << "JSON parse error: " << e.what() << std::endl; @@ -354,7 +355,7 @@ json load_trace(std::string input_filename) { json get_training_entries(json data, std::string partition_name) { const auto& partitions = data["partitions"]; for (const auto& partition : partitions) { - if (partition["name"] == partition_name) { + if (partition["partition_name"] == partition_name) { return partition["training_entries"]; } } @@ -365,7 +366,7 @@ json get_training_entries(json data, std::string partition_name) { json get_eval_entries(json data, std::string partition_name) { const auto& partitions = data["partitions"]; for (const auto& partition : partitions) { - if (partition["name"] == partition_name) { + if (partition["partition_name"] == partition_name) { return partition["eval_entries"]; } } diff --git a/run_suffix_decoding.sh b/run_suffix_decoding.sh old mode 100644 new mode 100755 index e124f5e4c0..69e2105254 --- a/run_suffix_decoding.sh +++ b/run_suffix_decoding.sh @@ -7,14 +7,15 @@ cd "${BASH_SOURCE[0]%/*}/build" # Download models python ../inference/utils/download_hf_model.py --half-precision-only meta-llama/Meta-Llama-3-8B Felladrin/Llama-160M-Chat-v1 +export RUST_BACKTRACE=1 -./suffix_decoding/suffix_decoding \ +gdb -ex run -ex bt --args ./inference/suffix_decoding/suffix_decoding \ -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ -tensor-parallelism-degree 4 \ -ll:fsize 20000 -ll:zsize 30000 \ -llm-model meta-llama/Meta-Llama-3-8B \ -ssm-model Felladrin/Llama-160M-Chat-v1 \ -partition-name "" \ - -prompt ../../suffix_decoding/trace/spider_v2.json \ + -prompt ../../suffix-tree-decoding/trace/spider_v2.json \ -output-file ../inference/output/spider_v2.out diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index bfa60a6d54..980ecd356b 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1730,7 +1730,7 @@ void flexflow_model_generate(flexflow_model_t 
handle_, handle, dataset_fp.c_str(), max_lengths[i], - max_new_tokens[i], + max_new_tokens_[i], training_steps[i]); } else { assert(false && "Unknown request type"); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index ae0795ac1e..a28c0869cc 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -163,6 +163,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); + li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index faebaa83cc..ad4dd7c11b 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -229,29 +229,29 @@ RequestManager::RequestManager() max_spec_tree_token_num = -1; max_sequence_length = -1; - std::string model_name = "meta-llama/Meta-Llama-3-70B"; - DailSqlTrace dail_sql_trace = - load_trace_dail_sql("/usr/suffix-tree-decoding/trace/spider.json"); - int num_prompts = dail_sql_trace.prompts.size(); - int num_responses = dail_sql_trace.responses.size(); - assert(num_prompts == num_responses); - auto tokenizer = get_tokenizer(model_name); - int bos_token_id = get_bos_token_id(model_name); - int train_size = num_prompts / 2; - std::cout << "Number of prompts: " << num_prompts << std::endl; - std::cout << "Train size: " << train_size << std::endl; - std::vector> training_dataset; - for (int i = 0; i < train_size; i++) { - std::string text = dail_sql_trace.prompts[i] + dail_sql_trace.responses[i]; - std::vector encoded = tokenizer->Encode(text); - encoded.insert(encoded.begin(), bos_token_id); - training_dataset.push_back(encoded); - } - suffix_tree = new SuffixTree(50); - for (auto const &text : training_dataset) { - suffix_tree->insert(text, suffix_tree->query_guid); - suffix_tree->query_guid++; - } + // std::string model_name = "meta-llama/Meta-Llama-3-70B"; + // DailSqlTrace dail_sql_trace = + // load_trace_dail_sql("/usr/suffix-tree-decoding/trace/spider.json"); + // int num_prompts = dail_sql_trace.prompts.size(); + // int num_responses = dail_sql_trace.responses.size(); + // assert(num_prompts == num_responses); + // auto tokenizer = get_tokenizer(model_name); + // int bos_token_id = get_bos_token_id(model_name); + // int train_size = num_prompts / 2; + // std::cout << "Number of prompts: " << num_prompts << std::endl; + // std::cout << "Train size: " << train_size << std::endl; + // std::vector> training_dataset; + // for (int i = 0; i < train_size; i++) { + // std::string text = dail_sql_trace.prompts[i] + dail_sql_trace.responses[i]; + // std::vector encoded = tokenizer->Encode(text); + // encoded.insert(encoded.begin(), bos_token_id); + // training_dataset.push_back(encoded); + // } + // suffix_tree = new SuffixTree(50); + // for (auto const &text : training_dataset) { + // suffix_tree->insert(text, suffix_tree->query_guid); + // suffix_tree->query_guid++; + // } } void RequestManager::set_max_requests_per_batch(int max_num_requests) { @@ -348,6 +348,8 @@ void RequestManager::register_tokenizer(ModelType type, << std::endl; assert(false); } + std::cout << "Loading tokenizer from: " << tokenizer_json_path + << std::endl; 
this->tokenizer_ = Tokenizer::FromBlobJSON( LoadBytesFromFile(tokenizer_json_path.string())); }