From 470a40fcb974050ee656571fb15373908e76fb51 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 02:19:19 +0000 Subject: [PATCH 01/37] init --- docker/flexflow-environment/Dockerfile | 1 + docker/run.sh | 7 +- inference/python/entrypoint/fastapi_incr.py | 24 ++-- inference/python/streamlit/README.md | 0 inference/python/streamlit/app.py | 122 ++++++++++++++++++++ python/flexflow/core/flexflow_cffi.py | 4 +- python/flexflow/serve/serve.py | 6 +- 7 files changed, 149 insertions(+), 15 deletions(-) create mode 100644 inference/python/streamlit/README.md create mode 100644 inference/python/streamlit/app.py diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index ee13a07375..4f41482ee5 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -112,6 +112,7 @@ RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook # PEFT-related RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft +RUN pip3 install streamlit # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/docker/run.sh b/docker/run.sh index cdf9383052..3e7417a3cc 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -17,6 +17,11 @@ hip_version=${hip_version:-"empty"} ATTACH_GPUS=${ATTACH_GPUS:-true} gpu_arg="" if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi +FORWARD_STREAMLIT_PORT=${FORWARD_STREAMLIT_PORT:-true} +port_forward_arg="" +if $FORWARD_STREAMLIT_PORT ; then + port_forward_arg+="-p 8501:8501" +fi # Amount of shared memory to give the Docker container access to @@ -120,4 +125,4 @@ if [ -f "$hf_token_path" ]; then hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${port_forward_arg}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" diff --git a/inference/python/entrypoint/fastapi_incr.py b/inference/python/entrypoint/fastapi_incr.py index 34f61739fb..f2830e6e5e 100644 --- a/inference/python/entrypoint/fastapi_incr.py +++ b/inference/python/entrypoint/fastapi_incr.py @@ -60,28 +60,32 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 2, - "memory_per_gpu": 14000, + "num_gpus": 4, + "memory_per_gpu": 20000, "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 1, - "pipeline_parallelism_degree": 2, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } llm_configs = { # required parameters - "llm_model": "tiiuae/falcon-7b", + "llm_model": "meta-llama/Meta-Llama-3.1-8B", # optional parameters - "cache_path": "", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": False, "prompt": "", @@ -102,7 +106,9 @@ async def startup_event(): configs = SimpleNamespace(**configs_dict) ff.init(configs_dict) - 
ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) llm = ff.LLM( configs.llm_model, data_type=ff_data_type, @@ -117,7 +123,7 @@ async def startup_event(): llm.compile( generation_config, max_requests_per_batch=1, - max_seq_length=256, + max_seq_length=2048, max_tokens_per_batch=64, ) llm.start_server() diff --git a/inference/python/streamlit/README.md b/inference/python/streamlit/README.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py new file mode 100644 index 0000000000..564a9b6c5a --- /dev/null +++ b/inference/python/streamlit/app.py @@ -0,0 +1,122 @@ +import streamlit as st +import requests +import os +from huggingface_hub import model_info + + +# App title +st.set_page_config(page_title="🦙💬 FlexLLM Llama Server") + +# FastAPI server URL +FASTAPI_URL = "http://localhost:8000/generate/" # Adjust the port if necessary + +# Initialize session state variables +if 'added_adapters' not in st.session_state: + st.session_state.added_adapters = [] + +def check_model_availability(model_name): + try: + info = model_info(model_name) + return True + except Exception: + return False + +# Store LLM generated responses +if "messages" not in st.session_state.keys(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + +# Display or clear chat messages +for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.write(message["content"]) + +def clear_chat_history(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + + +# App title and description +with st.sidebar: + st.title('🦙💬 FlexLLM Llama Server') + # st.success('Using local FastAPI server', icon='✅') + st.sidebar.button('Clear Chat History', on_click=clear_chat_history) + + st.subheader('Generation parameters') + max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8) + # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') + decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') + temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + top_p = st.sidebar.slider('top_p', min_value=0.01, max_value=1.0, value=0.9, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + + # lora_adapter = st.sidebar.text_input('Lora adapter', placeholder='None') + st.subheader("LoRA Adapters (optional)") + # Text input for PEFT model ID + peft_id = st.text_input("Add a LoRA Adapter", placeholder="Enter the Huggingface PEFT model ID") + # Button to load the adapter + if st.button("Load Adapter"): + if peft_id: + with st.spinner("Checking PEFT availability..."): + is_available = check_model_availability(peft_id) + if is_available: + if peft_id not in st.session_state.added_adapters: + st.session_state.added_adapters.append(peft_id) + st.success(f"Successfully added PEFT: {peft_id}") + else: + st.warning(f"PEFT {peft_id} is already in the list.") + else: + st.error(f"PEFT {peft_id} is not available on Hugging Face. 
Please check the ID and try again.") + else: + st.warning("Please enter a PEFT Model ID.") + # Button to remove all adapters + if st.button("Remove All Adapters"): + st.session_state.added_adapters = [] + st.success("All adapters have been removed.") + # Display the list of added adapters + st.markdown("**Added Adapters:**") + if st.session_state.added_adapters: + for adapter in st.session_state.added_adapters: + st.write(f"- {adapter}") + else: + st.write("No adapters added yet.") + + # st.markdown('📖 Learn how to build this app in this [blog](https://blog.streamlit.io/how-to-build-a-llama-2-chatbot/)!') + + + +# Function for generating LLaMA2 response +def generate_llama2_response(prompt_input): + string_dialogue = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'." + for dict_message in st.session_state.messages: + if dict_message["role"] == "user": + string_dialogue += "User: " + dict_message["content"] + "\n\n" + else: + string_dialogue += "Assistant: " + dict_message["content"] + "\n\n" + + full_prompt = f"{string_dialogue} {prompt_input} Assistant: " + + # Send request to FastAPI server + response = requests.post(FASTAPI_URL, json={"prompt": full_prompt}) + + if response.status_code == 200: + return response.json()["response"] + else: + return f"Error: {response.status_code} - {response.text}" + +# User-provided prompt +if prompt := st.chat_input(): + st.session_state.messages.append({"role": "user", "content": prompt}) + with st.chat_message("user"): + st.write(prompt) + +# Generate a new response if last message is not from assistant +if st.session_state.messages[-1]["role"] != "assistant": + with st.chat_message("assistant"): + with st.spinner("Thinking..."): + response = generate_llama2_response(prompt) + placeholder = st.empty() + full_response = '' + for item in response: + full_response += item + placeholder.markdown(full_response) + placeholder.markdown(full_response) + message = {"role": "assistant", "content": full_response} + st.session_state.messages.append(message) \ No newline at end of file diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 7692ccb88f..d065398f87 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2057,7 +2057,7 @@ def __init__( self, req_type: RequestType, prompt: str = None, - max_sequence_length: int = 128, + max_sequence_length: int = 2048, peft_model_id: PEFTModelID = None, dataset_filepath: str = None, max_training_steps: int = 1, @@ -4665,7 +4665,7 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 128): + def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2048): assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] max_num_chars = 5 * (max_sequence_length + 100) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 132c50995b..988789bab4 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -498,7 +498,7 @@ def compile( def generate( self, requests_or_prompts: Union[str, List[str], Request, List[Request]], - max_length: int = 128, + max_length: int = 2048, ): """Generate tokens based on the input prompt(s) @@ -568,7 +568,7 @@ def compile( generation_config: GenerationConfig = GenerationConfig(), max_requests_per_batch: int = 16, 
max_seq_length: int = 256, - max_tokens_per_batch: int = 128, + max_tokens_per_batch: int = 2048, enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, @@ -582,7 +582,7 @@ def compile( :type max_requests_per_batch: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 :type max_seq_length: int, optional - :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 + :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 2048 :type max_tokens_per_batch: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional From 7f23188772c5a32fa0e5586673d9f666f8cd5190 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 03:10:11 +0000 Subject: [PATCH 02/37] update --- inference/python/streamlit/app.py | 204 ++++++++++++++++++------------ 1 file changed, 126 insertions(+), 78 deletions(-) diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py index 564a9b6c5a..c264930e7d 100644 --- a/inference/python/streamlit/app.py +++ b/inference/python/streamlit/app.py @@ -1,19 +1,24 @@ import streamlit as st import requests -import os +import os, json from huggingface_hub import model_info # App title -st.set_page_config(page_title="🦙💬 FlexLLM Llama Server") +st.set_page_config(page_title="🚀💻 FlexLLM Server", layout="wide") # FastAPI server URL FASTAPI_URL = "http://localhost:8000/generate/" # Adjust the port if necessary +FINETUNE_URL = "http://localhost:8000/finetuning" # Initialize session state variables if 'added_adapters' not in st.session_state: st.session_state.added_adapters = [] +# Store LLM generated responses +if "messages" not in st.session_state.keys(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + def check_model_availability(model_name): try: info = model_info(model_name) @@ -21,67 +26,9 @@ def check_model_availability(model_name): except Exception: return False -# Store LLM generated responses -if "messages" not in st.session_state.keys(): - st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] - -# Display or clear chat messages -for message in st.session_state.messages: - with st.chat_message(message["role"]): - st.write(message["content"]) - def clear_chat_history(): st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] - -# App title and description -with st.sidebar: - st.title('🦙💬 FlexLLM Llama Server') - # st.success('Using local FastAPI server', icon='✅') - st.sidebar.button('Clear Chat History', on_click=clear_chat_history) - - st.subheader('Generation parameters') - max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8) - # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') - decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') - temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') - top_p = st.sidebar.slider('top_p', min_value=0.01, max_value=1.0, value=0.9, step=0.01, 
disabled=decoding_method == 'Greedy decoding (default)') - - # lora_adapter = st.sidebar.text_input('Lora adapter', placeholder='None') - st.subheader("LoRA Adapters (optional)") - # Text input for PEFT model ID - peft_id = st.text_input("Add a LoRA Adapter", placeholder="Enter the Huggingface PEFT model ID") - # Button to load the adapter - if st.button("Load Adapter"): - if peft_id: - with st.spinner("Checking PEFT availability..."): - is_available = check_model_availability(peft_id) - if is_available: - if peft_id not in st.session_state.added_adapters: - st.session_state.added_adapters.append(peft_id) - st.success(f"Successfully added PEFT: {peft_id}") - else: - st.warning(f"PEFT {peft_id} is already in the list.") - else: - st.error(f"PEFT {peft_id} is not available on Hugging Face. Please check the ID and try again.") - else: - st.warning("Please enter a PEFT Model ID.") - # Button to remove all adapters - if st.button("Remove All Adapters"): - st.session_state.added_adapters = [] - st.success("All adapters have been removed.") - # Display the list of added adapters - st.markdown("**Added Adapters:**") - if st.session_state.added_adapters: - for adapter in st.session_state.added_adapters: - st.write(f"- {adapter}") - else: - st.write("No adapters added yet.") - - # st.markdown('📖 Learn how to build this app in this [blog](https://blog.streamlit.io/how-to-build-a-llama-2-chatbot/)!') - - - # Function for generating LLaMA2 response def generate_llama2_response(prompt_input): string_dialogue = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'." @@ -101,22 +48,123 @@ def generate_llama2_response(prompt_input): else: return f"Error: {response.status_code} - {response.text}" -# User-provided prompt -if prompt := st.chat_input(): - st.session_state.messages.append({"role": "user", "content": prompt}) - with st.chat_message("user"): - st.write(prompt) - -# Generate a new response if last message is not from assistant -if st.session_state.messages[-1]["role"] != "assistant": - with st.chat_message("assistant"): - with st.spinner("Thinking..."): - response = generate_llama2_response(prompt) - placeholder = st.empty() - full_response = '' - for item in response: - full_response += item +# Sidebar +with st.sidebar: + st.title('🚀 FlexLLM Server') + page = st.radio("Choose a page", ["Chat", "Finetune"]) + if page == "Chat": + st.header('🦙 Llama Chatbot') + # st.success('Using local FastAPI server', icon='✅') + st.sidebar.button('Clear Chat History', on_click=clear_chat_history) + + st.subheader('Generation parameters') + max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8) + # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') + decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') + temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + top_p = st.sidebar.slider('top_p', min_value=0.01, max_value=1.0, value=0.9, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + + # lora_adapter = st.sidebar.text_input('Lora adapter', placeholder='None') + st.subheader("LoRA Adapters (optional)") + # Text input for PEFT model ID + peft_id = st.text_input("Add a LoRA Adapter", placeholder="Enter the Huggingface PEFT model ID") 
+ # Button to load the adapter + if st.button("Load Adapter"): + if peft_id: + with st.spinner("Checking PEFT availability..."): + is_available = check_model_availability(peft_id) + if is_available: + if peft_id not in st.session_state.added_adapters: + st.session_state.added_adapters.append(peft_id) + st.success(f"Successfully added PEFT: {peft_id}") + else: + st.warning(f"PEFT {peft_id} is already in the list.") + else: + st.error(f"PEFT {peft_id} is not available on Hugging Face. Please check the ID and try again.") + else: + st.warning("Please enter a PEFT Model ID.") + # Button to remove all adapters + if st.button("Remove All Adapters"): + st.session_state.added_adapters = [] + st.success("All adapters have been removed.") + # Display the list of added adapters + st.markdown("**Added Adapters:**") + if st.session_state.added_adapters: + for adapter in st.session_state.added_adapters: + st.write(f"- {adapter}") + else: + st.write("No adapters added yet.") + # st.markdown('📖 Learn how to build this app in this [blog](https://blog.streamlit.io/how-to-build-a-llama-2-chatbot/)!') + elif page == "Finetune": + st.header("🏋️‍♂️ LoRA Finetuning") + + # Hugging Face token input + hf_token = st.text_input("Enter your Hugging Face token:", type="password") + + # Dataset selection + dataset_option = st.radio("Choose dataset source:", ["Upload JSON", "Hugging Face Dataset"]) + + if dataset_option == "Upload JSON": + uploaded_file = st.file_uploader("Upload JSON dataset", type="json") + if uploaded_file is not None: + dataset = json.load(uploaded_file) + st.success("Dataset uploaded successfully!") + else: + dataset_name = st.text_input("Enter Hugging Face dataset name:") + + # Start finetuning button + if st.button("Start Finetuning"): + if not hf_token: + st.error("Please enter your Hugging Face token.") + elif dataset_option == "Upload JSON" and uploaded_file is None: + st.error("Please upload a JSON dataset.") + elif dataset_option == "Hugging Face Dataset" and not dataset_name: + st.error("Please enter a Hugging Face dataset name.") + else: + # Prepare the request data + request_data = { + "token": hf_token, + "dataset_source": dataset_option, + } + + if dataset_option == "Upload JSON": + request_data["dataset"] = dataset + else: + request_data["dataset_name"] = dataset_name + + # Send finetuning request to FastAPI server + with st.spinner("Finetuning in progress..."): + response = requests.post(FINETUNE_URL, json=request_data) + + if response.status_code == 200: + st.success("Finetuning completed successfully!") + else: + st.error(f"Finetuning failed. 
Error: {response.status_code} - {response.text}") + +if page == "Chat": + # Display or clear chat messages + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.write(message["content"]) + + # User-provided prompt + if prompt := st.chat_input(): + st.session_state.messages.append({"role": "user", "content": prompt}) + with st.chat_message("user"): + st.write(prompt) + + # Generate a new response if last message is not from assistant + if st.session_state.messages[-1]["role"] != "assistant": + with st.chat_message("assistant"): + with st.spinner("Thinking..."): + response = generate_llama2_response(prompt) + placeholder = st.empty() + full_response = '' + for item in response: + full_response += item + placeholder.markdown(full_response) placeholder.markdown(full_response) - placeholder.markdown(full_response) - message = {"role": "assistant", "content": full_response} - st.session_state.messages.append(message) \ No newline at end of file + message = {"role": "assistant", "content": full_response} + st.session_state.messages.append(message) +elif page == "Finetune": + st.write("Use the sidebar to configure and start finetuning.") \ No newline at end of file From a2d2ac0d5896916808eec81b50bae54099e06663 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 14:15:23 +0000 Subject: [PATCH 03/37] update --- inference/python/streamlit/app.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py index c264930e7d..5ed56148c9 100644 --- a/inference/python/streamlit/app.py +++ b/inference/python/streamlit/app.py @@ -99,7 +99,18 @@ def generate_llama2_response(prompt_input): st.header("🏋️‍♂️ LoRA Finetuning") # Hugging Face token input - hf_token = st.text_input("Enter your Hugging Face token:", type="password") + # hf_token = st.text_input("Enter your Hugging Face token:", type="password") + if 'hf_token' in st.session_state.keys(): + st.success('HF token already provided!', icon='✅') + hf_token = st.session_state.hf_token + print(hf_token) + else: + hf_token = st.text_input('Enter your Hugging Face token:', type='password') + if not (hf_token.startswith('hf_') and len(hf_token)==37): + st.warning('Please enter valid credentials!', icon='⚠️') + else: + st.success('Proceed to finetuning your model!', icon='👉') + st.session_state.hf_token = hf_token # Dataset selection dataset_option = st.radio("Choose dataset source:", ["Upload JSON", "Hugging Face Dataset"]) From f8c90e64ae070cbcb4fee81080f31a00a758284f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 17:18:07 +0000 Subject: [PATCH 04/37] update --- inference/python/streamlit/app.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py index 5ed56148c9..4d8633e167 100644 --- a/inference/python/streamlit/app.py +++ b/inference/python/streamlit/app.py @@ -103,15 +103,17 @@ def generate_llama2_response(prompt_input): if 'hf_token' in st.session_state.keys(): st.success('HF token already provided!', icon='✅') hf_token = st.session_state.hf_token - print(hf_token) else: hf_token = st.text_input('Enter your Hugging Face token:', type='password') if not (hf_token.startswith('hf_') and len(hf_token)==37): - st.warning('Please enter valid credentials!', icon='⚠️') + st.warning('please enter a valid token', icon='⚠️') else: st.success('Proceed to finetuning your model!', icon='👉') 
st.session_state.hf_token = hf_token + # PEFT model name + peft_model_name = st.text_input("Enter the PEFT model name:", help="The name of the PEFT model should start with the username associated with the provided HF token, followed by '/'ß. E.g. 'username/peft-base-uncased'") + # Dataset selection dataset_option = st.radio("Choose dataset source:", ["Upload JSON", "Hugging Face Dataset"]) @@ -123,6 +125,18 @@ def generate_llama2_response(prompt_input): else: dataset_name = st.text_input("Enter Hugging Face dataset name:") + # Finetuning parameters + st.subheader("Finetuning parameters") + lora_rank = st.number_input("LoRA rank", min_value=2, max_value=64, value=16, step=2) + lora_alpha = st.number_input("LoRA alpha", min_value=2, max_value=64, value=16, step=2) + target_modules = st.multiselect("Target modules", ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"], default=["down_proj"]) + learning_rate = st.number_input("Learning rate", min_value=1e-6, max_value=1e-3, value=1e-5, step=1e-6) + optimizer_type = st.selectbox("Optimizer type", ["SGD", "Adam", "AdamW", "Adagrad", "Adadelta", "Adamax", "RMSprop"]) + momentum = st.number_input("Momentum", min_value=0.0, max_value=1.0, value=0.0, step=0.01) + weight_decay = st.number_input("Weight decay", min_value=0.0, max_value=1.0, value=0.0, step=0.01) + nesterov = st.checkbox("Nesterov") + max_steps = st.number_input("Max steps", min_value=1000, max_value=100000, value=10000, step=1000) + # Start finetuning button if st.button("Start Finetuning"): if not hf_token: From 2906e57272ecf8b02e1fac790f9117491b44001b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 26 Sep 2024 13:21:06 +0000 Subject: [PATCH 05/37] update --- inference/python/streamlit/fastapi_incr.py | 203 ++++++++++++++++++ .../inference/huggingface_inference_simple.py | 51 +++++ tests/inference/huggingface_pipeline.py | 33 +++ 3 files changed, 287 insertions(+) create mode 100644 inference/python/streamlit/fastapi_incr.py create mode 100644 tests/inference/huggingface_inference_simple.py create mode 100644 tests/inference/huggingface_pipeline.py diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py new file mode 100644 index 0000000000..0bc20f3b0a --- /dev/null +++ b/inference/python/streamlit/fastapi_incr.py @@ -0,0 +1,203 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Running Instructions: +- To run this FastAPI application, make sure you have FastAPI and Uvicorn installed. +- Save this script as 'fastapi_incr.py'. +- Run the application using the command: `uvicorn fastapi_incr:app --reload --port PORT_NUMBER` +- The server will start on `http://localhost:PORT_NUMBER`. Use this base URL to make API requests. +- Go to `http://localhost:PORT_NUMBER/docs` for API documentation. 
+""" + + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field +import flexflow.serve as ff +import uvicorn +import json, os, argparse +from types import SimpleNamespace +from typing import Optional, List +import time + + +# Initialize FastAPI application +app = FastAPI() + +# Define the request model +class PromptRequest(BaseModel): + prompt: str + +# data models +class Message(BaseModel): + role: str + content: str + + +class ChatCompletionRequest(BaseModel): + model: Optional[str] = "mock-gpt-model" + messages: List[Message] + max_tokens: Optional[int] = 512 + temperature: Optional[float] = 0.1 + stream: Optional[bool] = False + +# Global variable to store the LLM model +llm = None + + +def get_configs(): + + # Fetch configuration file path from environment variable + config_file = os.getenv("CONFIG_FILE", "") + + # Load configs from JSON file (if specified) + if config_file: + if not os.path.isfile(config_file): + raise FileNotFoundError(f"Config file {config_file} not found.") + try: + with open(config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 4, + "memory_per_gpu": 20000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "benchmarking": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "meta-llama/Meta-Llama-3.1-8B", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# Initialize model on startup +@app.on_event("startup") +async def startup_event(): + global llm + + # Initialize your LLM model configuration here + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + ff.init(configs_dict) + + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=2048, + max_tokens_per_batch=64, + ) + llm.start_server() + +# API endpoint to generate response +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + # Call the model to generate a response + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + + # Separate the prompt and response + split_output = full_output.split('\n', 1) + if len(split_output) > 1: + response_text = split_output[1] + else: + response_text = "" + + # Return the prompt and the response in JSON format + 
return { + "prompt": prompt_request.prompt, + "response": response_text + } + +@app.post("/chat/completions") +async def chat_completions(request: ChatCompletionRequest): + + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + if request.messages and request.messages[0].role == 'user': + resp_content = "As a mock AI Assitant, I can only echo your last message:" + request.messages[-1].content + else: + resp_content = "As a mock AI Assitant, I can only echo your last message, but there were no messages!" + + return { + "id": "1337", + "object": "chat.completion", + "created": time.time(), + "model": request.model, + "choices": [{"message": Message(role="assistant", content=resp_content)}], + } + +# Shutdown event to stop the model server +@app.on_event("shutdown") +async def shutdown_event(): + global llm + if llm is not None: + llm.stop_server() + +# Main function to run Uvicorn server +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) + +# Running within the entrypoint folder: +# uvicorn fastapi_incr:app --reload --port + +# Running within the python folder: +# uvicorn entrypoint.fastapi_incr:app --reload --port 3000 diff --git a/tests/inference/huggingface_inference_simple.py b/tests/inference/huggingface_inference_simple.py new file mode 100644 index 0000000000..f1cf8450b7 --- /dev/null +++ b/tests/inference/huggingface_inference_simple.py @@ -0,0 +1,51 @@ +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + GenerationConfig, +) + +model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" +do_sample = False +max_length = 128 +model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto",) +hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +generation_config = GenerationConfig.from_pretrained(model_name) +print(generation_config.do_sample) +generation_config.do_sample = do_sample +generation_config.num_beams=1 +generation_config.temperature = None +generation_config.top_p = None + + +def run_text_completion(): + prompt = "Help me plan a 1-week trip to Dubai" + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + + generated = model.generate( + batch["input_ids"], + max_new_tokens=max_length, + generation_config=generation_config, + ) + out = tokenizer.decode(generated[0]) + print(out) + +def run_chat_completion(): + messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + batch = tokenizer(tokenized_chat, return_tensors="pt") + + generated = model.generate( + batch["input_ids"], + max_new_tokens=max_length, + generation_config=generation_config, + ) + out = tokenizer.decode(generated[0], skip_special_tokens=True, clean_up_tokenization_spaces=True) + prompt_length = len(tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True, clean_up_tokenization_spaces=True)) + all_text = out[prompt_length:] + print(all_text) +run_chat_completion() \ No newline at end of file diff --git a/tests/inference/huggingface_pipeline.py b/tests/inference/huggingface_pipeline.py new file mode 100644 index 0000000000..95388e0a4b --- /dev/null +++ b/tests/inference/huggingface_pipeline.py @@ -0,0 +1,33 @@ +import transformers +from transformers 
import GenerationConfig + +model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" +do_sample = False + +generation_config = GenerationConfig.from_pretrained(model_id) +generation_config.do_sample = do_sample +generation_config.num_beams=1 +# generation_config.max_length = 128 +generation_config.temperature = None +generation_config.top_p = None +print(generation_config) + +pipeline = transformers.pipeline( + "text-generation", + model=model_id, + # model_kwargs={"torch_dtype": torch.bfloat16}, + device_map="auto", +) + +messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + +# messages="Help me plan a 1-week trip to Dubai" +outputs = pipeline( + messages, + max_new_tokens=128, + generation_config=generation_config, +) +print(outputs[0]["generated_text"][-1]['content']) \ No newline at end of file From d62d9beb020113047454b56e306d99625abb413b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 04:41:28 +0000 Subject: [PATCH 06/37] add max new tokens parameter --- include/flexflow/batch_config.h | 4 +- include/flexflow/flexflow_c.h | 3 +- include/flexflow/request_manager.h | 3 +- inference/incr_decoding/incr_decoding.cc | 2 +- inference/peft/peft.cc | 2 +- inference/peft/peft_bwd_benchmark.cc | 6 +- inference/peft/peft_fwd_benchmark.cc | 2 +- inference/peft/req_rate_benchmark.cc | 8 +- inference/python/entrypoint/fastapi_incr.py | 24 ++--- inference/python/streamlit/fastapi_incr.py | 2 +- inference/spec_infer/spec_infer.cc | 2 +- python/flexflow/core/flexflow_cffi.py | 59 +++++------ python/flexflow/serve/serve.py | 11 ++- src/c/flexflow_c.cc | 32 ++++-- src/ops/add_bias_residual_layer_norm.cpp | 2 +- src/ops/add_bias_residual_layer_norm.cu | 2 +- src/ops/inc_multihead_self_attention.cpp | 2 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/kernels/linear_kernels.cpp | 2 +- src/ops/kernels/linear_kernels.cu | 2 +- src/ops/kernels/lora_linear_kernels.cpp | 2 +- src/ops/kernels/lora_linear_kernels.cu | 2 +- src/ops/kernels/residual_rms_norm_kernels.cpp | 2 +- src/ops/kernels/residual_rms_norm_kernels.cu | 2 +- src/ops/kernels/rms_norm_kernels.cpp | 2 +- src/ops/kernels/rms_norm_kernels.cu | 2 +- src/ops/layer_norm.cpp | 2 +- src/ops/layer_norm.cu | 2 +- src/ops/residual_layer_norm.cpp | 2 +- src/ops/residual_layer_norm.cu | 2 +- src/ops/sigmoid_silu_multi.cpp | 2 +- src/ops/sigmoid_silu_multi.cu | 2 +- src/runtime/batch_config.cc | 4 +- src/runtime/beam_search_batch_config.cc | 4 +- src/runtime/request_manager.cc | 97 ++++++++++++------- src/runtime/tree_verify_batch_config.cc | 4 +- 36 files changed, 176 insertions(+), 129 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 873fed0bdb..a509af765c 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -87,7 +87,7 @@ class BatchConfig { first_token_depth_in_request = 0; first_token_offset_in_batch = 0; num_tokens_in_batch = 0; - max_sequence_length = 0; + max_length = 0; request_guid = 0; prompt_phase = false; batch_config_request_id = -1; @@ -98,7 +98,7 @@ class BatchConfig { int first_token_depth_in_request; int first_token_offset_in_batch; int num_tokens_in_batch; - int max_sequence_length; + int max_length; // request id in batch config: int batch_config_request_id = -1; diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 52b4b3d362..5aa2fdd551 100644 --- a/include/flexflow/flexflow_c.h +++ 
b/include/flexflow/flexflow_c.h @@ -627,7 +627,8 @@ void flexflow_model_generate(flexflow_model_t handle_, enum RequestType *request_types, char const **input_texts, char **output_texts, - int *max_seq_lengths, + int *max_lengths, + int *max_new_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f0fab957ee..36a56012fc 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -67,7 +67,8 @@ struct Request { }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; - int max_sequence_length = 128; + int max_length = -1; + int max_new_tokens = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c9ffff5c07..f8e16f24fa 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -271,7 +271,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; requests.push_back(inference_req); total_num_requests++; } diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index c55f2c0bfd..ee5bd1b460 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 86d6d8cbbf..df9a1e35db 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -308,7 +308,7 @@ void FlexFlow::top_level_task(Task const *task, for (int i = 0; i < 100; i++) { Request inference_req; inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; + inference_req.max_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -317,7 +317,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.warmup = true; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -361,7 +361,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = lengths[i]; - fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.max_length = lengths[i]; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1; diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 9ff042c157..9b020f5954 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -333,7 +333,7 @@ void FlexFlow::top_level_task(Task const *task, // sequence_length); Request inference_req; inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.max_length = prompt.second + prompt.first; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index 43008e74fe..cde3b1c02e 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -369,7 +369,7 @@ void FlexFlow::top_level_task(Task const *task, for (int i = 0; i < 100; i++) { Request inference_req; inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; + inference_req.max_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -379,7 +379,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.warmup = true; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -443,7 +443,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1000000000; @@ -473,7 +473,7 @@ void FlexFlow::top_level_task(Task const *task, // sequence_length); Request inference_req; inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.max_length = prompt.second + prompt.first; inference_req.peft_model_id = (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/python/entrypoint/fastapi_incr.py b/inference/python/entrypoint/fastapi_incr.py index f2830e6e5e..34f61739fb 100644 --- a/inference/python/entrypoint/fastapi_incr.py +++ b/inference/python/entrypoint/fastapi_incr.py @@ -60,32 +60,28 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 4, - "memory_per_gpu": 20000, + "num_gpus": 2, + "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 4, - "pipeline_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 8 * 1024, # 8GB + "offload_reserve_space_size": 1024**2, "use_4bit_quantization": False, "use_8bit_quantization": False, - "enable_peft": False, - "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, - "benchmarking": False, "inference_debugging": False, "fusion": True, } llm_configs = { # required parameters - "llm_model": "meta-llama/Meta-Llama-3.1-8B", + "llm_model": "tiiuae/falcon-7b", # optional parameters - "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "cache_path": "", "refresh_cache": False, "full_precision": False, "prompt": "", @@ -106,9 +102,7 @@ async def startup_event(): configs = SimpleNamespace(**configs_dict) ff.init(configs_dict) - ff_data_type = ( - ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF - ) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF llm = ff.LLM( configs.llm_model, data_type=ff_data_type, @@ -123,7 +117,7 @@ async def startup_event(): llm.compile( generation_config, max_requests_per_batch=1, - max_seq_length=2048, + max_seq_length=256, max_tokens_per_batch=64, ) llm.start_server() diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py index 0bc20f3b0a..622f50008e 100644 --- a/inference/python/streamlit/fastapi_incr.py +++ b/inference/python/streamlit/fastapi_incr.py @@ -138,7 +138,7 @@ async def startup_event(): ) llm.compile( generation_config, - max_requests_per_batch=1, + max_requests_per_batch=16, max_seq_length=2048, max_tokens_per_batch=64, ) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 9689080825..134ae70c4a 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -421,7 +421,7 @@ void FlexFlow::top_level_task(Task const *task, // Add inference request Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; requests.push_back(inference_req); total_num_requests++; } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index d065398f87..ec07ee9a5f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -38,9 +38,10 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library -from typing import Union, List +from typing import Union, List, Optional +from dataclasses import dataclass from peft import LoraConfig -import json +import json, math def ffc(): @@ -2049,25 +2050,16 @@ def no_id_handle(): # Request # ----------------------------------------------------------------------- - +@dataclass class Request: """A 
class to record the metadata of an inference or finetuning request.""" - - def __init__( - self, - req_type: RequestType, - prompt: str = None, - max_sequence_length: int = 2048, - peft_model_id: PEFTModelID = None, - dataset_filepath: str = None, - max_training_steps: int = 1, - ): - self.req_type = req_type - self.prompt = prompt - self.max_sequence_length = max_sequence_length - self.peft_model_id = peft_model_id - self.dataset_filepath = dataset_filepath - self.max_training_steps = max_training_steps + req_type: RequestType + prompt: Optional[str] = None + max_length: int = -1 + max_new_tokens: int = 128 + peft_model_id: Optional[PEFTModelID] = None + dataset_filepath: Optional[str] = None + max_training_steps: int = 1 # ----------------------------------------------------------------------- @@ -4665,19 +4657,23 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2048): + def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_new_tokens: int = 128): + if max_length != -1 and max_new_tokens != -1: + warnings.warn(f"Both `max_new_tokens` (={self.max_new_tokens}) and `max_length`(={self.max_length}) seem to have been set. `max_new_tokens` will take precedence.") assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - max_num_chars = 5 * (max_sequence_length + 100) + estimated_max_tokens = math.ceil(max_new_tokens + max([len(prompt.split()) for prompt in prompt_list])*1.5) if max_new_tokens != -1 else max_length + max_num_chars = 5 * (estimated_max_tokens + 100) c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] c_output_length_and_tokens = [ - ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list + ffi.new("int[]", estimated_max_tokens + 100) for prompt in prompt_list ] c_request_types = [ enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list ] - max_sequence_lengths = [max_sequence_length for prompt in prompt_list] + max_lengths = [max_length for prompt in prompt_list] + max_new_tokens_ = [max_new_tokens for prompt in prompt_list] peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list] dataset_filepaths = [ffi.NULL for prompt in prompt_list] training_steps = [0 for prompt in prompt_list] @@ -4689,7 +4685,8 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2 c_request_types, c_input_texts, c_output_texts, - max_sequence_lengths, + max_lengths, + max_new_tokens_, peft_model_ids, dataset_filepaths, training_steps, @@ -4726,9 +4723,16 @@ def generate(self, requests_list: List[Request]): c_request_types = [ enum_to_int(RequestType, request.req_type) for request in requests_list ] - max_sequence_lengths = [ - request.max_sequence_length for request in requests_list + max_lengths = [ + request.max_length for request in requests_list + ] + max_new_tokens_ = [ + request.max_new_tokens for request in requests_list ] + for i in range(len(requests_list)): + if max_lengths[i] != -1 and max_new_tokens_[i] != -1: + warnings.warn(f"Both `max_new_tokens` (={max_new_tokens_[i]}) and `max_length`(={max_lengths[i]}) seem to have been set. 
`max_new_tokens` will take precedence.") + peft_model_ids = [ ( request.peft_model_id @@ -4752,7 +4756,8 @@ def generate(self, requests_list: List[Request]): c_request_types, c_input_texts, c_output_texts, - max_sequence_lengths, + max_lengths, + max_new_tokens_, peft_model_ids, dataset_filepaths, training_steps, diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 988789bab4..32e8e49453 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -498,12 +498,17 @@ def compile( def generate( self, requests_or_prompts: Union[str, List[str], Request, List[Request]], - max_length: int = 2048, + max_length: int = -1, + max_new_tokens: int = 128, ): """Generate tokens based on the input prompt(s) :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests :type requests_or_prompts: Union[str, List[str], Request, List[Request]] + :param max_length: The maximum length in tokens of the prompt + generated sequence, defaults to -1 (no maximum length) + :type max_length: int, optional + :param max_new_tokens: The maximum number of new tokens (excluding the prompt) to generate, defaults to 128 + :type max_new_tokens: int, optional :return: the generation results :rtype: GenerationResult """ @@ -511,7 +516,7 @@ def generate( if len(requests_or_prompts) == 0: return None return self.model.ffmodel.generate_inf_only( - [requests_or_prompts], max_length + [requests_or_prompts], max_length, max_new_tokens ) elif type(requests_or_prompts) == Request: return self.model.ffmodel.generate(requests_or_prompts) @@ -520,7 +525,7 @@ def generate( return [] if type(requests_or_prompts[0]) == str: return self.model.ffmodel.generate_inf_only( - requests_or_prompts, max_length + requests_or_prompts, max_length, max_new_tokens ) else: print(requests_or_prompts) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e39cb29037..e6b246597f 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1622,7 +1622,8 @@ void flexflow_model_generate(flexflow_model_t handle_, enum RequestType *request_types, char const **input_texts, char **output_texts, - int *max_seq_lengths, + int *max_lengths, + int *max_new_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, @@ -1637,21 +1638,24 @@ void flexflow_model_generate(flexflow_model_t handle_, std::string const text_str(input_texts[i]); Request inference_req; inference_req.prompt = text_str; - inference_req.max_sequence_length = max_seq_lengths[i]; + inference_req.max_length = max_lengths[i]; + inference_req.max_new_tokens = max_new_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { inference_req.peft_model_id = *peft_model_id; } requests.push_back(inference_req); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", + DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", i, handle, text_str.c_str(), - max_seq_lengths[i]); + max_lengths[i], + max_new_tokens_[i]); } else if (request_types[i] == RequestType::REQ_FINETUNING) { Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + fine_tuning_req.max_length = max_lengths[i]; + fine_tuning_req.max_new_tokens = max_new_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { fine_tuning_req.peft_model_id = *peft_model_id; @@ -1660,11 +1664,12 @@ void 
flexflow_model_generate(flexflow_model_t handle_, fine_tuning_req.dataset_filepath = dataset_fp; fine_tuning_req.max_training_steps = training_steps[i]; requests.push_back(fine_tuning_req); - DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i", + DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i %i", i, handle, dataset_fp.c_str(), - max_seq_lengths[i], + max_lengths[i], + max_new_tokens[i], training_steps[i]); } else { assert(false && "Unknown request type"); @@ -1678,8 +1683,17 @@ void flexflow_model_generate(flexflow_model_t handle_, // If the prompt exceeds max seq len, check that we return the prompt with // no additional token. Otherwise, check that the output does not exceed // the max sequence length. - assert(results[i].output_tokens.size() <= max_seq_lengths[i] || - results[i].output_tokens.size() == results[i].input_tokens.size()); + int total_tokens = results[i].output_tokens.size(); + int num_output_tokens = total_tokens - results[i].input_tokens.size(); + if (max_new_tokens_[i] >= 0) { + assert(num_output_tokens <= max_new_tokens_[i]); + } + if (max_lengths[i] >= 0) { + assert(total_tokens <= max_lengths[i] || num_output_tokens == 0); + } + // assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + // results[i].output_tokens.size() == + // results[i].input_tokens.size()); output_length_and_tokens[i][0] = results[i].output_tokens.size(); std::copy(results[i].output_tokens.begin(), results[i].output_tokens.end(), diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 681f55c998..cb140e0c75 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -224,7 +224,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index bcca1ba2c6..2d2707f10b 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -222,7 +222,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 826fea4347..92cfdef5a6 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -1526,7 +1526,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { diff --git 
a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b278611b60..39c7397f6b 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1492,7 +1492,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index a36d6719c9..6b371b840e 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -238,7 +238,7 @@ void inference_kernel_wrapper(LinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; if (bc->requestsInfo[i].peft_bwd) { size_t activation_size_needed = diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index d4f930db6c..ffd2c66c9b 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -239,7 +239,7 @@ void inference_kernel_wrapper(LinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; if (bc->requestsInfo[i].peft_bwd) { size_t activation_size_needed = diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp index c3c2cce3cf..eab8899167 100644 --- a/src/ops/kernels/lora_linear_kernels.cpp +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -249,7 +249,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != m->model_state.end()); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 5f130782aa..93e5820f9c 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -248,7 +248,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != m->model_state.end()); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 016364edfd..cbdb8ee153 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -273,7 +273,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = 
bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 0d44f0260a..285a5a5b8f 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -270,7 +270,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 4158628005..551cb72022 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -227,7 +227,7 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index dd6ada864d..8f59d65ea7 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -225,7 +225,7 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 27d314e21e..2fe4a85905 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -256,7 +256,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 0801d11617..b08b23819c 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -255,7 +255,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/residual_layer_norm.cpp 
b/src/ops/residual_layer_norm.cpp index 582e0752ef..7f6b0b370d 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -280,7 +280,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 8cdf87a92c..6caf6b436d 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -278,7 +278,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index ceaa1a7788..50a358beab 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -130,7 +130,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { size_t input_tensor_size = diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 929d557a17..ca0168a59d 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -129,7 +129,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { size_t input_tensor_size = diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 4c339750c7..a4bf960a2c 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -162,8 +162,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " BatchConfig Req ID: " << bc.requestsInfo[i].batch_config_request_id << std::endl; os << " Prompt phase: " << bc.requestsInfo[i].prompt_phase diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index b10f8e82ab..83e4390993 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -141,8 +141,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << 
std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; os << " Beam Search Specific: " << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 31a32dd3c8..44b181fcb3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -54,7 +54,8 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { os << "Request {\n"; os << " guid: " << req.guid << "\n"; os << " peft_model_id: " << req.peft_model_id << "\n"; - os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " max_length: " << req.max_length << "\n"; + os << " max_new_tokens: " << req.max_new_tokens << "\n"; os << " initial_len: " << req.initial_len << "\n"; os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; os << " llm_cache_size: " << req.llm_cache_size << "\n"; @@ -261,24 +262,45 @@ RequestManager::RequestGuid Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = request_.max_sequence_length; + request.max_length = request_.max_length; + request.max_new_tokens = request_.max_new_tokens; + if (request.max_length != -1 && request.max_new_tokens != -1) { + std::cout + << "Both `max_new_tokens` (=" << request.max_new_tokens + << ") and `max_length`(=" << request.max_length + << ") seem to have been set. `max_new_tokens` will take precedence."; + } request.peft_model_id = request_.peft_model_id; request.warmup = request_.warmup; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens < get_max_sequence_length()); + assert(request_.benchmarking_tokens < get_max_sequence_length() && + "Benchmarking tokens exceed max sequence length"); request.benchmarking_tokens = request_.benchmarking_tokens; request.tokens.insert(request.tokens.end(), request_.benchmarking_tokens, 15); // insert random number } else { std::vector tokens = this->tokenizer_->Encode(request_.prompt); + // from here on, we will only use the max_length parameter + if (request.max_new_tokens != -1) { + request.max_length = tokens.size() + request.max_new_tokens; + } + // check that max sequence length is not exceeded + // 1. prompt itself should be less than max sequence length if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; + std::cout << "Error: prompt (" << tokens.size() + << " tokens) exceeds max sequence length of " + << get_max_sequence_length() << ".\n"; + return INVALID_GUID; + } + // 2. 
max_length should not exceed the max_sequence_length + if (request.max_length >= get_max_sequence_length()) { + std::cout << "Error: max_length (" << request.max_length + << ") exceeds max sequence length of " + << get_max_sequence_length() << ".\n"; return INVALID_GUID; } for (int i = 0; i < tokens.size(); i++) { @@ -341,7 +363,18 @@ RequestManager::RequestGuid request.status = Request::PENDING; request.guid = next_available_guid++; request.initial_len = 0; - request.max_sequence_length = request_.max_sequence_length; + request.max_length = request_.max_length; + request.max_new_tokens = request_.max_new_tokens; + if (request.max_length != -1) { + std::cout << "Warning: max_length is set for PEFT finetuning, but it will " + "be ignored." + << std::endl; + } + if (request.max_new_tokens != -1) { + std::cout << "Warning: max_new_tokens is set for PEFT finetuning, but " + "it will be ignored." + << std::endl; + } request.peft_model_id = request_.peft_model_id; request.req_type = RequestType::REQ_FINETUNING; request.completed_training_steps = 0; @@ -352,7 +385,8 @@ RequestManager::RequestGuid // Load dataset if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens <= get_max_sequence_length()); + assert(request_.benchmarking_tokens <= get_max_sequence_length() && + "Benchmarking tokens exceed max sequence length"); request.benchmarking_tokens = request_.benchmarking_tokens; std::vector input_tokens; std::vector output_tokens; @@ -385,9 +419,10 @@ RequestManager::RequestGuid this->tokenizer_->Encode(output_text); if (input_tokens.size() + output_tokens.size() > get_max_sequence_length()) { - std::cout << "Warning: too many tokens in sample, only load up to " - << get_max_sequence_length() << " tokens, but got " - << input_tokens.size() + output_tokens.size() << ".\n"; + std::cout << "Error: sample in training dataset is " + << input_tokens.size() + output_tokens.size() + << " tokens long, exceeding the maximum sequence length of " + << get_max_sequence_length() << " tokens.\n"; return INVALID_GUID; } else { request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); @@ -515,7 +550,7 @@ bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; bool request_completed = false; // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + if (request.tokens.size() >= old_bc.requestsInfo[i].max_length) { request_completed = true; } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id @@ -698,8 +733,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == @@ -765,8 +799,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; + new_bc.requestsInfo[i].max_length = new_request.max_length; 
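The precedence and bounds checks above reduce to a small resolution step; a sketch of that logic, assuming the helper name and the -1 "unset" convention used in this file:

    // Resolve max_new_tokens into an absolute max_length and validate both limits.
    bool resolve_and_check_limits(Request &req, int prompt_tokens, int max_sequence_length) {
      if (req.max_new_tokens != -1) {
        req.max_length = prompt_tokens + req.max_new_tokens;  // max_new_tokens wins
      }
      if (prompt_tokens >= max_sequence_length) {
        return false;  // prompt alone already exceeds the limit
      }
      if (req.max_length >= max_sequence_length) {
        return false;  // requested budget exceeds the limit (-1 means "no cap" and passes)
      }
      return true;  // a false return maps to INVALID_GUID in register_new_request
    }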
new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -932,8 +965,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_active_infr_tokens(); new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[inference_batch_size].max_sequence_length = - request.max_sequence_length; + new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; @@ -1076,10 +1108,10 @@ BeamSearchBatchConfig verified_tokens.size()); // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= - request.max_sequence_length) { + request.max_length) { // Append all verified tokens to the request for (auto const &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { + if (token_pair.second < request.max_length) { request.tokens.push_back(token_pair.first); } } @@ -1171,14 +1203,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = - new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].max_length - new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; @@ -1254,8 +1285,7 @@ BeamSearchBatchConfig request.ssm_cache_size; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; @@ -1307,8 +1337,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; + new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request @@ -1484,8 +1513,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; profiling_requests[request.guid].ssm_decoding_steps += 1; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata @@ -1613,8 +1641,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; 
new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata @@ -1816,8 +1843,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = + old_batches.at(0).requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // copy bitmask to verify batchconfig @@ -1958,8 +1985,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = + old_batches.at(0).requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index a71b1070b2..f8ac6089fe 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -58,8 +58,8 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; } From 85797e091cebad393649525fc5623ab32b03fe11 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 07:50:51 +0000 Subject: [PATCH 07/37] backup --- include/flexflow/model.h | 7 +- include/flexflow/ops/lora_linear.h | 8 +- include/flexflow/ops/lora_linear_params.h | 4 +- python/flexflow/serve/serve.py | 42 +++++++++ src/ops/lora_linear.cc | 105 ++++++++++++++++++++-- src/runtime/inference_manager.cc | 1 + 6 files changed, 154 insertions(+), 13 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4ad735ef7d..5ac91d5b81 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -845,7 +845,8 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== - PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); +// PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + void add_lora_layers(std::vector target_modules, int max_rank, int max_concurrent_adapters); // ======================================== // Inference APIs // ======================================== @@ -1180,8 +1181,8 @@ class FFModel { std::vector parameters; // PEFT related std::unordered_map base_layer_to_peft_layer; - std::unordered_map> peft_layer_to_peft_id; - 
std::unordered_map peft_configs; +// std::unordered_map> peft_layer_to_peft_id; +// std::unordered_map peft_configs; // std::vector peft_operators; FFHandler handlers[MAX_NUM_WORKERS]; diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 9e83c3f90e..8d37be0c64 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -23,7 +23,9 @@ class LoraLinear : public Op { OperatorType type, ParallelTensor const input, ParallelTensor const output, - std::unordered_map const &_peft_configs, + int max_rank, + int max_concurrent_adapters, + // std::unordered_map const &_peft_configs, char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, @@ -91,7 +93,9 @@ class LoraLinear : public Op { // size_t get_params_hash() const override; LoraLinearParams get_params() const; - std::unordered_map peft_configs; + // std::unordered_map peft_configs; + int max_rank; + int max_concurrent_adapters; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 70539271f2..1cdeb65aa2 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -129,7 +129,9 @@ class LoraLinearParams { public: LayerID layer_guid; OperatorType type; - std::unordered_map peft_configs; + // std::unordered_map peft_configs; + int max_rank; + int max_concurrent_adapters; char name[MAX_OPNAME]; bool is_valid(std::pair const diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 32e8e49453..794f1babb3 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -611,3 +611,45 @@ def compile( model_specific_pipeline_parallelism_degree, ssms, ) + +from safetensors import safe_open +from huggingface_hub import hf_hub_download +def download_and_convert_peft_model(peft_model_id: str, data_type: DataType = DataType.DT_HALF, cache_path: str = "", refresh_cache: bool = False): + if data_type != DataType.DT_FLOAT and data_type != DataType.DT_HALF: + raise ValueError("data_type must be either DataType.DT_FLOAT or DataType.DT_HALF") + adapter_path = hf_hub_download(repo_id=peft_model_id, filename="adapter_model.safetensors") + peft_config = PeftConfig.from_pretrained(peft_model_id) + base_model_name_or_path = peft_config.base_model_name_or_path + llm = LLM(base_model_name_or_path, data_type, cache_path, refresh_cache) + + # Save peft config to file + peft_config_dir = os.path.join( + os.path.expanduser(llm.cache_path), "configs", peft_model_id.lower() + ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) + + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + # Save peft weights to file + with safe_open(adapter_path, framework="pt", device="cpu") as f: + for tensor_name in f.keys(): + tensor = f.get_tensor(tensor_name) + if data_type == DataType.DT_HALF: + tensor = tensor.half() + else: + tensor = tensor.float() + tensor_name = tensor_name.replace("base_model.model.model.", "").replace(".default", "") + print(tensor_name) + + tensor_name = llm.model_class.convert_hf_weight_name(tensor_name) + 
tensor.detach().cpu().numpy().tofile(f"{llm.weights_path}/{tensor_name}") + diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index fde6bc2b28..e97087ea68 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -51,6 +51,82 @@ bool check_lora_layer_match(Layer *potential_target, return false; } +void FFmodel::add_lora_layers(std::vector target_modules, int max_rank, int max_concurrent_adapters) { + assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); + assert(target_modules.size() > 0 && "LoRA target module name is empty"); + assrt(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); + assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); + + for (std::string target_module_name : target_modules) { + assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) { + continue; + } + assert(base_layer_to_peft_layer.find(target_module) == base_layer_to_peft_layer.end() && "LoRA layer already added, attempting to add again"); + // Get input and output tensors from target module + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + // Compute OP_LORA layer name, based on target module name + std::string name_ = target_module->name + ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; + } + } + name_.erase(last_underscore); + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + // Create OP_LORA layer given input, output and name + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + // fix LoRA layer's transformer layer ID and model ID (to be the same as target module) + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; + // set up output tensor for OP_LORA layer + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); + } + // pass max_rank and max_concurrent_adapters to OP_LORA layer + peft_layer->add_int_property("max_rank", max_rank); + peft_layer->add_int_property("max_concurrent_adapters", max_concurrent_adapters); + it = layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + } + } +} + +#ifdef DEADCODE PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); @@ -175,11 +251,18 @@ PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { return peft_model_id; } +#endif Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { + long long value; + 
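With the old per-adapter add_lora_layer() path fenced off, LoRA capacity is now reserved up front for a set of target modules. A hypothetical call site using the signature introduced in this patch (later patches in this series move max_rank and max_concurrent_adapters into the RequestManager):

    // Reserve LoRA slots on every layer whose name matches a target module.
    std::vector<std::string> target_modules = {"down_proj"};
    int max_rank = 16;                // upper bound on the rank of any adapter loaded later
    int max_concurrent_adapters = 4;  // adapter slots kept resident per target layer
    model.add_lora_layers(target_modules, max_rank, max_concurrent_adapters);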
layer->get_int_property("max_rank", value); + int max_rank = value; + layer->get_int_property("max_concurrent_adapters", max_concurrent_adapters); + int max_concurrent_adapters = value; +#ifdef DEADCODE std::unordered_map _peft_configs; std::vector const &peft_ids = model.peft_layer_to_peft_id[(Layer *)layer]; @@ -187,12 +270,14 @@ Op *LoraLinear::create_operator_from_layer( _peft_configs.emplace( std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); } +#endif return new LoraLinear(model, layer->layer_guid, layer->op_type, inputs[0], inputs[1], - _peft_configs, + max_rank, + max_concurrent_adapters, layer->name); } @@ -205,7 +290,8 @@ LoraLinear::LoraLinear(FFModel &model, other.op_type, input, output, - other.peft_configs, + other.max_rank, + other.max_concurrent_adapters, other.name) {} LoraLinear::LoraLinear(FFModel &model, @@ -217,7 +303,8 @@ LoraLinear::LoraLinear(FFModel &model, params.type, inputs.first, inputs.second, - params.peft_configs, + params.max_rank, + params.max_concurrent_adapters, params.name) {} LoraLinear::LoraLinear( @@ -226,7 +313,9 @@ LoraLinear::LoraLinear( OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, - std::unordered_map const &_peft_configs, + int _max_rank, + int _max_concurrent_adapters, + // std::unordered_map const &_peft_configs, char const *name) : Op(model, _op_type, @@ -256,9 +345,11 @@ LoraLinear::LoraLinear( outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } - for (auto const &kv : _peft_configs) { - peft_configs.insert(kv); - } + // for (auto const &kv : _peft_configs) { + // peft_configs.insert(kv); + // } + max_rank = _max_rank; + max_concurrent_adapters = _max_concurrent_adapters; // assert(check_output_input_weight_parallel_dims(allocate_weights)); } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 1b65dfd869..20b2a5b963 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -837,4 +837,5 @@ std::string join_path(std::vector const &paths) { return joined; } + }; // namespace FlexFlow From bb08d695c127d5a0639c6399db2f7ba6c9fec315 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 18:01:18 +0000 Subject: [PATCH 08/37] update --- include/flexflow/utils/peft_weight_allocator.h | 15 +++++++++++++++ src/runtime/model.cu | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index dae46a8af1..0d43d4722b 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -21,6 +21,7 @@ namespace FlexFlow { +#ifdef DEACODE class PEFTWeightAllocator { public: PEFTWeightAllocator(void *_base_ptr, size_t _total_size) @@ -86,6 +87,20 @@ class PEFTWeightAllocator { std::unordered_map> sync_weights; std::mutex peft_weight_allocator_mutex; }; +#endif + +class PEFTMemoryManager { +public: + PEFTMemoryManager(int max_rank_, int max_concurrent_adapters_, int lora_in_dim, int lora_out_dim) : max_rank(max_rank_), max_concurrent_adapters(max_concurrent_adapters_), lora_in_dim(lora_in_dim), lora_out_dim(lora_out_dim) {} + + void allocate_memory(); + void register_peft_model(PEFTModelID const &model_id); + + + + int max_rank, max_concurrent_adapters; + int lora_in_dim, lora_out_dim; +} }; // namespace FlexFlow diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 5dab73e1a4..136ce99edd 100644 --- a/src/runtime/model.cu 
+++ b/src/runtime/model.cu @@ -168,7 +168,7 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } - +#ifdef DEADCODE if (info->peft_activation_reserve_space_size > 0) { // allocate memory for peft activation reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -208,6 +208,7 @@ FFHandler } else { handle.peft_weight_allocator = nullptr; } +#endif // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; From 62275c22aa37428a711bdfceacafd1477b4294dd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 2 Oct 2024 04:13:24 +0000 Subject: [PATCH 09/37] backup --- .../ops/kernels/lora_linear_kernels.h | 5 +- .../flexflow/utils/peft_weight_allocator.h | 78 +++++++++++++++++-- src/ops/lora_linear.cc | 5 ++ 3 files changed, 79 insertions(+), 9 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 5360b5f8ea..2fde38728a 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -35,8 +35,9 @@ class LoraLinearMeta : public OpMeta { // PEFT related fields void *low_rank_activation; void *input_activation; - std::unordered_map model_state; - size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; + // std::unordered_map model_state; + // size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; + PEFTMemoryManager *peft_memory_manager; }; namespace Kernels { diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 0d43d4722b..9028656949 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -91,15 +91,79 @@ class PEFTWeightAllocator { class PEFTMemoryManager { public: - PEFTMemoryManager(int max_rank_, int max_concurrent_adapters_, int lora_in_dim, int lora_out_dim) : max_rank(max_rank_), max_concurrent_adapters(max_concurrent_adapters_), lora_in_dim(lora_in_dim), lora_out_dim(lora_out_dim) {} - - void allocate_memory(); - void register_peft_model(PEFTModelID const &model_id); + PEFTMemoryManager(size_t max_lora_size_, int max_concurrent_adapters_) + : max_concurrent_adapters(max_concurrent_adapters_), max_lora_size(max_lora_size_), base_ptr(nullptr) {} + + // allocate memory for all the PEFT adapters for a given layer on a given shard + void allocate_memory(Memory gpu_mem) { + // allocate chunk of memory for all the PEFT adapters + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); + } + // Returns the slot in memory where the peft model weights are/will be stored. + // If the model is not in memory (cache miss), set the cache_miss flag to true. 
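The handle lookup described in the comment above (and defined just below) behaves like a fixed-capacity LRU cache over per-layer adapter slots. A sketch of how an op kernel might consume it, with names assumed from this header and from the LoraLinearMeta change in this patch:

    // Fetch (or assign) the weight slot for this request's adapter.
    bool cache_miss = false;
    void *slot = m->peft_memory_manager->get_peft_model_handle(model_id, &cache_miss);
    if (cache_miss) {
      // The slot may still hold an evicted adapter's weights: (re)load this
      // adapter's A/B matrices into `slot` before launching the kernel.
    }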
+ void *get_peft_model_handle(PEFTModelID const &model_id, bool *cache_miss) { + assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); + assert(lru_hashtable.size() == lru_list.size() && + lru_list.size() == peft2mem_slot.size() && + "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out of sync"); + // check for cache hit + if (lru_hashtable.find(model_id) != lru_hashtable.end()) { + int lru_list_index = lru_hashtable[model_id]; + assert(lru_list[lru_list_index] == model_id && + "PEFT Memory Manager LRU hashtable/list are out of sync"); + // move the model to the end of the LRU list + lru_list.erase(lru_list.begin() + lru_list_index); + lru_list.push_back(model_id); + // update the LRU hashtable + lru_hashtable[model_id] = lru_list.size() - 1; + // get memory slot + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); + *cache_miss = false; + } else { + // cache miss + // check if you need to evict + bool need_to_evict = lru_list.size() == max_concurrent_adapters; + int mem_slot = -1; + if (need_to_evict) { + // evict the least recently used model + PEFTModelID lru_model_id = lru_list[0]; + lru_list.erase(lru_list.begin()); + lru_hashtable.erase(lru_model_id); + mem_slot = peft2mem_slot[lru_model_id]; + peft2mem_slot.erase(lru_model_id); + } else { + mem_slot = lru_list.size(); + } + // update the LRU list and hashtable + lru_list.push_back(model_id); + lru_hashtable[model_id] = lru_list.size() - 1; + // update the memory slot + peft2mem_slot[model_id] = mem_slot; + *cache_miss = true; + } + return static_cast(base_ptr) + peft2mem_slot[model_id]*max_lora_size; + } - - int max_rank, max_concurrent_adapters; - int lora_in_dim, lora_out_dim; + int max_concurrent_adapters; + size_t max_lora_size; + Realm::RegionInstance peftLegionInst; + void *base_ptr; + std::unordered_map lru_hashtable; + std::vector lru_list; // head = least recently used, tail=most recently used + std::unordered_map peft2mem_slot; } }; // namespace FlexFlow diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index e97087ea68..0277c008cc 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -518,6 +518,11 @@ OpMeta *LoraLinear::init_task(Task const *task, } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); + + size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); + m->peft_memory_manager = new PEFTMemoryManager(max_lora_size, lora->max_concurrent_adapters); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); + m->peft_memory_manager->allocate_memory(gpu_mem); for (auto const &kv : lora->peft_configs) { PEFTModelID const &model_id = kv.first; From 88d60ca294f36ef4ba54fc5c2369058f6a7210d4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 2 Oct 2024 07:47:35 +0000 Subject: [PATCH 10/37] lora configs serialize / deserialize into single file --- include/flexflow/batch_config.h | 6 + include/flexflow/ops/lora_linear_params.h | 122 ++++++++++++++---- .../flexflow/utils/peft_weight_allocator.h | 7 +- src/ops/lora_linear.cc | 10 +- src/ops/lora_linear_params.cc | 32 ----- 5 files changed, 114 insertions(+), 63 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index a509af765c..29915bf2d9 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -44,6 +44,11 @@ struct OptimizerTasks { bool save_updated_weights = false; }; +struct 
NewPeftModelPath { + PEFTModelID peft_model_id; + std::string filepath; +}; + void set_optimizer_tasks(OptimizerTasks &tasks, int max_training_steps, int completed_training_steps, @@ -135,6 +140,7 @@ class BatchConfig { PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; PerTokenInfo labelsInfo[MAX_NUM_TOKENS]; + NewPeftModelPath new_peft_model_paths[MAX_NUM_REQUESTS]; bool request_completed[MAX_NUM_REQUESTS]; bool request_running[MAX_NUM_REQUESTS]; diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 1cdeb65aa2..2d8e5360dd 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -17,7 +17,10 @@ namespace FlexFlow { class LoraOptimizerConfig { public: LoraOptimizerConfig(); - virtual ~LoraOptimizerConfig() {} + virtual std::string getType() const = 0; + virtual nlohmann::json toJson() const = 0; + static std::unique_ptr fromJson(const nlohmann::json& j); + virtual ~LoraOptimizerConfig() = default; }; class LoraSGDOptimizerConfig : public LoraOptimizerConfig { @@ -29,9 +32,25 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig { bool weight_decay_ = 0.0f); friend std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc); - - NLOHMANN_DEFINE_TYPE_INTRUSIVE( - LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay) + + std::string getType() const override { return "SGD"; } + + nlohmann::json toJson() const override { + return {{"type", "SGD"}, + {"lr", lr}, + {"momentum", momentum}, + {"nesterov", nesterov}, + {"weight_decay", weight_decay}}; + } + + static std::unique_ptr fromJson(const nlohmann::json& j) { + auto sgd = std::make_unique(); + sgd->lr = j["lr"]; + sgd->momentum = j["momentum"]; + sgd->nesterov = j["nesterov"]; + sgd->weight_decay = j["weight_decay"]; + return sgd; + } public: double lr = 0.001f; @@ -50,9 +69,27 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { double epsilon_ = 1e-8); friend std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc); - - NLOHMANN_DEFINE_TYPE_INTRUSIVE( - LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon) + + std::string getType() const override { return "Adam"; } + + nlohmann::json toJson() const override { + return {{"type", "Adam"}, + {"alpha", alpha}, + {"beta1", beta1}, + {"beta2", beta2}, + {"weight_decay", weight_decay}, + {"epsilon", epsilon}}; + } + + static std::unique_ptr fromJson(const nlohmann::json& j) { + auto adam = std::make_unique(); + adam->alpha = j["alpha"]; + adam->beta1 = j["beta1"]; + adam->beta2 = j["beta2"]; + adam->weight_decay = j["weight_decay"]; + adam->epsilon = j["epsilon"]; + return adam; + } public: // Adam @@ -63,13 +100,13 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { double epsilon = 1e-8; }; -// Serialization helpers -template -void serialize_to_json_file(T const &obj, fs::path const &filepath); +std::unique_ptr LoraOptimizerConfig::fromJson(const nlohmann::json& j) { + std::string type = j["type"]; + if (type == "SGD") return LoraSGDOptimizerConfig::fromJson(j); + if (type == "Adam") return LoraAdamOptimizerConfig::fromJson(j); + throw std::runtime_error("Unknown optimizer type"); +} -// Function to deserialize JSON from file and create object -template -std::unique_ptr deserialize_from_json_file(fs::path const &filepath); class LoraLinearConfig { public: @@ -87,22 +124,54 @@ class LoraLinearConfig { std::vector const &target_modules_ = {}); // constructor used to 
support std::unordered_map LoraLinearConfig(); + template + void setOptimizer(T&& opt) { + optimizer_config = std::make_unique(std::forward(opt)); + } friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); - - NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig, - cache_folder, - peft_model_id, - rank, - lora_alpha, - lora_dropout, - target_modules, - trainable, - init_lora_weights, - base_model_name_or_path, - precision) + void serialize_to_json_file(const std::string& filename) const { + json j = { + {"cache_folder", cache_folder}, + {"peft_model_id", peft_model_id}, + {"rank", rank}, + {"lora_alpha", lora_alpha}, + {"lora_dropout", lora_dropout}, + {"target_modules", target_modules}, + {"trainable", trainable}, + {"init_lora_weights", init_lora_weights}, + {"base_model_name_or_path", base_model_name_or_path}, + {"precision", precision}, + {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + }; + + std::ofstream file(filename); + file << j.dump(4); // Use 4 spaces for indentation + } + // Deserialization method + static LoraLinearConfig deserialize_from_json_file(const std::string& filename) { + std::ifstream file(filename); + json j; + file >> j; + LoraLinearConfig metadata( + j["cache_folder"].get(), + j["peft_model_id"].get>(), + j["rank"].get(), + j["lora_alpha"].get(), + j["lora_dropout"].get(), + j["target_modules"].get>(), + j["trainable"].get(), + j["init_lora_weights"].get(), + j["base_model_name_or_path"].get(), + j["precision"].get() + ); + if (!j["optimizer_config"].is_null()) { + metadata.optimizer_config = LoraOptimizerConfig::fromJson(j["optimizer_config"]); + } + return metadata; + } std::string cache_folder; // Huggingface model ID (for download and/or upload) @@ -116,7 +185,8 @@ class LoraLinearConfig { // whether the weights are trainable (fine-tuning scenario) or not // (inference-only). If set to true, allocate space for the gradients bool trainable = false; - LoraOptimizerConfig *optimizer_config; + // LoraOptimizerConfig *optimizer_config; + std::unique_ptr optimizer_config; // whether to initialize weights randomly (instead of attempting to load them // from file) bool init_lora_weights; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 9028656949..7c1bd01ea5 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -95,7 +95,7 @@ class PEFTMemoryManager { : max_concurrent_adapters(max_concurrent_adapters_), max_lora_size(max_lora_size_), base_ptr(nullptr) {} // allocate memory for all the PEFT adapters for a given layer on a given shard - void allocate_memory(Memory gpu_mem) { + void allocate_inference_memory(Memory gpu_mem) { // allocate chunk of memory for all the PEFT adapters Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), @@ -111,6 +111,9 @@ class PEFTMemoryManager { .wait(); base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); } + void allocate_finetuning_memory(Memory gpu_mem) { + + } // Returns the slot in memory where the peft model weights are/will be stored. // If the model is not in memory (cache miss), set the cache_miss flag to true. 
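These helpers give LoraLinearConfig, including its optional optimizer config, a JSON file round-trip; the optimizer subtype is recovered from the "type" field. A minimal usage sketch with illustrative paths and values:

    // Serialize a trainable config, then rebuild an equivalent one from disk.
    LoraLinearConfig config("/path/to/cache", "my-org/llama-3-8b-lora");  // illustrative IDs
    config.trainable = true;
    config.setOptimizer(LoraSGDOptimizerConfig(0.001, 0.9));
    config.serialize_to_json_file("peft_config.json");
    LoraLinearConfig restored = LoraLinearConfig::deserialize_from_json_file("peft_config.json");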
@@ -160,7 +163,7 @@ class PEFTMemoryManager { int max_concurrent_adapters; size_t max_lora_size; Realm::RegionInstance peftLegionInst; - void *base_ptr; + void *base_ptr; void *finetuning_ptr; std::unordered_map lru_hashtable; std::vector lru_list; // head = least recently used, tail=most recently used std::unordered_map peft2mem_slot; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 0277c008cc..f4c1ba9c35 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -519,12 +519,17 @@ OpMeta *LoraLinear::init_task(Task const *task, std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); + // allocate space for lora weights size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); m->peft_memory_manager = new PEFTMemoryManager(max_lora_size, lora->max_concurrent_adapters); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - m->peft_memory_manager->allocate_memory(gpu_mem); + m->peft_memory_manager->allocate_inference_memory(gpu_mem); - for (auto const &kv : lora->peft_configs) { + return m; +} + +void load_peft_adapters(BatchConfig const *bc){ + for (auto const &kv : bc->peft_configs) { PEFTModelID const &model_id = kv.first; LoraLinearConfig const &lora_config = kv.second; @@ -680,7 +685,6 @@ OpMeta *LoraLinear::init_task(Task const *task, m->model_state[model_id].cache_folder = lora_config.cache_folder; m->model_state[model_id].peft_model_id = lora_config.peft_model_id; } - return m; } void LoraLinear::forward(FFModel const &ff) { diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 6e0c60e057..310b6d0973 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -50,38 +50,6 @@ std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) { return os; } -// Serialization helpers -template -void serialize_to_json_file(T const &obj, fs::path const &filepath) { - json j = obj; - std::ofstream file(filepath); - file << j.dump(4); -} - -template -std::unique_ptr deserialize_from_json_file(fs::path const &filepath) { - std::ifstream file(filepath); - json j; - file >> j; - return std::make_unique(j.get()); -} - -template void - serialize_to_json_file(LoraLinearConfig const &obj, - fs::path const &filepath); -template void serialize_to_json_file( - LoraSGDOptimizerConfig const &obj, fs::path const &filepath); -template void serialize_to_json_file( - LoraAdamOptimizerConfig const &obj, fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file(fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file( - fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file( - fs::path const &filepath); - // ------------------ LoRA configs ------------------- // --------------------------------------------------- const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", ""); From e453237d71518a014bb2966418b38ef662378909 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 4 Oct 2024 23:03:32 +0000 Subject: [PATCH 11/37] backup --- include/flexflow/batch_config.h | 3 +- include/flexflow/ops/lora_linear_params.h | 48 +++++++++++-------- include/flexflow/request_manager.h | 3 ++ src/runtime/request_manager.cc | 56 +++++++++++++++++++++-- 4 files changed, 84 insertions(+), 26 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 29915bf2d9..cb2f8d3a3d 100644 --- 
a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -96,7 +96,6 @@ class BatchConfig { request_guid = 0; prompt_phase = false; batch_config_request_id = -1; - peft_model_id = PEFTModelID::NO_ID; peft_bwd = false; optimizer_tasks = {true, false, false, false}; } @@ -110,7 +109,7 @@ class BatchConfig { bool prompt_phase = false; RequestGuid request_guid; // PEFT fields - PEFTModelID peft_model_id; + std::unordered_map peft_adapters; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 2d8e5360dd..84e76c4cc7 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -132,7 +132,7 @@ class LoraLinearConfig { LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); - void serialize_to_json_file(const std::string& filename) const { + std::string serialize_to_json_string(int indent=-1) const { json j = { {"cache_folder", cache_folder}, {"peft_model_id", peft_model_id}, @@ -147,30 +147,40 @@ class LoraLinearConfig { {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} }; + return j.dump(indent); // No indentation + } + void serialize_to_json_file(const std::string& filename) const { + std::string j = serialize_to_json_string(4); std::ofstream file(filename); - file << j.dump(4); // Use 4 spaces for indentation + file << j; } // Deserialization method - static LoraLinearConfig deserialize_from_json_file(const std::string& filename) { - std::ifstream file(filename); - json j; - file >> j; - LoraLinearConfig metadata( - j["cache_folder"].get(), - j["peft_model_id"].get>(), - j["rank"].get(), - j["lora_alpha"].get(), - j["lora_dropout"].get(), - j["target_modules"].get>(), - j["trainable"].get(), - j["init_lora_weights"].get(), - j["base_model_name_or_path"].get(), - j["precision"].get() + static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) { + json j = json::parse(json_string); + LoraLinearConfig config( + j["cache_folder"].get(), + j["peft_model_id"].get(), + j["trainable"].get(), + nullptr, // optimizer_config will be set later if present + j["init_lora_weights"].get(), + j["base_model_name_or_path"].get(), + j["precision"].get(), + j["rank"].get(), + j["lora_alpha"].get(), + j["lora_dropout"].get(), + j["target_modules"].get>() ); if (!j["optimizer_config"].is_null()) { - metadata.optimizer_config = LoraOptimizerConfig::fromJson(j["optimizer_config"]); + config.setOptimizer(LoraOptimizerConfig::fromJson(j["optimizer_config"])); } - return metadata; + return config; + } + // Deserialization method + static LoraLinearConfig deserialize_from_json_file(const std::string& filename) { + std::ifstream file(filename); + std::string j; + file >> j; + return deserialize_from_json_string(j); } std::string cache_folder; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 36a56012fc..bff0e4d90c 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -149,6 +149,8 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void register_peft_model(FFModel *model, PEFTModelID peft_model_id); + LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void 
appendBitMask(BatchConfig::BitMask &bitmask, @@ -289,6 +291,7 @@ class RequestManager { int max_sequence_length; Status request_manager_status; + std::unordered_map peft_configs; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 44b181fcb3..5e9a724d3f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -255,6 +255,46 @@ size_t RequestManager::get_num_ssms() { return ssm_models.size(); } +void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config) { + // check that peft_model_id is not already in use + assert(peft_configs.find(peft_model_id) == peft_configs.end() && + "PEFT model ID already in use"); + peft_configs[peft_model_id] = peft_config; +} + +LoraLinearConfig const &RequestManager::get_peft_config( + PEFTModelID const &peft_model_id) { + assert(peft_configs.find(peft_model_id) != peft_configs.end() && + "PEFT model ID not found"); + return peft_configs[peft_model_id]; +} + +PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + std::cout << peft_config << std::endl; + assert(false); + } + // go over base_layer_to_peft_layer and check that you can find at least one match + for (int i=0; i 0 && std::string(base_layer.name).find(peft_config.target_modules[0]) != std::string::npos) { + found = true; + break; + } + } + assert(found && "Attempting to add LoRA to a LLM target module that does not exist or does not support LoRA"); + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + RequestManager *rm = RequestManager::get_request_manager(); + rm->register_peft_config(*peft_model_id, peft_config); + return peft_model_id; +} + RequestManager::RequestGuid RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); @@ -730,8 +770,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].peft_model_id = - old_bc.requestsInfo[i].peft_model_id; + // new_bc.requestsInfo[i].peft_model_id = + // old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_adapters = + old_bc.requestsInfo[i].peft_adapters; new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; @@ -800,7 +842,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_length = new_request.max_length; - new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + // new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + if (new_request.peft_model_id != PEFTModelID::NO_ID) { + new_bc.requestsInfo[i].peft_adapters[new_request.peft_model_id] = get_peft_config(new_request.peft_model_id).serialize_to_json_string(); + } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].prompt_phase = true; @@ -967,8 +1012,9 @@ BatchConfig 
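Under this flow an adapter is registered with the runtime after compilation, and its config reaches the workers inside the BatchConfig as a JSON string keyed by PEFTModelID. A hypothetical end-to-end sketch (request and field names taken from this patch):

    // Register an adapter, then attach it to an inference request.
    LoraLinearConfig peft_config("/path/to/cache", "my-org/llama-3-8b-lora");  // illustrative
    PEFTModelID *peft_model_id = model.register_peft_adapter(peft_config);
    Request req;
    req.prompt = "...";
    req.peft_model_id = *peft_model_id;
    // prepare_next_batch() later copies the serialized config into
    // requestsInfo[i].peft_adapters[*peft_model_id] for the workers to deserialize.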
RequestManager::prepare_next_batch(BatchConfig const &old_bc, num_peft_tokens; new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; - new_bc.requestsInfo[inference_batch_size].peft_model_id = - request.peft_model_id; + // new_bc.requestsInfo[inference_batch_size].peft_model_id = + // request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_adapters[request.peft_model_id] = get_peft_config(request.peft_model_id).serialize_to_json_string(); new_bc.requestsInfo[inference_batch_size].peft_bwd = true; set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, From 5c8c4480b8a5b8dd4d41b54709cec703b9ade6fd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 5 Oct 2024 04:38:28 +0000 Subject: [PATCH 12/37] . --- include/flexflow/model.h | 2 +- include/flexflow/ops/lora_linear_params.h | 2 +- include/flexflow/request_manager.h | 9 ++++- src/ops/lora_linear.cc | 41 ++++++++++++++++++----- src/runtime/request_manager.cc | 31 ++++++++++++++++- 5 files changed, 72 insertions(+), 13 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 5ac91d5b81..d1dbe72d7c 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -846,7 +846,7 @@ class FFModel { // PEFT Layers // ======================================== // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); - void add_lora_layers(std::vector target_modules, int max_rank, int max_concurrent_adapters); + void add_lora_layers(std::vector target_modules); // ======================================== // Inference APIs // ======================================== diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 84e76c4cc7..c5a327459f 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -208,7 +208,7 @@ class LoraLinearConfig { class LoraLinearParams { public: LayerID layer_guid; - OperatorType type; + // OperatorType type; // std::unordered_map peft_configs; int max_rank; int max_concurrent_adapters; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index bff0e4d90c..fcb09f15ed 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -151,6 +151,10 @@ class RequestManager { void register_output_filepath(std::string const &); void register_peft_model(FFModel *model, PEFTModelID peft_model_id); LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); + void set_max_lora_rank(int max_lora_rank); + void set_max_concurrent_adapters(int max_concurrent_adapters); + int get_max_lora_rank(); + int get_max_concurrent_adapters(); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void appendBitMask(BatchConfig::BitMask &bitmask, @@ -290,8 +294,11 @@ class RequestManager { int max_spec_tree_token_num; int max_sequence_length; Status request_manager_status; - + + // peft std::unordered_map peft_configs; + int max_lora_rank; + int max_concurrent_adapters; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index f4c1ba9c35..1ba11ed75e 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -51,10 +51,13 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -void 
FFmodel::add_lora_layers(std::vector target_modules, int max_rank, int max_concurrent_adapters) { +void FFmodel::add_lora_layers(std::vector target_modules) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_modules.size() > 0 && "LoRA target module name is empty"); - assrt(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); + RequestManager *rm = RequestManager::get_request_manager(); + int max_lora_rank = rm->get_max_lora_rank(); + int max_concurrent_adapters = rm->get_max_concurrent_adapters(); + assert(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); for (std::string target_module_name : target_modules) { @@ -1197,14 +1200,17 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && - lhs.peft_configs.size() == rhs.peft_configs.size()) { + if (lhs.layer_guid == rhs.layer_guid && lhs.max_rank == rhs.max_rank && + lhs.max_concurrent_adapters == rhs.max_concurrent_adapters && + strcmp(lhs.name, rhs.name) == 0) { +#ifdef DEADCODE for (auto const &kv : lhs.peft_configs) { auto it = rhs.peft_configs.find(kv.first); if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { return false; } } +#endif return true; } return false; @@ -1243,6 +1249,9 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(this->max_rank); + sez.serialize(this->max_concurrent_adapters); +#ifdef DEADCODE sez.serialize(this->op_type); sez.serialize(this->peft_configs.size()); for (auto const &kv : this->peft_configs) { @@ -1285,6 +1294,7 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { } } } +#endif sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1297,8 +1307,9 @@ Node LoraLinear::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; - OperatorType op_type; - size_t num_pefts; + int max_rank, max_concurrent_adapters; + // OperatorType op_type; + // size_t num_pefts; size_t name_len; char name[MAX_OPNAME] = {0}; @@ -1307,6 +1318,9 @@ Node LoraLinear::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + dez.deserialize(max_rank); + dez.deserialize(max_concurrent_adapters); +#ifdef DEADCODE dez.deserialize(op_type); dez.deserialize(num_pefts); for (int i = 0; i < num_pefts; i++) { @@ -1357,12 +1371,15 @@ Node LoraLinear::deserialize(FFModel &ff, params.peft_configs.emplace( std::make_pair(peft_model_id, *lora_linear_config)); } +#endif dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); params.layer_guid = layer_guid; - params.type = op_type; + // params.type = op_type; + params.max_rank = max_rank; + params.max_concurrent_adapters = max_concurrent_adapters; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -1377,11 +1394,13 @@ Op *LoraLinear::materialize(FFModel &ff, LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; - params.type = this->op_type; + params.max_rank = this->max_rank; + 
params.max_concurrent_adapters = this->max_concurrent_adapters; + // params.type = this->op_type; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } - params.peft_configs = this->peft_configs; + // params.peft_configs = this->peft_configs; return params; } @@ -1400,6 +1419,9 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); + hash_combine(key, params.max_rank); + hash_combine(key, params.max_concurrent_adapters); +#ifdef DEADCODE for (auto const &kv : params.peft_configs) { hash_combine(key, kv.first.id); hash_combine(key, kv.second.rank); @@ -1411,6 +1433,7 @@ size_t hash::operator()( hash_combine(key, kv.second.target_modules); hash_combine(key, kv.second.init_lora_weights); } +#endif return key; } }; // namespace std diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5e9a724d3f..79fcdfdcfe 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -270,6 +270,20 @@ LoraLinearConfig const &RequestManager::get_peft_config( return peft_configs[peft_model_id]; } +void RequestManager::set_max_lora_rank(int max_lora_rank_) { + max_lora_rank = max_lora_rank_; +} + +void RequestManager::set_max_concurrent_adapters(int max_concurrent_adapters_) { + max_concurrent_adapters = max_concurrent_adapters_; +} + +int RequestManager::get_max_lora_rank() { return max_lora_rank; } + +int RequestManager::get_max_concurrent_adapters() { + return max_concurrent_adapters; +} + PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); @@ -679,6 +693,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int inference_batch_size = BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + int num_concurrent_adapters = 0; + // Step 2: prepare the next batch for existing inference requests BatchConfig new_bc; for (int i = 0; i < inference_batch_size; i++) { @@ -774,6 +790,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // old_bc.requestsInfo[i].peft_model_id; new_bc.requestsInfo[i].peft_adapters = old_bc.requestsInfo[i].peft_adapters; + num_concurrent_adapters += new_bc.requestsInfo[i].peft_adapters.size(); new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; @@ -825,6 +842,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } new_bc.num_generation_tokens = num_generation_tokens; + assert(num_concurrent_adapters <= get_max_concurrent_adapters() && + "Number of concurrent adapters exceeded the limit"); + // Step 3: add new inference requests to the next batch if there is space for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { @@ -832,6 +852,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); assert(new_request.req_type == RequestType::REQ_INFERENCE); + + // if the request has peft adapters and we are at capacity, don't add it yet + if (new_request.peft_model_id != PEFTModelID::NO_ID && num_concurrent_adapters == get_max_concurrent_adapters()) { + break; + } + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ 
-1000,7 +1026,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); assert(num_peft_label_tokens == 0); - if (num_peft_tokens > 0) { + if (num_peft_tokens > 0 && num_concurrent_adapters < get_max_concurrent_adapters()) { assert(new_bc.request_completed[inference_batch_size]); // request info new_bc.request_completed[inference_batch_size] = false; @@ -1033,8 +1059,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens++; new_bc.num_peft_tokens++; } + num_concurrent_adapters +=1; } } + assert(num_concurrent_adapters <= get_max_concurrent_adapters() && + "Number of concurrent adapters exceeded the limit"); return new_bc; } From 21f8cb97e768d3dab074e8b8070d7322ebdb3a9c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 5 Oct 2024 20:32:27 +0000 Subject: [PATCH 13/37] . --- .../ops/kernels/lora_linear_kernels.h | 13 +- .../flexflow/utils/peft_weight_allocator.h | 120 ++++---- src/ops/kernels/lora_linear_kernels.cu | 101 +++---- src/ops/lora_linear.cc | 57 +--- src/runtime/peft_weight_allocator.cc | 263 ++++++++++++++++++ src/runtime/peft_weight_allocator.cu | 70 +++++ 6 files changed, 447 insertions(+), 177 deletions(-) create mode 100644 src/runtime/peft_weight_allocator.cc create mode 100644 src/runtime/peft_weight_allocator.cu diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 2fde38728a..55ca34ff7d 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -9,16 +9,7 @@ namespace FlexFlow { -struct LoraLinearWeight { - // weights - void *w0_ptr, *w1_ptr; - // gradients - void *w0_grad_ptr, *w1_grad_ptr; - // v values for SGD optimizer (when using momentum) - void *w0_v_values_ptr, *w1_v_values_ptr; - int in_dim, out_dim, rank, num_shards; -}; - +#ifdef DEADCODE struct LoraLinearModelState { LoraLinearWeight weights; LoraOptimizerConfig const *optimizer_config; @@ -27,6 +18,7 @@ struct LoraLinearModelState { // Huggingface model ID (for download and/or upload) std::string peft_model_id; }; +#endif class LoraLinearMeta : public OpMeta { public: @@ -35,6 +27,7 @@ class LoraLinearMeta : public OpMeta { // PEFT related fields void *low_rank_activation; void *input_activation; + std::unordeded_map model_state; // std::unordered_map model_state; // size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; PEFTMemoryManager *peft_memory_manager; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 7c1bd01ea5..5235ac9f38 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -17,7 +17,8 @@ #define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ #include "flexflow/config.h" -#include +#include "lora_linear_params.h" +// #include namespace FlexFlow { @@ -89,84 +90,73 @@ class PEFTWeightAllocator { }; #endif +struct LoraLinearWeight { + // weights + void *w0_ptr, *w1_ptr; + // gradients + void *w0_grad_ptr, *w1_grad_ptr; + // v values for SGD optimizer (when using momentum) + void *w0_v_values_ptr, *w1_v_values_ptr; + // int in_dim, out_dim, rank, num_shards; + LoraLinearWeight(void *w0=nullptr, void *w1=nullptr, void *w0_grad=nullptr, void *w1_grad=nullptr, + void *w0_v_values=nullptr, void *w1_v_values=nullptr) + : w0_ptr(w0), w1_ptr(w1), + w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), + 
w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values) {} +}; + class PEFTMemoryManager { public: - PEFTMemoryManager(size_t max_lora_size_, int max_concurrent_adapters_) - : max_concurrent_adapters(max_concurrent_adapters_), max_lora_size(max_lora_size_), base_ptr(nullptr) {} + PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) + : gpu_mem(gpu_mem_), + max_concurrent_adapters(max_concurrent_adapters_), + max_lora_size(max_lora_size_), + in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_), + lora_layername_substr(lora_layername_substr_), dt(dt_), + base_ptr(nullptr), + finetuning_ptr(nullptr), + finetuning_model_id(PEFTModelID::NO_ID) { + + assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0"); + assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0"); + allocate_inference_memory(); + // finetuning memory is allocated upon the first finetuning request, so we can skip for inference-only workloads + } // allocate memory for all the PEFT adapters for a given layer on a given shard - void allocate_inference_memory(Memory gpu_mem) { - // allocate chunk of memory for all the PEFT adapters - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(max_lora_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(peftLegionInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); - } - void allocate_finetuning_memory(Memory gpu_mem) { + void allocate_inference_memory(); + // allocate memory for the PEFT adapter for a finetuning request for a given layer and shard + void allocate_finetuning_memory(); - } + // Check if the PEFT adapter for the given model is in memory. If not, sets the cache_miss flag to true. If this is the first finetuning request, allocate memory for the finetuning adapter. + void get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss); // Returns the slot in memory where the peft model weights are/will be stored. // If the model is not in memory (cache miss), set the cache_miss flag to true. 
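+ // Usage sketch (illustrative only, assuming the declarations above): the
+ // inference pool is one contiguous allocation of max_concurrent_adapters
+ // fixed-size slots,
+ //   base_ptr -> |-- slot 0 --|-- slot 1 --| ... |-- slot N-1 --|
+ // each holding max_lora_size bytes (LoRA_A followed by LoRA_B for one
+ // adapter). A caller resolves a slot before launching the LoRA kernels:
+ //   bool cache_miss = false;
+ //   int slot = m->peft_memory_manager->get_inference_peft_slot(model_id, &cache_miss);
+ //   void *w0 = static_cast<char *>(base_ptr) + slot * max_lora_size;
+ //   if (cache_miss) { /* adapter weights are loaded into this slot,
+ //                        evicting the least-recently-used adapter if the pool is full */ }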
- void *get_peft_model_handle(PEFTModelID const &model_id, bool *cache_miss) { - assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); - assert(lru_hashtable.size() == lru_list.size() && - lru_list.size() == peft2mem_slot.size() && - "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out of sync"); - // check for cache hit - if (lru_hashtable.find(model_id) != lru_hashtable.end()) { - int lru_list_index = lru_hashtable[model_id]; - assert(lru_list[lru_list_index] == model_id && - "PEFT Memory Manager LRU hashtable/list are out of sync"); - // move the model to the end of the LRU list - lru_list.erase(lru_list.begin() + lru_list_index); - lru_list.push_back(model_id); - // update the LRU hashtable - lru_hashtable[model_id] = lru_list.size() - 1; - // get memory slot - assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); - *cache_miss = false; - } else { - // cache miss - // check if you need to evict - bool need_to_evict = lru_list.size() == max_concurrent_adapters; - int mem_slot = -1; - if (need_to_evict) { - // evict the least recently used model - PEFTModelID lru_model_id = lru_list[0]; - lru_list.erase(lru_list.begin()); - lru_hashtable.erase(lru_model_id); - mem_slot = peft2mem_slot[lru_model_id]; - peft2mem_slot.erase(lru_model_id); - } else { - mem_slot = lru_list.size(); - } - // update the LRU list and hashtable - lru_list.push_back(model_id); - lru_hashtable[model_id] = lru_list.size() - 1; - // update the memory slot - peft2mem_slot[model_id] = mem_slot; - *cache_miss = true; - } - return static_cast(base_ptr) + peft2mem_slot[model_id]*max_lora_size; - } + int get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss); + + void load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config); + + LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + + // Legion memory management apparatus + Memory gpu_mem; + Realm::RegionInstance peftLegionInst; + void *base_ptr, *finetuning_ptr; + // Size and shapes int max_concurrent_adapters; size_t max_lora_size; - Realm::RegionInstance peftLegionInst; - void *base_ptr; void *finetuning_ptr; + int in_dim, out_dim, num_shards, shard_id; + // LRU cache apparatus std::unordered_map lru_hashtable; std::vector lru_list; // head = least recently used, tail=most recently used std::unordered_map peft2mem_slot; + // Miscellanea + std::string lora_layername_substr; + DataType dt; + PEFTModelID finetuning_model_id; } }; // namespace FlexFlow diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 93e5820f9c..0bb5cb64fc 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -147,56 +147,8 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, namespace Internal { -template -void init_kernel(LoraLinearMeta *m, int seed, cudaStream_t stream) { - // Initialize generator - std::mt19937 gen(seed); - - // Get handle to weights by iterating over m->model_state to get each - // LoraLinearWeight object - for (auto &model_state : m->model_state) { - LoraLinearWeight weight = model_state.second.weights; - int w0_num_elements = weight.rank * weight.in_dim; - int w1_num_elements = weight.rank * weight.out_dim; - - // LoRA_A weight: [in_dim, rank] - float stdv_lora_a = 1.0f / sqrt(weight.in_dim); - 
std::uniform_real_distribution<float> dis_lora_a(-stdv_lora_a, stdv_lora_a); - std::vector<DT>
lora_a_random_init(w0_num_elements); - for (auto &num : lora_a_random_init) { - float num_float = dis_lora_a(gen); - if (std::is_same<DT, half>::value) { - num = __float2half(num_float); - } else { - num = num_float; - } - } - checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w0_ptr), - lora_a_random_init.data(), - w0_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); - - // LoRA_B weight: [rank, out_dim] - float stdv_lora_b = 1.0f / sqrt(weight.rank); - std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); - std::vector<DT> lora_b_random_init(w1_num_elements); - for (auto &num : lora_b_random_init) { - float num_float = dis_lora_b(gen); - if (std::is_same<DT, half>::value) { - num = __float2half(num_float); - } else { - num = num_float; - } - } - checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), - lora_b_random_init.data(), - w1_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); - } -} +#ifdef DEADCODE template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -335,6 +287,57 @@ void inference_kernel(LoraLinearMeta *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } +#endif + +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + int num_shards, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + int num_peft_requests = 0; + for (int i=0; i< bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + LoraLinearConfig deserialized_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + if (!lora_applies_to_this_layer(m, deserialized_config)) { + continue; + } + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); + bool cache_miss; + void *peft_slot; + if (!lora_config.trainable) { + peft_slot = m->peft_memory_manager->get_peft_model_handle(bc->requestsInfo[i].peft_model_id, &cache_miss); + } else { + peft_slot = m->peft_memory_manager->get_finetuning_handle(bc->requestsInfo[i].peft_model_id, &cache_miss); + } + if (cache_miss) { + // load model into memory + load_peft_model(m, peft_slot, deserialized_config, in_dim, out_dim, num_shards); + } + } +} template __global__ void sgd_update(size_t count, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 1ba11ed75e..a18f47c4ac 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -407,56 +407,6 @@ void LoraLinear::init_inference( set_opmeta_from_futuremap_inference(ff, fm, output_tensor); } -template -void load_peft_from_file(DT *ptr, - size_t num_rows, - size_t num_columns, - int num_shards, - int shard_id, - std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - printf("Could not open file: %s\n", filepath.c_str()); - } - assert(in.good() && "incorrect weight file path"); - - // HuggingFace dims (serialized in row-major order) - // lora_A: [rank, intermediate_dim] - // lora_B: [hidden_dim, rank] - // FlexFlow dims (serialized in column-major order) - // lora_A: [intermediate_dim, rank] - // lora_B: [rank, out_dim] - // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B - assert(num_rows % num_shards == 0); - size_t chunk_size = num_rows / num_shards; - size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0; - - // Allocate memory for the weight shard - std::vector
host_array(chunk_size * num_columns); - // Read the chunk - size_t total_size_read = 0; - for (int i = 0; i < num_columns; ++i) { - in.seekg((i * num_rows + offset) * sizeof(DT)); - in.read(reinterpret_cast(host_array.data() + i * chunk_size), - chunk_size * sizeof(DT)); - total_size_read += in.gcount(); - } - // Check weight shard size - size_t expected_data_size = chunk_size * num_columns * sizeof(DT); - if (total_size_read != expected_data_size) { - printf("load weight data error: expected %lu bytes, got: %lu bytes, data " - "size: %lu\n", - expected_data_size, - total_size_read, - sizeof(DT)); - assert(false); - } - assert(host_array.size() == chunk_size * num_columns); - // Copy weight to device memory - copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); - in.close(); -} - /* regions[0](O): output regions[1](I): kernel @@ -524,13 +474,13 @@ OpMeta *LoraLinear::init_task(Task const *task, // allocate space for lora weights size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); - m->peft_memory_manager = new PEFTMemoryManager(max_lora_size, lora->max_concurrent_adapters); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - m->peft_memory_manager->allocate_inference_memory(gpu_mem); - + m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, max_lora_size, lora->max_concurrent_adapters, in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); + m->peft_memory_manager->allocate_inference_memory(); return m; } +#ifdef DEADCODE void load_peft_adapters(BatchConfig const *bc){ for (auto const &kv : bc->peft_configs) { PEFTModelID const &model_id = kv.first; @@ -689,6 +639,7 @@ void load_peft_adapters(BatchConfig const *bc){ m->model_state[model_id].peft_model_id = lora_config.peft_model_id; } } +#endif void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc new file mode 100644 index 0000000000..ab0e1ccd21 --- /dev/null +++ b/src/runtime/peft_weight_allocator.cc @@ -0,0 +1,263 @@ +#include "peft_weight_allocator.h" + +namespace FlexFlow { + +void PEFTMemoryManager::allocate_inference_memory() { + // allocate chunk of memory for all the PEFT adapters + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); +} + +void PEFTMemoryManager::allocate_finetuning_memory() { + size_t ft_size = max_lora_size*3; // weights, gradients, momentum values + // allocate chunk of memory for PEFT adapter + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(ft_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + finetuning_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); +} + +void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss) { + if (finetuning_ptr == nullptr) { + allocate_finetuning_memory(); + } + assert(finetuning_ptr != nullptr && "PEFT Memory Manager finetuning_ptr is null"); + *cache_miss = (model_id.id 
!= finetuning_model_id.id); +} + +int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss) { + assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); + assert(lru_hashtable.size() == lru_list.size() && + lru_list.size() == peft2mem_slot.size() && + "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out of sync"); + // check for cache hit + if (lru_hashtable.find(model_id) != lru_hashtable.end()) { + int lru_list_index = lru_hashtable[model_id]; + assert(lru_list[lru_list_index] == model_id && + "PEFT Memory Manager LRU hashtable/list are out of sync"); + // move the model to the end of the LRU list + lru_list.erase(lru_list.begin() + lru_list_index); + lru_list.push_back(model_id); + // update the LRU hashtable + lru_hashtable[model_id] = lru_list.size() - 1; + // get memory slot + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); + *cache_miss = false; + } else { + // cache miss + // check if you need to evict + bool need_to_evict = lru_list.size() == max_concurrent_adapters; + int mem_slot = -1; + if (need_to_evict) { + // evict the least recently used model + PEFTModelID lru_model_id = lru_list[0]; + lru_list.erase(lru_list.begin()); + lru_hashtable.erase(lru_model_id); + mem_slot = peft2mem_slot[lru_model_id]; + peft2mem_slot.erase(lru_model_id); + } else { + mem_slot = lru_list.size(); + } + // update the LRU list and hashtable + lru_list.push_back(model_id); + lru_hashtable[model_id] = lru_list.size() - 1; + // update the memory slot + peft2mem_slot[model_id] = mem_slot; + *cache_miss = true; + } + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); + int slot = peft2mem_slot[model_id]; + assert(slot >= 0 && slot < max_concurrent_adapters && "PEFT Memory Manager peft2mem_slot is out of bounds"); + return slot; +} + +template +void load_peft_from_file(DT *ptr, + size_t num_rows, + size_t num_columns, + int num_shards, + int shard_id, + std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + + // HuggingFace dims (serialized in row-major order) + // lora_A: [rank, intermediate_dim] + // lora_B: [hidden_dim, rank] + // FlexFlow dims (serialized in column-major order) + // lora_A: [intermediate_dim, rank] + // lora_B: [rank, out_dim] + // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B + assert(num_rows % num_shards == 0); + size_t chunk_size = num_rows / num_shards; + size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0; + + // Allocate memory for the weight shard + std::vector
host_array(chunk_size * num_columns); + // Read the chunk + size_t total_size_read = 0; + for (int i = 0; i < num_columns; ++i) { + in.seekg((i * num_rows + offset) * sizeof(DT)); + in.read(reinterpret_cast(host_array.data() + i * chunk_size), + chunk_size * sizeof(DT)); + total_size_read += in.gcount(); + } + // Check weight shard size + size_t expected_data_size = chunk_size * num_columns * sizeof(DT); + if (total_size_read != expected_data_size) { + printf("load weight data error: expected %lu bytes, got: %lu bytes, data " + "size: %lu\n", + expected_data_size, + total_size_read, + sizeof(DT)); + assert(false); + } + assert(host_array.size() == chunk_size * num_columns); + // Copy weight to device memory + copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); + in.close(); +} + +void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config) { + // Load weights + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr "PEFT Memory Manager weight ptr null"); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. + int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = lora_config.rank; + int lora_B_num_rows = lora_config.rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + if (lora_config.init_lora_weights) { + // initialize weights randomly + int seed = 0; + init_peft_weight_wrapper(weight, in_dim, out_dim, lora_config.rank, dt, seed); + } else { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? 
"full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else { + assert(false && "Data type not supported"); + } + } +} + +LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + int mem_slot = get_inference_peft_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; + result.w1_ptr = result.w0_ptr + w0_num_elements * data_size; + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; +} + +LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss = get_finetuning_slot(model_id); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = finetuning_ptr; + result.w1_ptr = result.w0_ptr + w0_num_elements*data_size; + result.w0_grad_ptr = result.w1_ptr + w1_num_elements*data_size; + result.w1_grad_ptr = result.w0_grad_ptr + w0_num_elements*data_size; + result.w0_v_values_ptr = result.w1_grad_ptr + w1_num_elements*data_size; + result.w1_v_values_ptr = result.w0_v_values_ptr + w0_num_elements*data_size; + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu new file mode 
100644 index 0000000000..cc8d095069 --- /dev/null +++ b/src/runtime/peft_weight_allocator.cu @@ -0,0 +1,70 @@ + + +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/utils/peft_weight_allocator.h" +#include "flexflow/utils/cuda_helper.h" +#include <random> +#include <vector> +namespace FlexFlow { + +template <typename DT> +void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Randomly initialize the LoRA_A and LoRA_B weights for a single adapter + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(in_dim); + std::uniform_real_distribution<float> dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector<DT>
lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(rank); + std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector<DT> lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); +} + +void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (dt == DT_FLOAT) { + Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + } else if (dt == DT_HALF) { + Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +} // namespace FlexFlow \ No newline at end of file From c5e813bea5e15934a4fbb77e4f0561d87dc3dd8a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 5 Oct 2024 21:13:18 +0000 Subject: [PATCH 14/37] . --- .../ops/kernels/lora_linear_kernels.h | 6 +- .../flexflow/utils/peft_weight_allocator.h | 20 +++-- src/ops/kernels/lora_linear_kernels.cu | 80 ++++++++++++++++--- src/runtime/peft_weight_allocator.cc | 11 +++ 4 files changed, 95 insertions(+), 22 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 55ca34ff7d..eef3b392b3 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -25,9 +25,9 @@ class LoraLinearMeta : public OpMeta { LoraLinearMeta(FFHandler handle, LoraLinear const *li); ~LoraLinearMeta(void); // PEFT related fields - void *low_rank_activation; - void *input_activation; - std::unordeded_map model_state; + // void *low_rank_activation; + // void *input_activation; + // std::unordeded_map model_state; // std::unordered_map model_state; // size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; PEFTMemoryManager *peft_memory_manager; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 5235ac9f38..19b987a728 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -95,23 +95,27 @@ struct LoraLinearWeight { void *w0_ptr, *w1_ptr; // gradients void *w0_grad_ptr, *w1_grad_ptr; + // activations + void *input_activation; + void *low_rank_activation; // v values for SGD optimizer (when using momentum) void *w0_v_values_ptr, *w1_v_values_ptr; - // int in_dim, out_dim, rank, num_shards; LoraLinearWeight(void *w0=nullptr, void *w1=nullptr, void *w0_grad=nullptr, void *w1_grad=nullptr, - void *w0_v_values=nullptr, void *w1_v_values=nullptr) + void *w0_v_values=nullptr, void *w1_v_values=nullptr, void *low_rank_activation_=nullptr, void *input_activation_=nullptr) : w0_ptr(w0), w1_ptr(w1), w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), - w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values) {} + w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values), + low_rank_activation(low_rank_activation_), input_activation(input_activation_) {} }; class PEFTMemoryManager { public: - PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) + PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), max_lora_size(max_lora_size_), in_dim(in_dim_), out_dim(out_dim_), 
num_shards(num_shards_), shard_id(shard_id_), + max_peft_tokens(max_peft_tokens_), lora_layername_substr(lora_layername_substr_), dt(dt_), base_ptr(nullptr), finetuning_ptr(nullptr), @@ -128,17 +132,16 @@ class PEFTMemoryManager { // allocate memory for the PEFT adapter for a finetuning request for a given layer and shard void allocate_finetuning_memory(); + LoraLinearWeight get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + +private: // Check if the PEFT adapter for the given model is in memory. If not, sets the cache_miss flag to true. If this is the first finetuning request, allocate memory for the finetuning adapter. void get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss); - // Returns the slot in memory where the peft model weights are/will be stored. // If the model is not in memory (cache miss), set the cache_miss flag to true. int get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss); - void load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config); - LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); - LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); // Legion memory management apparatus @@ -149,6 +152,7 @@ class PEFTMemoryManager { int max_concurrent_adapters; size_t max_lora_size; int in_dim, out_dim, num_shards, shard_id; + int max_peft_tokens; // LRU cache apparatus std::unordered_map lru_hashtable; std::vector lru_list; // head = least recently used, tail=most recently used diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 0bb5cb64fc..eab8b30227 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -311,6 +311,12 @@ void inference_kernel(LoraLinearMeta *m, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); + cudaDataType_t lr_actv_type = output_type; + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t compute_type = output_type; int num_peft_requests = 0; for (int i=0; i< bc->max_requests_per_batch(); i++) { @@ -320,22 +326,74 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig deserialized_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); - if (!lora_applies_to_this_layer(m, deserialized_config)) { + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); - bool cache_miss; - void *peft_slot; - if (!lora_config.trainable) { - peft_slot = m->peft_memory_manager->get_peft_model_handle(bc->requestsInfo[i].peft_model_id, &cache_miss); + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int max_peft_tokens = bc->requestsInfo[i].max_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); + void 
*intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) ? weight.low_rank_activation : m->handle.workSpace; + if (bc->requestsInfo[i].peft_bwd) { + checkCUDA(cudaMemcpyAsync(weight.input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); } else { - peft_slot = m->peft_memory_manager->get_finetuning_handle(bc->requestsInfo[i].peft_model_id, &cache_miss); - } - if (cache_miss) { - // load model into memory - load_peft_model(m, peft_slot, deserialized_config, in_dim, out_dim, num_shards); + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * lora_config.rank); } + DT alpha = 1.0f, beta = 0.0f; + // buffer = weight_first * input + // [rank, num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + lora_config.rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + lora_config.rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + lora_config.rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + lora_config.rank, + intermediate_result_ptr, + lr_actv_type, + lora_config.rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index ab0e1ccd21..83fa66aa15 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -21,6 +21,7 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size = max_lora_size*3; // weights, gradients, momentum values + ft_size += max_peft_tokens*(in_dim+rank); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), @@ -254,10 +255,20 @@ LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model result.w1_grad_ptr = result.w0_grad_ptr + w0_num_elements*data_size; result.w0_v_values_ptr = result.w1_grad_ptr + w1_num_elements*data_size; result.w1_v_values_ptr = result.w0_v_values_ptr + w0_num_elements*data_size; + result.input_activation = result.w1_v_values_ptr + w1_num_elements*data_size; // max_peft_tokens*in_dim + result.low_rank_activation = result.input_activation + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank if (cache_miss) { load_peft_model(result, lora_config); } return result; } +LoraLinearWeight PEFTMemoryManager::get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + if (lora_config.trainable) { + return get_finetuning_peft(model_id, lora_config); + } else { + return get_inference_peft(model_id, lora_config); + } +} + }; // namespace FlexFlow \ No newline at end of file From aa57f9807401adf05b03713918bf2be3a4cb4396 Mon Sep 17 00:00:00 2001 From: 
Gabriele Oliaro Date: Sat, 5 Oct 2024 21:30:36 +0000 Subject: [PATCH 15/37] . --- .../flexflow/utils/peft_weight_allocator.h | 1 + src/ops/kernels/lora_linear_kernels.cu | 82 ++++++++----------- src/runtime/peft_weight_allocator.cc | 4 + 3 files changed, 38 insertions(+), 49 deletions(-) diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 19b987a728..3c9efc0812 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -133,6 +133,7 @@ class PEFTMemoryManager { void allocate_finetuning_memory(); LoraLinearWeight get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + void check_ft_model_id(PEFTModelID const &model_id); private: // Check if the PEFT adapter for the given model is in memory. If not, sets the cache_miss flag to true. If this is the first finetuning request, allocate memory for the finetuning adapter. diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index eab8b30227..d5baf49cdc 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -395,6 +395,7 @@ void inference_kernel(LoraLinearMeta *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } + assert(num_peft_requests <= 1); } template @@ -437,39 +438,24 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; cudaDataType_t compute_type = output_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = output_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // Skip completed, non-PEFT and PEFT forward-only requests + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) { continue; } - // Skip PEFT forward-only requests - if (!bc->requestsInfo[i].peft_bwd) { + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); + m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); - LoraLinearWeight weight = - m->model_state[bc->requestsInfo[i].peft_model_id].weights; - int rank = weight.rank; - float lora_alpha = - m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; - DT scaling_constant = (DT)(lora_alpha / rank); + // int max_peft_tokens = bc->requestsInfo[i].max_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + 
LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); // Compute LORA_B weight's gradient if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { @@ -480,20 +466,20 @@ void peft_bwd_kernel(LoraLinearMeta *m, checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, - rank, + lora_config.rank, out_dim, num_peft_tokens, &scaling_constant, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, output_grad_ptr, output_type, out_dim, &beta, weight.w1_grad_ptr, weight_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } @@ -505,20 +491,20 @@ void peft_bwd_kernel(LoraLinearMeta *m, checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_N, - rank, + lora_config.rank, num_peft_tokens, out_dim, &scaling_constant, weight.w1_ptr, weight_type, - rank, + lora_config.rank, output_grad_ptr, output_type, out_dim, &beta, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } @@ -533,15 +519,15 @@ void peft_bwd_kernel(LoraLinearMeta *m, CUBLAS_OP_N, CUBLAS_OP_T, in_dim, - rank, + lora_config.rank, num_peft_tokens, &alpha, - m->input_activation, + weight.input_activation, input_type, in_dim, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, &beta, weight.w0_grad_ptr, weight_type, @@ -559,14 +545,14 @@ void peft_bwd_kernel(LoraLinearMeta *m, CUBLAS_OP_N, in_dim, num_peft_tokens, - rank, + lora_config.rank, &alpha, weight.w0_ptr, weight_type, in_dim, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, &beta, input_grad_ptr, input_type, @@ -576,15 +562,13 @@ void peft_bwd_kernel(LoraLinearMeta *m, } if (bc->requestsInfo[i].optimizer_tasks.update_weights) { - LoraOptimizerConfig const *optimizer_config = - m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + LoraOptimizerConfig const *optimizer_config = lora_config.optimizer_config; assert(optimizer_config != nullptr); - assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; // Get optimizer config - if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + if (optimizer_config->getType() == "SGD") { LoraSGDOptimizerConfig const *sgd_config = (LoraSGDOptimizerConfig const *)optimizer_config; // LoRA_A weight is split in tensor parallelism, so no need to apply @@ -625,7 +609,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, static_cast
(weight.w1_grad_ptr), static_cast
(weight.w1_v_values_ptr), static_cast
(weight.w1_ptr)); - } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + } else if (optimizer_config->getType() == "Adam") { assert(false && "Adam optimizer type not implemented yet"); } else { assert(false && "Unsupported optimizer type"); diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 83fa66aa15..cc40d666ed 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -271,4 +271,8 @@ LoraLinearWeight PEFTMemoryManager::get_peft(PEFTModelID const &model_id, LoraLi } } +void PEFTMemoryManager::check_ft_model_id(PEFTModelID const &model_id) { + assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); +} + }; // namespace FlexFlow \ No newline at end of file From 53c408c3111e43bd0bbe084c4310df5a5ed1c1b1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 5 Oct 2024 22:33:49 +0000 Subject: [PATCH 16/37] frontend --- include/flexflow/flexflow_c.h | 8 ++++++-- include/flexflow/request_manager.h | 1 - inference/models/falcon.cc | 7 +++++++ inference/models/llama.cc | 10 +++++++--- inference/models/mpt.cc | 8 ++++++++ inference/models/opt.cc | 7 +++++++ inference/models/starcoder.cc | 7 +++++++ python/flexflow/core/flexflow_cffi.py | 12 ++++++++++-- python/flexflow/serve/models/falcon.py | 4 ++++ python/flexflow/serve/models/llama.py | 4 ++++ python/flexflow/serve/models/mpt.py | 4 ++++ python/flexflow/serve/models/opt.py | 4 ++++ python/flexflow/serve/models/starcoder.py | 4 ++++ python/flexflow/serve/serve.py | 13 +++++++------ src/c/flexflow_c.cc | 23 ++++++++++++++++++++--- 15 files changed, 99 insertions(+), 17 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 5aa2fdd551..19b2bc7c83 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -91,6 +91,8 @@ int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_); int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_); +bool flexflow_config_get_enable_peft(flexflow_config_t handle_); + void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_, int value); @@ -598,8 +600,10 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); -flexflow_peft_model_id_t flexflow_model_add_lora_layer( - flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); +void flexflow_model_add_lora_layers(flexflow_model_t handle_, int num_target_modules, char const **target_modules_); + + +flexflow_peft_model_id_t flexflow_model_register_peft_adapter(flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index fcb09f15ed..542deb336d 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -149,7 +149,6 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); - void register_peft_model(FFModel *model, PEFTModelID peft_model_id); LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 195d6ba7e3..945c55f296 100644 --- a/inference/models/falcon.cc +++ 
b/inference/models/falcon.cc @@ -242,6 +242,13 @@ void FALCON::create_falcon_model(FFModel &ff, output = ff.argmax(lm_head, /*beam_Search*/ false); } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"dense_h_to_4h", "dense_4h_to_h"}; + ff.add_lora_layers(); + } + FileDataLoader *fileloader = new FileDataLoader("", weight_file_path, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index cf26194597..6a70620942 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -226,9 +226,6 @@ void LLAMA::create_llama_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("down_proj"), std::string("layers." + - // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -273,6 +270,13 @@ void LLAMA::create_llama_model(FFModel &ff, } } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"gate_proj", "up_proj", "down_proj"}; + ff.add_lora_layers(); + } + FileDataLoader *fileloader = new FileDataLoader( "", weight_file_path, diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index e4a7e0056d..6946ed18c3 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -250,6 +250,14 @@ void MPT::create_mpt_model(FFModel &ff, } else { output = ff.argmax(lm_head, /*beam_Search*/ false); } + + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"up_proj", "down_proj"}; + ff.add_lora_layers(); + } + FileDataLoader *fileloader = new FileDataLoader("", weight_file_path, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index b3f2ef4e17..b78dafbe95 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -262,6 +262,13 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ false); } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"fc1", "fc2"}; + ff.add_lora_layers(); + } + FileDataLoader *fileloader = new FileDataLoader( "", weight_file_path, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index cd8bf3a9a7..3da1e82a79 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -224,6 +224,13 @@ void STARCODER::create_starcoder_model( } } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"c_fc", "c_proj"}; + ff.add_lora_layers(); + } + InferenceManager *im = InferenceManager::get_inference_manager(); FileDataLoader *fileloader = new FileDataLoader( "", diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index ec07ee9a5f..5a16fbc34f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -810,6 +810,10 @@ def pipeline_parallelism_degree(self, value): @property def python_data_loader_type(self): return ffc().flexflow_config_get_python_data_loader_type(self.handle) + + @property + def enable_peft(self): + return ffc().flexflow_config_get_enable_peft(self.handle) @property def cpu_offload(self): @@ -4284,8 
+4288,12 @@ def argmax(self, input, beam_search, name=None): self.add_layer(OpType.ARGMAX, name) return Tensor(handle, owner_op_type=OpType.ARGMAX) - def add_lora_layer(self, peft_config): - return ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) + def add_lora_layers(self, target_modules: List[str]): + c_target_modules = [get_c_name(module) for module in target_modules] + return ffc().flexflow_model_add_lora_layers(self.handle, len(target_modules), c_target_modules) + + def register_peft_adapter(self, peft_config): + return ffc().flexflow_model_register_peft_adapter(self.handle, peft_config.handle) def reset_metrics(self): """Reset performance metrics. diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 0e8fbcbd7d..b38ffb2963 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -241,6 +241,10 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(lm_head, 1, False) softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["dense_h_to_4h", "dense_4h_to_h"]) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 96f0258572..0cb2847556 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -248,6 +248,10 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(dense, 1, False) softmax = ffmodel.softmax(dense, -1) output = ffmodel.argmax(softmax, False) + + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["gate_proj", "up_proj", "down_proj"]) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index b350ae106d..4bc3026989 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -252,6 +252,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["up_proj", "down_proj"]) + self.ffmodel = ffmodel # TODO: finish this diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 02668abf59..047e2df013 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -282,6 +282,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["fc1", "fc2"]) + self.ffmodel = ffmodel def convert_hf_weight_name(name): diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 2d4471201f..58c2bf621a 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -220,6 +220,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["c_fc", "c_proj"]) + self.ffmodel = ffmodel def convert_hf_model(model, dst_folder): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 794f1babb3..cfa723d3c6 100644 --- a/python/flexflow/serve/serve.py 
+++ b/python/flexflow/serve/serve.py @@ -443,12 +443,6 @@ def compile( # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() - # Add PEFT layer if registered - for ff_peft_config, peft_dict in self.pefts.items(): - ff_peft_config.ff_compile() - ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) - peft_dict["ff_peft_model_id"] = ff_peft_model_id - # Create file data loader, load weights into tensors model_configs = self.config_class(self.hf_config) @@ -487,6 +481,13 @@ def compile( for ssm in self.ssms: self.rm.register_ssm_model(ssm.model.ffmodel) + # Add PEFT layer if registered + for ff_peft_config, peft_dict in self.pefts.items(): + ff_peft_config.ff_compile() + ff_peft_model_id = self.model.ffmodel.register_peft_adapter(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + + # start background server if (mode == InferenceMode.TREE_VERIFY_MODE) or ( mode == InferenceMode.INC_DECODING_MODE diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e6b246597f..8810cfb30c 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -173,6 +173,11 @@ void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_, handle->pipeline_parallelism_degree = value; } +bool flexflow_config_get_enable_peft(flexflow_config_t handle_) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->enable_peft; +} + int flexflow_config_get_python_data_loader_type(flexflow_config_t handle_) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); return handle->python_data_loader_type; @@ -1549,14 +1554,26 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } -flexflow_peft_model_id_t flexflow_model_add_lora_layer( +void flexflow_model_add_lora_layers(flexflow_model_t handle_, int num_target_modules, char const **target_modules_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + std::vector target_modules; + for (int i = 0; i < num_target_modules; i++) { + target_modules.push_back(target_modules_[i]); + } + DEBUG_PRINT("[Add Lora Layers] model handle: %p, num_target_modules %d", + handle, + num_target_modules); + handle->add_lora_layers(target_modules); +} + +flexflow_peft_model_id_t flexflow_model_register_peft_adapter( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); - PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + PEFTModelID *peft_model_id = handle->register_peft_adapter(*peft_config); - DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + DEBUG_PRINT("[Register PEFT Adapter] model handle: %p, peft_config handle %p, " "peft_model_id: %p", handle, peft_config, From 1691100906ddf25191fb0e1444fa75d0675cd44d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 6 Oct 2024 05:10:58 +0000 Subject: [PATCH 17/37] bug fix --- include/flexflow/batch_config.h | 4 +- include/flexflow/fftype.h | 1 + include/flexflow/model.h | 1 + .../ops/kernels/lora_linear_kernels.h | 4 + include/flexflow/ops/lora_linear.h | 2 - include/flexflow/ops/lora_linear_params.h | 25 ++++-- include/flexflow/request_manager.h | 2 + .../flexflow/utils/peft_weight_allocator.h | 22 +++-- src/ops/kernels/lora_linear_kernels.cu | 41 +++++---- src/ops/lora_linear.cc | 88 +++++++++++-------- src/ops/lora_linear_params.cc | 9 +- src/runtime/fftype.cc | 2 + 
src/runtime/peft_weight_allocator.cc | 43 ++++++--- src/runtime/peft_weight_allocator.cu | 8 +- src/runtime/request_manager.cc | 5 +- 15 files changed, 156 insertions(+), 101 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index cb2f8d3a3d..44d829a7f7 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -94,6 +94,7 @@ class BatchConfig { num_tokens_in_batch = 0; max_length = 0; request_guid = 0; + peft_model_id = PEFTModelID::NO_ID; prompt_phase = false; batch_config_request_id = -1; peft_bwd = false; @@ -109,7 +110,8 @@ class BatchConfig { bool prompt_phase = false; RequestGuid request_guid; // PEFT fields - std::unordered_map peft_adapters; + PEFTModelID peft_model_id; + std::string peft_model_config; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 3e482b8d67..ebc811c262 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -27,6 +27,7 @@ class PEFTModelID { PEFTModelID(size_t id); bool is_valid_id() const; friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs); friend std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d1dbe72d7c..e3beafe20c 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -847,6 +847,7 @@ class FFModel { // ======================================== // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); void add_lora_layers(std::vector target_modules); + PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); // ======================================== // Inference APIs // ======================================== diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index eef3b392b3..00f16af146 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -6,6 +6,7 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/lora_linear.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { @@ -35,6 +36,9 @@ class LoraLinearMeta : public OpMeta { namespace Kernels { namespace LoraLinear { + +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config); + void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 8d37be0c64..1c6070afe4 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -20,12 +20,10 @@ class LoraLinear : public Op { LoraLinear( FFModel &model, LayerID const &layer_guid, - OperatorType type, ParallelTensor const input, ParallelTensor const output, int max_rank, int max_concurrent_adapters, - // std::unordered_map const &_peft_configs, char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index c5a327459f..525a9209d3 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -124,16 +124,28 @@ class LoraLinearConfig { std::vector const &target_modules_ = {}); // constructor used to support 
std::unordered_map LoraLinearConfig(); + + // Method to set optimizer template - void setOptimizer(T&& opt) { - optimizer_config = std::make_unique(std::forward(opt)); + void setOptimizer(T&& opt) { + if constexpr (std::is_base_of_v>) { + optimizer_config = std::make_unique>(std::forward(opt)); + } else if constexpr (std::is_same_v, std::remove_reference_t>) { + optimizer_config = std::move(opt); + } else { + static_assert(always_false, "Unsupported optimizer type"); } + } + // Helper template for static_assert + template + static inline constexpr bool always_false = false; + friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); std::string serialize_to_json_string(int indent=-1) const { - json j = { + nlohmann::json j = { {"cache_folder", cache_folder}, {"peft_model_id", peft_model_id}, {"rank", rank}, @@ -144,7 +156,8 @@ class LoraLinearConfig { {"init_lora_weights", init_lora_weights}, {"base_model_name_or_path", base_model_name_or_path}, {"precision", precision}, - {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + // {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + {"optimizer_config", optimizer_config ? nlohmann::json(optimizer_config->toJson()) : nlohmann::json()} }; return j.dump(indent); // No indentation @@ -156,7 +169,7 @@ class LoraLinearConfig { } // Deserialization method static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) { - json j = json::parse(json_string); + nlohmann::json j = nlohmann::json::parse(json_string); LoraLinearConfig config( j["cache_folder"].get(), j["peft_model_id"].get(), @@ -208,8 +221,6 @@ class LoraLinearConfig { class LoraLinearParams { public: LayerID layer_guid; - // OperatorType type; - // std::unordered_map peft_configs; int max_rank; int max_concurrent_adapters; char name[MAX_OPNAME]; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 542deb336d..628714dcc0 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -149,6 +149,8 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void register_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config); LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 3c9efc0812..9670da8a4f 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -17,12 +17,13 @@ #define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ #include "flexflow/config.h" -#include "lora_linear_params.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/lora_linear_params.h" // #include namespace FlexFlow { -#ifdef DEACODE +#ifdef DEADCODE class PEFTWeightAllocator { public: PEFTWeightAllocator(void *_base_ptr, size_t _total_size) @@ -108,19 +109,21 @@ struct LoraLinearWeight { low_rank_activation(low_rank_activation_), input_activation(input_activation_) {} }; +void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed); + class PEFTMemoryManager { public: - PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int 
max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) + PEFTMemoryManager(Legion::Memory gpu_mem_, int max_rank_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), - max_lora_size(max_lora_size_), + max_rank(max_rank_), in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_), max_peft_tokens(max_peft_tokens_), lora_layername_substr(lora_layername_substr_), dt(dt_), base_ptr(nullptr), finetuning_ptr(nullptr), finetuning_model_id(PEFTModelID::NO_ID) { - + max_lora_size = data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0"); assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0"); allocate_inference_memory(); @@ -146,12 +149,13 @@ class PEFTMemoryManager { LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); // Legion memory management apparatus - Memory gpu_mem; + Legion::Memory gpu_mem; Realm::RegionInstance peftLegionInst; void *base_ptr, *finetuning_ptr; // Size and shapes int max_concurrent_adapters; - size_t max_lora_size; + int max_rank; + int max_lora_size; int in_dim, out_dim, num_shards, shard_id; int max_peft_tokens; // LRU cache apparatus @@ -162,8 +166,8 @@ class PEFTMemoryManager { std::string lora_layername_substr; DataType dt; PEFTModelID finetuning_model_id; -} +}; -}; // namespace FlexFlow +} // namespace FlexFlow #endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index d5baf49cdc..134af3ca6e 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -24,8 +24,10 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) : OpMeta(handler, li) { +#ifdef DEADCODE allocated_peft_buffer_size1 = 0; allocated_peft_buffer_size2 = 0; +#endif } LoraLinearMeta::~LoraLinearMeta(void) {} @@ -145,6 +147,16 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, } } +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + namespace Internal { @@ -289,17 +301,6 @@ void inference_kernel(LoraLinearMeta *m, } #endif -bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { - for (std::string s : config.target_modules) { - std::string n(m->op_name); - if (n.find(s) != std::string::npos) { - return true; - } - } - return false; -} - - template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -326,7 +327,7 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -444,8 +445,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, if 
(bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) { continue; } - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -453,7 +453,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; // int max_peft_tokens = bc->requestsInfo[i].max_length; - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); @@ -562,15 +562,14 @@ void peft_bwd_kernel(LoraLinearMeta *m, } if (bc->requestsInfo[i].optimizer_tasks.update_weights) { - LoraOptimizerConfig const *optimizer_config = lora_config.optimizer_config; - assert(optimizer_config != nullptr); + assert(lora_config.optimizer_config != nullptr); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; // Get optimizer config - if (optimizer_config->getType() == "SGD") { - LoraSGDOptimizerConfig const *sgd_config = - (LoraSGDOptimizerConfig const *)optimizer_config; + + if (lora_config.optimizer_config->getType() == "SGD") { + LoraSGDOptimizerConfig const *sgd_config = static_cast(lora_config.optimizer_config.get()); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<<(weight.w1_grad_ptr), static_cast
(weight.w1_v_values_ptr), static_cast<DT *>
(weight.w1_ptr)); - } else if (optimizer_config->getType() == "Adam") { + } else if (lora_config.optimizer_config->getType() == "Adam") { assert(false && "Adam optimizer type not implemented yet"); } else { assert(false && "Unsupported optimizer type"); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index a18f47c4ac..f7ac4ff06e 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -6,6 +6,7 @@ #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" +#include "flexflow/request_manager.h" #include #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -51,13 +52,13 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -void FFmodel::add_lora_layers(std::vector target_modules) { +void FFModel::add_lora_layers(std::vector target_modules) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_modules.size() > 0 && "LoRA target module name is empty"); RequestManager *rm = RequestManager::get_request_manager(); int max_lora_rank = rm->get_max_lora_rank(); int max_concurrent_adapters = rm->get_max_concurrent_adapters(); - assert(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); + assert(max_lora_rank > 1 && max_lora_rank <= 32 && "Invalid max LoRA rank"); assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); for (std::string target_module_name : target_modules) { @@ -120,7 +121,7 @@ void FFmodel::add_lora_layers(std::vector target_modules) { true /*create_grad*/); } // pass max_rank and max_concurrent_adapters to OP_LORA layer - peft_layer->add_int_property("max_rank", max_rank); + peft_layer->add_int_property("max_rank", max_lora_rank); peft_layer->add_int_property("max_concurrent_adapters", max_concurrent_adapters); it = layers.insert(it + 1, peft_layer); ++it; @@ -263,7 +264,7 @@ Op *LoraLinear::create_operator_from_layer( long long value; layer->get_int_property("max_rank", value); int max_rank = value; - layer->get_int_property("max_concurrent_adapters", max_concurrent_adapters); + layer->get_int_property("max_concurrent_adapters", value); int max_concurrent_adapters = value; #ifdef DEADCODE std::unordered_map _peft_configs; @@ -276,7 +277,6 @@ Op *LoraLinear::create_operator_from_layer( #endif return new LoraLinear(model, layer->layer_guid, - layer->op_type, inputs[0], inputs[1], max_rank, @@ -290,7 +290,6 @@ LoraLinear::LoraLinear(FFModel &model, ParallelTensor const output) : LoraLinear(model, other.layer_guid, - other.op_type, input, output, other.max_rank, @@ -303,7 +302,6 @@ LoraLinear::LoraLinear(FFModel &model, char const *name) : LoraLinear(model, params.layer_guid, - params.type, inputs.first, inputs.second, params.max_rank, @@ -313,7 +311,6 @@ LoraLinear::LoraLinear(FFModel &model, LoraLinear::LoraLinear( FFModel &model, LayerID const &_layer_guid, - OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, int _max_rank, @@ -321,7 +318,7 @@ LoraLinear::LoraLinear( // std::unordered_map const &_peft_configs, char const *name) : Op(model, - _op_type, + OP_LORA, _output->data_type, name, 2 /*inputs*/, @@ -473,9 +470,8 @@ OpMeta *LoraLinear::init_task(Task const *task, lora_layername.substr(0, found + searchString.length()); // allocate space for lora weights - size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - 
m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, max_lora_size, lora->max_concurrent_adapters, in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); + m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, lora->max_rank, lora->max_concurrent_adapters, BatchConfig::max_sequence_length(), in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); m->peft_memory_manager->allocate_inference_memory(); return m; } @@ -709,8 +705,8 @@ void LoraLinear::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); @@ -761,12 +757,15 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - int rank, num_tokens; - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; - rank = weight.rank; - num_tokens = input.domain.get_volume() / weight.in_dim; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id) / layername; std::string filenameA = @@ -775,21 +774,38 @@ void LoraLinear::inference_task(Task const *task, dst_filepath_weights.string() + ".weight_B.original"; if (m->input_type[0] == DT_FLOAT) { save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else if (m->input_type[0] == DT_HALF) { save_tensor((half *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else { assert(false && "Data type not supported"); } + + if (bc->requestsInfo[i].peft_bwd) { + int num_tokens = input.domain.get_volume() / in_dim; + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } } filename = dst_filepath.string() + ".output_0"; @@ -803,21 +819,7 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - if (bc->num_active_peft_tokens() > 0) { - // input activation (intermediate) - filename = 
dst_filepath.string() + ".low_rank_activation"; - if (output.data_type == DT_FLOAT) { - save_tensor((float *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else if (output.data_type == DT_HALF) { - save_tensor((half *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else { - assert(false); - } - } + m->decoding_step++; } } @@ -905,6 +907,16 @@ void lora_inference_debugging(LoraLinearMeta *m, // weights, weights gradients fs::path dst_filepath_weights = get_dst_folder("weights", m->bwd_step, shard_id) / layername; + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + assert(m->model_state.size() >= 1 && "Model state empty!"); for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { PEFTModelID peft_model_id = it->first; diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 310b6d0973..c7b9fcc711 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -170,11 +170,10 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "trainable: " << llc.trainable << ", "; if (llc.optimizer_config != nullptr) { os << "optimizer_config: "; - if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) { - os << *static_cast(llc.optimizer_config); - } else if (typeid(*llc.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - os << *static_cast(llc.optimizer_config); + if (llc.optimizer_config.get()->getType() == "SGD") { + os << *static_cast(llc.optimizer_config.get()); + } else if (llc.optimizer_config.get()->getType() == "Adam") { + os << *static_cast(llc.optimizer_config.get()); } else { os << "Unknown optimizer config type"; } diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 8213726e8a..0af5f45350 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,6 +46,8 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } +bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { return !(lhs == rhs); } + std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { if (peft_model_id == PEFTModelID::NO_ID) { os << "NO_ID"; diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index cc40d666ed..287eb7e20a 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -1,6 +1,24 @@ -#include "peft_weight_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { +// declare legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { // allocate chunk of memory for all the PEFT adapters @@ -21,7 +39,7 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size 
= max_lora_size*3; // weights, gradients, momentum values - ft_size += max_peft_tokens*(in_dim+rank); // input, low-rank activations + ft_size += max_peft_tokens * (in_dim + max_rank); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), @@ -144,7 +162,7 @@ void load_peft_from_file(DT *ptr, void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config) { // Load weights - assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr "PEFT Memory Manager weight ptr null"); + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && "PEFT Memory Manager weight ptr null"); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; // values below represent total weight sizes before sharding. Lora B is not @@ -235,7 +253,7 @@ LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_ int data_size = data_type_size(dt); LoraLinearWeight result; result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; - result.w1_ptr = result.w0_ptr + w0_num_elements * data_size; + result.w1_ptr = static_cast(result.w0_ptr) + w0_num_elements * data_size; if (cache_miss) { load_peft_model(result, lora_config); } @@ -244,19 +262,20 @@ LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_ LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); - bool cache_miss = get_finetuning_slot(model_id); + bool cache_miss; + get_finetuning_slot(model_id, &cache_miss); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; int data_size = data_type_size(dt); LoraLinearWeight result; result.w0_ptr = finetuning_ptr; - result.w1_ptr = result.w0_ptr + w0_num_elements*data_size; - result.w0_grad_ptr = result.w1_ptr + w1_num_elements*data_size; - result.w1_grad_ptr = result.w0_grad_ptr + w0_num_elements*data_size; - result.w0_v_values_ptr = result.w1_grad_ptr + w1_num_elements*data_size; - result.w1_v_values_ptr = result.w0_v_values_ptr + w0_num_elements*data_size; - result.input_activation = result.w1_v_values_ptr + w1_num_elements*data_size; // max_peft_tokens*in_dim - result.low_rank_activation = result.input_activation + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank + result.w1_ptr = static_cast(result.w0_ptr)+ w0_num_elements*data_size; + result.w0_grad_ptr = static_cast(result.w1_ptr) + w1_num_elements*data_size; + result.w1_grad_ptr = static_cast(result.w0_grad_ptr) + w0_num_elements*data_size; + result.w0_v_values_ptr = static_cast(result.w1_grad_ptr) + w1_num_elements*data_size; + result.w1_v_values_ptr = static_cast(result.w0_v_values_ptr) + w0_num_elements*data_size; + result.input_activation = static_cast(result.w1_v_values_ptr) + w1_num_elements*data_size; // max_peft_tokens*in_dim + result.low_rank_activation = static_cast(result.input_activation) + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank if (cache_miss) { load_peft_model(result, lora_config); } diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu index cc8d095069..bc9ab443cb 100644 --- a/src/runtime/peft_weight_allocator.cu +++ b/src/runtime/peft_weight_allocator.cu @@ -8,7 +8,7 @@ namespace FlexFlow { template -void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, 
cudaStream_t stream) { +void lora_init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, cudaStream_t stream) { // Initialize generator std::mt19937 gen(seed); @@ -47,7 +47,7 @@ void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int ra num = num_float; } } - checkCUDA(cudaMemcpyAsync(static_cast
(w1_ptr), + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), lora_b_random_init.data(), w1_num_elements * sizeof(DT), cudaMemcpyHostToDevice, @@ -59,9 +59,9 @@ void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int ou checkCUDA(get_legion_stream(&stream)); if (dt == DT_FLOAT) { - Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); } else if (dt == DT_HALF) { - Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 79fcdfdcfe..2377a4f938 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -260,7 +260,8 @@ void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - peft_configs[peft_model_id] = peft_config; + // peft_configs[peft_model_id] = std::move(peft_config); + peft_configs.emplace(peft_model_id, std::move(peft_config)); } LoraLinearConfig const &RequestManager::get_peft_config( @@ -284,7 +285,7 @@ int RequestManager::get_max_concurrent_adapters() { return max_concurrent_adapters; } -PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const peft_config) { +PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) { From 7ff96d782ac71fc05c943f2bebdd4be616fbe91d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 6 Oct 2024 06:16:05 +0000 Subject: [PATCH 18/37] fixes --- include/flexflow/flexflow_c.h | 8 +- include/flexflow/model.h | 12 +- .../ops/kernels/lora_linear_kernels.h | 3 +- include/flexflow/ops/lora_linear.h | 15 +- include/flexflow/ops/lora_linear_params.h | 130 ++---- include/flexflow/request_manager.h | 4 +- .../flexflow/utils/peft_weight_allocator.h | 94 ++-- inference/models/falcon.cc | 5 +- inference/models/llama.cc | 5 +- inference/models/mpt.cc | 4 +- inference/models/opt.cc | 2 +- inference/models/starcoder.cc | 2 +- src/c/flexflow_c.cc | 15 +- src/ops/kernels/lora_linear_kernels.cu | 50 ++- src/ops/lora_linear.cc | 143 +++--- src/ops/lora_linear_params.cc | 115 ++++- src/runtime/fftype.cc | 4 +- src/runtime/inference_manager.cc | 1 - src/runtime/peft_weight_allocator.cc | 418 +++++++++--------- src/runtime/peft_weight_allocator.cu | 92 ++-- src/runtime/request_manager.cc | 72 +-- 21 files changed, 673 insertions(+), 521 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 19b2bc7c83..7a68c6566f 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -600,10 +600,12 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); -void flexflow_model_add_lora_layers(flexflow_model_t handle_, int num_target_modules, char const **target_modules_); +void flexflow_model_add_lora_layers(flexflow_model_t handle_, + int num_target_modules, + char const **target_modules_); - -flexflow_peft_model_id_t flexflow_model_register_peft_adapter(flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); +flexflow_peft_model_id_t flexflow_model_register_peft_adapter( + flexflow_model_t handle_, const flexflow_lora_linear_config_t 
peft_config_); void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index e3beafe20c..82f0a9add1 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -845,9 +845,9 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== -// PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); - void add_lora_layers(std::vector target_modules); - PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); + // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + void add_lora_layers(std::vector target_modules); + PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); // ======================================== // Inference APIs // ======================================== @@ -1182,9 +1182,9 @@ class FFModel { std::vector parameters; // PEFT related std::unordered_map base_layer_to_peft_layer; -// std::unordered_map> peft_layer_to_peft_id; -// std::unordered_map peft_configs; - // std::vector peft_operators; + // std::unordered_map> + // peft_layer_to_peft_id; std::unordered_map + // peft_configs; std::vector peft_operators; FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 00f16af146..b3e047fc0e 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -37,7 +37,8 @@ class LoraLinearMeta : public OpMeta { namespace Kernels { namespace LoraLinear { -bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config); +bool lora_applies_to_this_layer(LoraLinearMeta *m, + LoraLinearConfig const &config); void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 1c6070afe4..cc625cafc2 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -17,14 +17,13 @@ class LoraLinear : public Op { using Params = LoraLinearParams; using Input = std::pair; - LoraLinear( - FFModel &model, - LayerID const &layer_guid, - ParallelTensor const input, - ParallelTensor const output, - int max_rank, - int max_concurrent_adapters, - char const *name = nullptr); + LoraLinear(FFModel &model, + LayerID const &layer_guid, + ParallelTensor const input, + ParallelTensor const output, + int max_rank, + int max_concurrent_adapters, + char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 525a9209d3..1dfe5f17bd 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -19,7 +19,7 @@ class LoraOptimizerConfig { LoraOptimizerConfig(); virtual std::string getType() const = 0; virtual nlohmann::json toJson() const = 0; - static std::unique_ptr fromJson(const nlohmann::json& j); + static std::unique_ptr fromJson(nlohmann::json const &j); virtual ~LoraOptimizerConfig() = default; }; @@ -32,26 +32,16 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig { bool weight_decay_ = 0.0f); friend std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc); - - std::string getType() const 
override { return "SGD"; } - - nlohmann::json toJson() const override { - return {{"type", "SGD"}, - {"lr", lr}, - {"momentum", momentum}, - {"nesterov", nesterov}, - {"weight_decay", weight_decay}}; - } - static std::unique_ptr fromJson(const nlohmann::json& j) { - auto sgd = std::make_unique(); - sgd->lr = j["lr"]; - sgd->momentum = j["momentum"]; - sgd->nesterov = j["nesterov"]; - sgd->weight_decay = j["weight_decay"]; - return sgd; + std::string getType() const override { + return "SGD"; } + nlohmann::json toJson() const override; + + static std::unique_ptr + fromJson(nlohmann::json const &j); + public: double lr = 0.001f; double momentum = 0.0f; @@ -69,28 +59,16 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { double epsilon_ = 1e-8); friend std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc); - - std::string getType() const override { return "Adam"; } - - nlohmann::json toJson() const override { - return {{"type", "Adam"}, - {"alpha", alpha}, - {"beta1", beta1}, - {"beta2", beta2}, - {"weight_decay", weight_decay}, - {"epsilon", epsilon}}; - } - static std::unique_ptr fromJson(const nlohmann::json& j) { - auto adam = std::make_unique(); - adam->alpha = j["alpha"]; - adam->beta1 = j["beta1"]; - adam->beta2 = j["beta2"]; - adam->weight_decay = j["weight_decay"]; - adam->epsilon = j["epsilon"]; - return adam; + std::string getType() const override { + return "Adam"; } + nlohmann::json toJson() const override; + + static std::unique_ptr + fromJson(nlohmann::json const &j); + public: // Adam double alpha = 0.001f; @@ -100,14 +78,6 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { double epsilon = 1e-8; }; -std::unique_ptr LoraOptimizerConfig::fromJson(const nlohmann::json& j) { - std::string type = j["type"]; - if (type == "SGD") return LoraSGDOptimizerConfig::fromJson(j); - if (type == "Adam") return LoraAdamOptimizerConfig::fromJson(j); - throw std::runtime_error("Unknown optimizer type"); -} - - class LoraLinearConfig { public: static const LoraLinearConfig EmptyConfig; @@ -126,11 +96,14 @@ class LoraLinearConfig { LoraLinearConfig(); // Method to set optimizer - template - void setOptimizer(T&& opt) { - if constexpr (std::is_base_of_v>) { - optimizer_config = std::make_unique>(std::forward(opt)); - } else if constexpr (std::is_same_v, std::remove_reference_t>) { + template + void setOptimizer(T &&opt) { + if constexpr (std::is_base_of_v>) { + optimizer_config = + std::make_unique>(std::forward(opt)); + } else if constexpr (std::is_same_v, + std::remove_reference_t>) { optimizer_config = std::move(opt); } else { static_assert(always_false, "Unsupported optimizer type"); @@ -139,62 +112,19 @@ class LoraLinearConfig { // Helper template for static_assert template static inline constexpr bool always_false = false; - + friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); - std::string serialize_to_json_string(int indent=-1) const { - nlohmann::json j = { - {"cache_folder", cache_folder}, - {"peft_model_id", peft_model_id}, - {"rank", rank}, - {"lora_alpha", lora_alpha}, - {"lora_dropout", lora_dropout}, - {"target_modules", target_modules}, - {"trainable", trainable}, - {"init_lora_weights", init_lora_weights}, - {"base_model_name_or_path", base_model_name_or_path}, - {"precision", precision}, - // {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} - {"optimizer_config", optimizer_config ? 
nlohmann::json(optimizer_config->toJson()) : nlohmann::json()} - }; - - return j.dump(indent); // No indentation - } - void serialize_to_json_file(const std::string& filename) const { - std::string j = serialize_to_json_string(4); - std::ofstream file(filename); - file << j; - } + std::string serialize_to_json_string(int indent = -1) const; + void serialize_to_json_file(std::string const &filename) const; // Deserialization method - static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) { - nlohmann::json j = nlohmann::json::parse(json_string); - LoraLinearConfig config( - j["cache_folder"].get(), - j["peft_model_id"].get(), - j["trainable"].get(), - nullptr, // optimizer_config will be set later if present - j["init_lora_weights"].get(), - j["base_model_name_or_path"].get(), - j["precision"].get(), - j["rank"].get(), - j["lora_alpha"].get(), - j["lora_dropout"].get(), - j["target_modules"].get>() - ); - if (!j["optimizer_config"].is_null()) { - config.setOptimizer(LoraOptimizerConfig::fromJson(j["optimizer_config"])); - } - return config; - } + static LoraLinearConfig + deserialize_from_json_string(std::string const &json_string); // Deserialization method - static LoraLinearConfig deserialize_from_json_file(const std::string& filename) { - std::ifstream file(filename); - std::string j; - file >> j; - return deserialize_from_json_string(j); - } + static LoraLinearConfig + deserialize_from_json_file(std::string const &filename); std::string cache_folder; // Huggingface model ID (for download and/or upload) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 628714dcc0..ce75d2e0d3 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -151,7 +151,7 @@ class RequestManager { void register_output_filepath(std::string const &); void register_peft_config(PEFTModelID const &peft_model_id, LoraLinearConfig const &peft_config); - LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); + LoraLinearConfig const &get_peft_config(PEFTModelID const &peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); int get_max_lora_rank(); @@ -295,7 +295,7 @@ class RequestManager { int max_spec_tree_token_num; int max_sequence_length; Status request_manager_status; - + // peft std::unordered_map peft_configs; int max_lora_rank; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 9670da8a4f..bd8ddb1dce 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -101,52 +101,83 @@ struct LoraLinearWeight { void *low_rank_activation; // v values for SGD optimizer (when using momentum) void *w0_v_values_ptr, *w1_v_values_ptr; - LoraLinearWeight(void *w0=nullptr, void *w1=nullptr, void *w0_grad=nullptr, void *w1_grad=nullptr, - void *w0_v_values=nullptr, void *w1_v_values=nullptr, void *low_rank_activation_=nullptr, void *input_activation_=nullptr) - : w0_ptr(w0), w1_ptr(w1), - w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), - w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values), - low_rank_activation(low_rank_activation_), input_activation(input_activation_) {} + LoraLinearWeight(void *w0 = nullptr, + void *w1 = nullptr, + void *w0_grad = nullptr, + void *w1_grad = nullptr, + void *w0_v_values = nullptr, + void *w1_v_values = nullptr, + void *low_rank_activation_ = nullptr, + void *input_activation_ = nullptr) + : 
w0_ptr(w0), w1_ptr(w1), w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), + w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values), + low_rank_activation(low_rank_activation_), + input_activation(input_activation_) {} }; -void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed); +void init_peft_weight_wrapper(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + DataType dt, + int seed); class PEFTMemoryManager { public: - PEFTMemoryManager(Legion::Memory gpu_mem_, int max_rank_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) - : gpu_mem(gpu_mem_), - max_concurrent_adapters(max_concurrent_adapters_), - max_rank(max_rank_), - in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_), - max_peft_tokens(max_peft_tokens_), - lora_layername_substr(lora_layername_substr_), dt(dt_), - base_ptr(nullptr), - finetuning_ptr(nullptr), - finetuning_model_id(PEFTModelID::NO_ID) { - max_lora_size = data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); - assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0"); - assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0"); + PEFTMemoryManager(Legion::Memory gpu_mem_, + int max_rank_, + int max_concurrent_adapters_, + int max_peft_tokens_, + int in_dim_, + int out_dim_, + int num_shards_, + int shard_id_, + std::string const &lora_layername_substr_, + DataType dt_) + : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), + max_rank(max_rank_), in_dim(in_dim_), out_dim(out_dim_), + num_shards(num_shards_), shard_id(shard_id_), + max_peft_tokens(max_peft_tokens_), + lora_layername_substr(lora_layername_substr_), dt(dt_), + base_ptr(nullptr), finetuning_ptr(nullptr), + finetuning_model_id(PEFTModelID::NO_ID) { + max_lora_size = + data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); + assert(max_concurrent_adapters > 0 && + "PEFT Memory Manager max_concurrent_adapters must be > 0"); + assert(max_lora_size > 0 && + "PEFT Memory Manager max_lora_size must be > 0"); allocate_inference_memory(); - // finetuning memory is allocated upon the first finetuning request, so we can skip for inference-only workloads + // finetuning memory is allocated upon the first finetuning request, so we + // can skip for inference-only workloads } - // allocate memory for all the PEFT adapters for a given layer on a given shard + // allocate memory for all the PEFT adapters for a given layer on a given + // shard void allocate_inference_memory(); - // allocate memory for the PEFT adapter for a finetuning request for a given layer and shard + // allocate memory for the PEFT adapter for a finetuning request for a given + // layer and shard void allocate_finetuning_memory(); - LoraLinearWeight get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + LoraLinearWeight get_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); void check_ft_model_id(PEFTModelID const &model_id); private: - // Check if the PEFT adapter for the given model is in memory. If not, sets the cache_miss flag to true. If this is the first finetuning request, allocate memory for the finetuning adapter. + // Check if the PEFT adapter for the given model is in memory. If not, sets + // the cache_miss flag to true. 
If this is the first finetuning request, + // allocate memory for the finetuning adapter. void get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss); - // Returns the slot in memory where the peft model weights are/will be stored. - // If the model is not in memory (cache miss), set the cache_miss flag to true. + // Returns the slot in memory where the peft model weights are/will be stored. + // If the model is not in memory (cache miss), set the cache_miss flag to + // true. int get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss); - void load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config); - LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); - LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + void load_peft_model(LoraLinearWeight &weight, + LoraLinearConfig const &lora_config); + LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); + LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); // Legion memory management apparatus Legion::Memory gpu_mem; @@ -160,7 +191,8 @@ class PEFTMemoryManager { int max_peft_tokens; // LRU cache apparatus std::unordered_map lru_hashtable; - std::vector lru_list; // head = least recently used, tail=most recently used + std::vector + lru_list; // head = least recently used, tail=most recently used std::unordered_map peft2mem_slot; // Miscellanea std::string lora_layername_substr; diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 945c55f296..318ee128ad 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -245,8 +245,9 @@ void FALCON::create_falcon_model(FFModel &ff, // If PEFT is enabled, add LoRA layers if (ff.config.enable_peft) { // todo: add attention projections - std::vector target_modules = {"dense_h_to_4h", "dense_4h_to_h"}; - ff.add_lora_layers(); + std::vector target_modules = {"dense_h_to_4h", + "dense_4h_to_h"}; + ff.add_lora_layers(target_modules); } FileDataLoader *fileloader = diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 6a70620942..bc4c80b155 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -273,8 +273,9 @@ void LLAMA::create_llama_model(FFModel &ff, // If PEFT is enabled, add LoRA layers if (ff.config.enable_peft) { // todo: add attention projections - std::vector target_modules = {"gate_proj", "up_proj", "down_proj"}; - ff.add_lora_layers(); + std::vector target_modules = { + "gate_proj", "up_proj", "down_proj"}; + ff.add_lora_layers(target_modules); } FileDataLoader *fileloader = new FileDataLoader( diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 6946ed18c3..b16729f02e 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -250,12 +250,12 @@ void MPT::create_mpt_model(FFModel &ff, } else { output = ff.argmax(lm_head, /*beam_Search*/ false); } - + // If PEFT is enabled, add LoRA layers if (ff.config.enable_peft) { // todo: add attention projections std::vector target_modules = {"up_proj", "down_proj"}; - ff.add_lora_layers(); + ff.add_lora_layers(target_modules); } FileDataLoader *fileloader = diff --git a/inference/models/opt.cc b/inference/models/opt.cc index b78dafbe95..a892cb9891 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -266,7 +266,7 @@ void OPT::create_opt_model(FFModel &ff, if (ff.config.enable_peft) { // 
todo: add attention projections std::vector target_modules = {"fc1", "fc2"}; - ff.add_lora_layers(); + ff.add_lora_layers(target_modules); } FileDataLoader *fileloader = new FileDataLoader( diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 3da1e82a79..18d51cbae0 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -228,7 +228,7 @@ void STARCODER::create_starcoder_model( if (ff.config.enable_peft) { // todo: add attention projections std::vector target_modules = {"c_fc", "c_proj"}; - ff.add_lora_layers(); + ff.add_lora_layers(target_modules); } InferenceManager *im = InferenceManager::get_inference_manager(); diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 8810cfb30c..b9b4300828 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1554,7 +1554,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } -void flexflow_model_add_lora_layers(flexflow_model_t handle_, int num_target_modules, char const **target_modules_) { +void flexflow_model_add_lora_layers(flexflow_model_t handle_, + int num_target_modules, + char const **target_modules_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); std::vector target_modules; for (int i = 0; i < num_target_modules; i++) { @@ -1573,11 +1575,12 @@ flexflow_peft_model_id_t flexflow_model_register_peft_adapter( LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); PEFTModelID *peft_model_id = handle->register_peft_adapter(*peft_config); - DEBUG_PRINT("[Register PEFT Adapter] model handle: %p, peft_config handle %p, " - "peft_model_id: %p", - handle, - peft_config, - peft_model_id); + DEBUG_PRINT( + "[Register PEFT Adapter] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); return FFCObjectWrapper::wrap(peft_model_id); } diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 134af3ca6e..5e24b6a873 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -147,7 +147,8 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, } } -bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { +bool lora_applies_to_this_layer(LoraLinearMeta *m, + LoraLinearConfig const &config) { for (std::string s : config.target_modules) { std::string n(m->op_name); if (n.find(s) != std::string::npos) { @@ -159,7 +160,6 @@ bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &confi namespace Internal { - #ifdef DEADCODE template void inference_kernel(LoraLinearMeta *m, @@ -320,23 +320,30 @@ void inference_kernel(LoraLinearMeta *m, cudaDataType_t compute_type = output_type; int num_peft_requests = 0; - for (int i=0; i< bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && 
"Trainable flag mismatch"); + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; // int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); - void *intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) ? weight.low_rank_activation : m->handle.workSpace; + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); + void *intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) + ? weight.low_rank_activation + : m->handle.workSpace; if (bc->requestsInfo[i].peft_bwd) { checkCUDA(cudaMemcpyAsync(weight.input_activation, input_ptr + first_token_offset * in_dim, @@ -346,8 +353,8 @@ void inference_kernel(LoraLinearMeta *m, stream)); } else { // use workspace to save intermediate result - assert(m->handle.workSpaceSize >= - data_type_size(m->input_type[1]) * num_peft_tokens * lora_config.rank); + assert(m->handle.workSpaceSize >= data_type_size(m->input_type[1]) * + num_peft_tokens * lora_config.rank); } DT alpha = 1.0f, beta = 0.0f; // buffer = weight_first * input @@ -439,22 +446,29 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; cudaDataType_t compute_type = output_type; - + for (int i = 0; i < bc->max_requests_per_batch(); i++) { // Skip completed, non-PEFT and PEFT forward-only requests - if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); - m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + "Trainable flag mismatch"); + m->peft_memory_manager->check_ft_model_id( + bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; // int max_peft_tokens = bc->requestsInfo[i].max_length; // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); // Compute LORA_B weight's gradient @@ -569,7 +583,9 @@ void peft_bwd_kernel(LoraLinearMeta *m, // Get optimizer config if (lora_config.optimizer_config->getType() == "SGD") { - LoraSGDOptimizerConfig const *sgd_config = static_cast(lora_config.optimizer_config.get()); + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + lora_config.optimizer_config.get()); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<< #include #if defined(FF_USE_CUDA) || 
defined(FF_USE_HIP_CUDA) @@ -53,16 +53,19 @@ bool check_lora_layer_match(Layer *potential_target, } void FFModel::add_lora_layers(std::vector target_modules) { - assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_modules.size() > 0 && "LoRA target module name is empty"); RequestManager *rm = RequestManager::get_request_manager(); int max_lora_rank = rm->get_max_lora_rank(); int max_concurrent_adapters = rm->get_max_concurrent_adapters(); assert(max_lora_rank > 1 && max_lora_rank <= 32 && "Invalid max LoRA rank"); - assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); + assert(max_concurrent_adapters > 0 && + "Invalid number of LoRA concurrent adapters"); for (std::string target_module_name : target_modules) { - assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); // find target layer for (auto it = layers.begin(); it != layers.end(); ++it) { Layer *target_module = *it; @@ -70,15 +73,16 @@ void FFModel::add_lora_layers(std::vector target_modules) { if (!match) { continue; } - assert(base_layer_to_peft_layer.find(target_module) == base_layer_to_peft_layer.end() && "LoRA layer already added, attempting to add again"); + assert(base_layer_to_peft_layer.find(target_module) == + base_layer_to_peft_layer.end() && + "LoRA layer already added, attempting to add again"); // Get input and output tensors from target module Tensor const input = target_module->inputs[0]; Tensor const output = target_module->outputs[0]; assert(input->data_type == output->data_type); // Compute OP_LORA layer name, based on target module name - std::string name_ = target_module->name - ? std::string(target_module->name) - : std::string(""); + std::string name_ = target_module->name ? 
std::string(target_module->name) + : std::string(""); size_t last_underscore = name_.length() - 1; for (int i = name_.length() - 1; i > 0; i--) { if (!(std::isdigit(target_module->name[i]) || @@ -101,7 +105,8 @@ void FFModel::add_lora_layers(std::vector target_modules) { 1 /*outputs*/, input, output); - // fix LoRA layer's transformer layer ID and model ID (to be the same as target module) + // fix LoRA layer's transformer layer ID and model ID (to be the same as + // target module) peft_layer->layer_guid.transformer_layer_id = target_module->layer_guid.transformer_layer_id; peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; @@ -122,7 +127,8 @@ void FFModel::add_lora_layers(std::vector target_modules) { } // pass max_rank and max_concurrent_adapters to OP_LORA layer peft_layer->add_int_property("max_rank", max_lora_rank); - peft_layer->add_int_property("max_concurrent_adapters", max_concurrent_adapters); + peft_layer->add_int_property("max_concurrent_adapters", + max_concurrent_adapters); it = layers.insert(it + 1, peft_layer); ++it; base_layer_to_peft_layer[target_module] = peft_layer; @@ -293,7 +299,7 @@ LoraLinear::LoraLinear(FFModel &model, input, output, other.max_rank, - other.max_concurrent_adapters, + other.max_concurrent_adapters, other.name) {} LoraLinear::LoraLinear(FFModel &model, @@ -468,16 +474,26 @@ OpMeta *LoraLinear::init_task(Task const *task, } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); - + // allocate space for lora weights Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, lora->max_rank, lora->max_concurrent_adapters, BatchConfig::max_sequence_length(), in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); + m->peft_memory_manager = + new PEFTMemoryManager(gpu_mem, + lora->max_rank, + lora->max_concurrent_adapters, + BatchConfig::max_sequence_length(), + in_dim, + out_dim, + num_shards, + shard_id, + lora_layername_substr, + dt); m->peft_memory_manager->allocate_inference_memory(); return m; } #ifdef DEADCODE -void load_peft_adapters(BatchConfig const *bc){ +void load_peft_adapters(BatchConfig const *bc) { for (auto const &kv : bc->peft_configs) { PEFTModelID const &model_id = kv.first; LoraLinearConfig const &lora_config = kv.second; @@ -758,14 +774,18 @@ void LoraLinear::inference_task(Task const *task, } for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id) / layername; std::string filenameA = @@ -819,7 +839,6 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - m->decoding_step++; } } @@ -874,6 +893,8 @@ void lora_inference_debugging(LoraLinearMeta *m, GenericTensorAccessorW 
input_grad, GenericTensorAccessorR output_grad, int shard_id) { + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // get layer name std::string lora_layername = std::string(m->op_name); std::string searchString = "lora"; @@ -907,20 +928,21 @@ void lora_inference_debugging(LoraLinearMeta *m, // weights, weights gradients fs::path dst_filepath_weights = get_dst_folder("weights", m->bwd_step, shard_id) / layername; - + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - - assert(m->model_state.size() >= 1 && "Model state empty!"); - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); std::string filename_weight_A = dst_filepath_weights.string() + ".weight_A.finetuned"; std::string filename_weight_B = @@ -932,36 +954,36 @@ void lora_inference_debugging(LoraLinearMeta *m, if (m->input_type[0] == DT_FLOAT) { // weight A save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_weight_A.c_str()); // weight grad A save_tensor((float *)weight.w0_grad_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_grad_A.c_str()); // weight B save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_weight_B.c_str()); // weight grad B save_tensor((float *)weight.w1_grad_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_grad_B.c_str()); } else if (m->input_type[0] == DT_HALF) { // weight A save_tensor((half *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_weight_A.c_str()); // weight grad A save_tensor((half *)weight.w0_grad_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_grad_A.c_str()); // weight B save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_weight_B.c_str()); // weight grad B save_tensor((half *)weight.w1_grad_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_grad_B.c_str()); } else { assert(false && "Data type not supported"); @@ -1040,62 +1062,49 @@ void save_peft_weights_if_needed(LoraLinearMeta *m, } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - continue; - } - // Skip PEFT forward-only requests - if (!bc->requestsInfo[i].peft_bwd) { + LoraLinearConfig lora_config = + 
LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } if (bc->requestsInfo[i].optimizer_tasks.save_updated_weights) { - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); std::string weight_export_folder = join_path({ - m->model_state[bc->requestsInfo[i].peft_model_id].cache_folder, + lora_config.cache_folder, "finetuned_models", - m->model_state[bc->requestsInfo[i].peft_model_id].peft_model_id, + lora_config.peft_model_id, "weights", "shard_" + std::to_string(shard_id), }); fs::create_directories(weight_export_folder); - int rank = m->model_state[bc->requestsInfo[i].peft_model_id].weights.rank; + int rank = lora_config.rank; int w0_num_elements = rank * in_dim; int w1_num_elements = rank * out_dim; std::string w0_filepath = join_path( {weight_export_folder, lora_layername_substr + "_A.weight"}); std::string w1_filepath = join_path( {weight_export_folder, lora_layername_substr + "_B.weight"}); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); if (m->input_type[0] == DT_FLOAT) { - save_peft_to_file( - (float *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w0_ptr, - w0_num_elements, - w0_filepath); + save_peft_to_file((float *)weight.w0_ptr, w0_num_elements, w0_filepath); if (shard_id == 0) { save_peft_to_file( - (float *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w1_ptr, - w1_num_elements, - w1_filepath); + (float *)weight.w1_ptr, w1_num_elements, w1_filepath); } } else if (m->input_type[0] == DT_HALF) { - save_peft_to_file( - (half *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w0_ptr, - w0_num_elements, - w0_filepath); + save_peft_to_file((half *)weight.w0_ptr, w0_num_elements, w0_filepath); if (shard_id == 0) { save_peft_to_file( - (half *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w1_ptr, - w1_num_elements, - w1_filepath); + (half *)weight.w1_ptr, w1_num_elements, w1_filepath); } } else { assert(false && "Data type not supported"); @@ -1214,7 +1223,7 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->max_rank); sez.serialize(this->max_concurrent_adapters); -#ifdef DEADCODE +#ifdef DEADCODE sez.serialize(this->op_type); sez.serialize(this->peft_configs.size()); for (auto const &kv : this->peft_configs) { @@ -1334,7 +1343,7 @@ Node LoraLinear::deserialize(FFModel &ff, params.peft_configs.emplace( std::make_pair(peft_model_id, *lora_linear_config)); } -#endif +#endif dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); @@ -1384,7 +1393,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.model_id); hash_combine(key, params.max_rank); hash_combine(key, params.max_concurrent_adapters); -#ifdef DEADCODE +#ifdef DEADCODE for (auto const &kv : params.peft_configs) { hash_combine(key, kv.first.id); hash_combine(key, kv.second.rank); diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index c7b9fcc711..61c9c15336 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -12,6 +12,18 @@ namespace FlexFlow { // empty optimizer LoraOptimizerConfig::LoraOptimizerConfig() {} +std::unique_ptr + LoraOptimizerConfig::fromJson(nlohmann::json const &j) { + std::string type = j["type"]; + if (type == "SGD") { + return 
LoraSGDOptimizerConfig::fromJson(j); + } + if (type == "Adam") { + return LoraAdamOptimizerConfig::fromJson(j); + } + throw std::runtime_error("Unknown optimizer type"); +} + // SGD optimizer LoraSGDOptimizerConfig::LoraSGDOptimizerConfig() : lr(0.001f), momentum(0.0f), nesterov(false), weight_decay(0.0f) {} @@ -30,6 +42,24 @@ std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc) { return os; } +nlohmann::json LoraSGDOptimizerConfig::toJson() const { + return {{"type", "SGD"}, + {"lr", lr}, + {"momentum", momentum}, + {"nesterov", nesterov}, + {"weight_decay", weight_decay}}; +} + +std::unique_ptr + LoraSGDOptimizerConfig::fromJson(nlohmann::json const &j) { + auto sgd = std::make_unique(); + sgd->lr = j["lr"]; + sgd->momentum = j["momentum"]; + sgd->nesterov = j["nesterov"]; + sgd->weight_decay = j["weight_decay"]; + return sgd; +} + // Adam optimizer LoraAdamOptimizerConfig::LoraAdamOptimizerConfig() : alpha(0.001f), beta1(0.9f), beta2(0.999f), weight_decay(0.0f), @@ -50,6 +80,26 @@ std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) { return os; } +nlohmann::json LoraAdamOptimizerConfig::toJson() const { + return {{"type", "Adam"}, + {"alpha", alpha}, + {"beta1", beta1}, + {"beta2", beta2}, + {"weight_decay", weight_decay}, + {"epsilon", epsilon}}; +} + +std::unique_ptr + LoraAdamOptimizerConfig::fromJson(nlohmann::json const &j) { + auto adam = std::make_unique(); + adam->alpha = j["alpha"]; + adam->beta1 = j["beta1"]; + adam->beta2 = j["beta2"]; + adam->weight_decay = j["weight_decay"]; + adam->epsilon = j["epsilon"]; + return adam; +} + // ------------------ LoRA configs ------------------- // --------------------------------------------------- const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", ""); @@ -171,9 +221,11 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { if (llc.optimizer_config != nullptr) { os << "optimizer_config: "; if (llc.optimizer_config.get()->getType() == "SGD") { - os << *static_cast(llc.optimizer_config.get()); + os << *static_cast( + llc.optimizer_config.get()); } else if (llc.optimizer_config.get()->getType() == "Adam") { - os << *static_cast(llc.optimizer_config.get()); + os << *static_cast( + llc.optimizer_config.get()); } else { os << "Unknown optimizer config type"; } @@ -185,4 +237,63 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { return os; } +std::string LoraLinearConfig::serialize_to_json_string(int indent) const { + nlohmann::json j = {{"cache_folder", cache_folder}, + {"peft_model_id", peft_model_id}, + {"rank", rank}, + {"lora_alpha", lora_alpha}, + {"lora_dropout", lora_dropout}, + {"target_modules", target_modules}, + {"trainable", trainable}, + {"init_lora_weights", init_lora_weights}, + {"base_model_name_or_path", base_model_name_or_path}, + {"precision", precision}, + // {"optimizer_config", optimizer_config ? + // optimizer_config->toJson() : nullptr} + {"optimizer_config", + optimizer_config + ? 
nlohmann::json(optimizer_config->toJson()) + : nlohmann::json()}}; + + return j.dump(indent); // No indentation +} + +void LoraLinearConfig::serialize_to_json_file( + std::string const &filename) const { + std::string j = serialize_to_json_string(4); + std::ofstream file(filename); + file << j; +} + +// Deserialization method +LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( + std::string const &json_string) { + nlohmann::json j = nlohmann::json::parse(json_string); + LoraLinearConfig config( + j["cache_folder"].get(), + j["peft_model_id"].get(), + j["trainable"].get(), + nullptr, // optimizer_config will be set later if present + j["init_lora_weights"].get(), + j["base_model_name_or_path"].get(), + j["precision"].get(), + j["rank"].get(), + j["lora_alpha"].get(), + j["lora_dropout"].get(), + j["target_modules"].get>()); + if (!j["optimizer_config"].is_null()) { + config.setOptimizer(LoraOptimizerConfig::fromJson(j["optimizer_config"])); + } + return config; +} + +// Deserialization method +LoraLinearConfig + LoraLinearConfig::deserialize_from_json_file(std::string const &filename) { + std::ifstream file(filename); + std::string j; + file >> j; + return deserialize_from_json_string(j); +} + }; // namespace FlexFlow diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 0af5f45350..31937cef66 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,7 +46,9 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } -bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { return !(lhs == rhs); } +bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return !(lhs == rhs); +} std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { if (peft_model_id == PEFTModelID::NO_ID) { diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 20b2a5b963..1b65dfd869 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -837,5 +837,4 @@ std::string join_path(std::vector const &paths) { return joined; } - }; // namespace FlexFlow diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 287eb7e20a..81b412e049 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -21,93 +21,99 @@ using Legion::TaskArgument; using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { - // allocate chunk of memory for all the PEFT adapters - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(max_lora_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(peftLegionInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); + // allocate chunk of memory for all the PEFT adapters + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); } void PEFTMemoryManager::allocate_finetuning_memory() { - size_t ft_size = max_lora_size*3; // weights, gradients, momentum values - ft_size += max_peft_tokens * (in_dim + max_rank); // input, low-rank 
activations - // allocate chunk of memory for PEFT adapter - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(ft_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(peftLegionInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - finetuning_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); + size_t ft_size = max_lora_size * 3; // weights, gradients, momentum values + ft_size += + max_peft_tokens * (in_dim + max_rank); // input, low-rank activations + // allocate chunk of memory for PEFT adapter + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(ft_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + finetuning_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); } -void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss) { - if (finetuning_ptr == nullptr) { - allocate_finetuning_memory(); - } - assert(finetuning_ptr != nullptr && "PEFT Memory Manager finetuning_ptr is null"); - *cache_miss = (model_id.id != finetuning_model_id.id); +void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, + bool *cache_miss) { + if (finetuning_ptr == nullptr) { + allocate_finetuning_memory(); + } + assert(finetuning_ptr != nullptr && + "PEFT Memory Manager finetuning_ptr is null"); + *cache_miss = (model_id.id != finetuning_model_id.id); } -int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss) { - assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); - assert(lru_hashtable.size() == lru_list.size() && - lru_list.size() == peft2mem_slot.size() && - "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out of sync"); - // check for cache hit - if (lru_hashtable.find(model_id) != lru_hashtable.end()) { - int lru_list_index = lru_hashtable[model_id]; - assert(lru_list[lru_list_index] == model_id && - "PEFT Memory Manager LRU hashtable/list are out of sync"); - // move the model to the end of the LRU list - lru_list.erase(lru_list.begin() + lru_list_index); - lru_list.push_back(model_id); - // update the LRU hashtable - lru_hashtable[model_id] = lru_list.size() - 1; - // get memory slot - assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); - *cache_miss = false; +int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, + bool *cache_miss) { + assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); + assert(lru_hashtable.size() == lru_list.size() && + lru_list.size() == peft2mem_slot.size() && + "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out " + "of sync"); + // check for cache hit + if (lru_hashtable.find(model_id) != lru_hashtable.end()) { + int lru_list_index = lru_hashtable[model_id]; + assert(lru_list[lru_list_index] == model_id && + "PEFT Memory Manager LRU hashtable/list are out of sync"); + // move the model to the end of the LRU list + lru_list.erase(lru_list.begin() + lru_list_index); + lru_list.push_back(model_id); + // update the LRU hashtable + lru_hashtable[model_id] = lru_list.size() - 1; + // get memory slot + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && + "PEFT Memory Manager peft2mem_slot is out 
of sync"); + *cache_miss = false; + } else { + // cache miss + // check if you need to evict + bool need_to_evict = lru_list.size() == max_concurrent_adapters; + int mem_slot = -1; + if (need_to_evict) { + // evict the least recently used model + PEFTModelID lru_model_id = lru_list[0]; + lru_list.erase(lru_list.begin()); + lru_hashtable.erase(lru_model_id); + mem_slot = peft2mem_slot[lru_model_id]; + peft2mem_slot.erase(lru_model_id); } else { - // cache miss - // check if you need to evict - bool need_to_evict = lru_list.size() == max_concurrent_adapters; - int mem_slot = -1; - if (need_to_evict) { - // evict the least recently used model - PEFTModelID lru_model_id = lru_list[0]; - lru_list.erase(lru_list.begin()); - lru_hashtable.erase(lru_model_id); - mem_slot = peft2mem_slot[lru_model_id]; - peft2mem_slot.erase(lru_model_id); - } else { - mem_slot = lru_list.size(); - } - // update the LRU list and hashtable - lru_list.push_back(model_id); - lru_hashtable[model_id] = lru_list.size() - 1; - // update the memory slot - peft2mem_slot[model_id] = mem_slot; - *cache_miss = true; + mem_slot = lru_list.size(); } - assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); - int slot = peft2mem_slot[model_id]; - assert(slot >= 0 && slot < max_concurrent_adapters && "PEFT Memory Manager peft2mem_slot is out of bounds"); - return slot; + // update the LRU list and hashtable + lru_list.push_back(model_id); + lru_hashtable[model_id] = lru_list.size() - 1; + // update the memory slot + peft2mem_slot[model_id] = mem_slot; + *cache_miss = true; + } + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && + "PEFT Memory Manager peft2mem_slot is out of sync"); + int slot = peft2mem_slot[model_id]; + assert(slot >= 0 && slot < max_concurrent_adapters && + "PEFT Memory Manager peft2mem_slot is out of bounds"); + return slot; } template @@ -160,138 +166,152 @@ void load_peft_from_file(DT *ptr, in.close(); } -void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config) { - // Load weights - assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && "PEFT Memory Manager weight ptr null"); - int w0_num_elements = lora_config.rank * in_dim; - int w1_num_elements = lora_config.rank * out_dim; - // values below represent total weight sizes before sharding. Lora B is not - // sharded. - int lora_A_num_rows = in_dim * num_shards; - int lora_A_num_cols = lora_config.rank; - int lora_B_num_rows = lora_config.rank; - int lora_B_num_cols = out_dim; - int lora_A_num_shards = num_shards; - int lora_B_num_shards = 1; - if (lora_config.init_lora_weights) { - // initialize weights randomly - int seed = 0; - init_peft_weight_wrapper(weight, in_dim, out_dim, lora_config.rank, dt, seed); +void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, + LoraLinearConfig const &lora_config) { + // Load weights + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && + "PEFT Memory Manager weight ptr null"); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. 
+ int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = lora_config.rank; + int lora_B_num_rows = lora_config.rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + if (lora_config.init_lora_weights) { + // initialize weights randomly + int seed = 0; + init_peft_weight_wrapper( + weight, in_dim, out_dim, lora_config.rank, dt, seed); + } else { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? "full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); } else { - // load weights from file - std::string weights_folder_filepath = join_path({ - lora_config.cache_folder, - "weights", - lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_A.weight"}); - std::string w1_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_B.weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else { - assert(false && "Data type not supported"); - } + assert(false && "Data type not supported"); } + } } -LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { - assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); - bool cache_miss; - int mem_slot = get_inference_peft_slot(model_id, &cache_miss); - int w0_num_elements = lora_config.rank * in_dim; - int data_size = data_type_size(dt); - LoraLinearWeight result; - result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; - result.w1_ptr = static_cast(result.w0_ptr) + w0_num_elements * data_size; - if (cache_miss) { - load_peft_model(result, lora_config); - } - return result; +LoraLinearWeight + PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + int mem_slot = get_inference_peft_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; + result.w1_ptr = + static_cast(result.w0_ptr) + w0_num_elements * data_size; + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; } -LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { - assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); - bool cache_miss; - get_finetuning_slot(model_id, &cache_miss); - int w0_num_elements = lora_config.rank * in_dim; - int w1_num_elements = lora_config.rank * out_dim; - 
int data_size = data_type_size(dt); - LoraLinearWeight result; - result.w0_ptr = finetuning_ptr; - result.w1_ptr = static_cast(result.w0_ptr)+ w0_num_elements*data_size; - result.w0_grad_ptr = static_cast(result.w1_ptr) + w1_num_elements*data_size; - result.w1_grad_ptr = static_cast(result.w0_grad_ptr) + w0_num_elements*data_size; - result.w0_v_values_ptr = static_cast(result.w1_grad_ptr) + w1_num_elements*data_size; - result.w1_v_values_ptr = static_cast(result.w0_v_values_ptr) + w0_num_elements*data_size; - result.input_activation = static_cast(result.w1_v_values_ptr) + w1_num_elements*data_size; // max_peft_tokens*in_dim - result.low_rank_activation = static_cast(result.input_activation) + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank - if (cache_miss) { - load_peft_model(result, lora_config); - } - return result; +LoraLinearWeight PEFTMemoryManager::get_finetuning_peft( + PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + get_finetuning_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = finetuning_ptr; + result.w1_ptr = + static_cast(result.w0_ptr) + w0_num_elements * data_size; + result.w0_grad_ptr = + static_cast(result.w1_ptr) + w1_num_elements * data_size; + result.w1_grad_ptr = + static_cast(result.w0_grad_ptr) + w0_num_elements * data_size; + result.w0_v_values_ptr = + static_cast(result.w1_grad_ptr) + w1_num_elements * data_size; + result.w1_v_values_ptr = + static_cast(result.w0_v_values_ptr) + w0_num_elements * data_size; + result.input_activation = + static_cast(result.w1_v_values_ptr) + + w1_num_elements * data_size; // max_peft_tokens*in_dim + result.low_rank_activation = + static_cast(result.input_activation) + + max_peft_tokens * in_dim * data_size; // max_peft_tokens*rank + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; } -LoraLinearWeight PEFTMemoryManager::get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { - if (lora_config.trainable) { - return get_finetuning_peft(model_id, lora_config); - } else { - return get_inference_peft(model_id, lora_config); - } +LoraLinearWeight + PEFTMemoryManager::get_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config) { + if (lora_config.trainable) { + return get_finetuning_peft(model_id, lora_config); + } else { + return get_inference_peft(model_id, lora_config); + } } void PEFTMemoryManager::check_ft_model_id(PEFTModelID const &model_id) { - assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); + assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); } }; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu index bc9ab443cb..3c4ea91db3 100644 --- a/src/runtime/peft_weight_allocator.cu +++ b/src/runtime/peft_weight_allocator.cu @@ -1,60 +1,70 @@ #include "flexflow/ops/kernels/decompress_kernels.h" -#include "flexflow/utils/peft_weight_allocator.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/peft_weight_allocator.h" #include #include namespace FlexFlow { template -void lora_init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, cudaStream_t stream) { - // Initialize generator - std::mt19937 
gen(seed); +void lora_init_kernel(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + int seed, + cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); - // Get handle to weights by iterating over m->model_state to get each - // LoraLinearWeight object - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; - // LoRA_A weight: [in_dim, rank] - float stdv_lora_a = 1.0f / sqrt(in_dim); - std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); - std::vector
lora_a_random_init(w0_num_elements); - for (auto &num : lora_a_random_init) { - float num_float = dis_lora_a(gen); - if (std::is_same::value) { - num = __float2half(num_float); - } else { - num = num_float; - } + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same::value) { + num = __float2half(num_float); + } else { + num = num_float; } - checkCUDA(cudaMemcpyAsync(static_cast
(weight.w0_ptr), - lora_a_random_init.data(), - w0_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); + } + checkCUDA(cudaMemcpyAsync(static_cast
(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); - // LoRA_B weight: [rank, out_dim] - float stdv_lora_b = 1.0f / sqrt(rank); - std::uniform_real_distribution dis_lora_b(-stdv_lora_b, stdv_lora_b); - std::vector lora_b_random_init(w1_num_elements); - for (auto &num : lora_b_random_init) { - float num_float = dis_lora_b(gen); - if (std::is_same::value) { - num = __float2half(num_float); - } else { - num = num_float; - } + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(rank); + std::uniform_real_distribution dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same::value) { + num = __float2half(num_float); + } else { + num = num_float; } - checkCUDA(cudaMemcpyAsync(static_cast
(weight.w1_ptr), - lora_b_random_init.data(), - w1_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); + } + checkCUDA(cudaMemcpyAsync(static_cast
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); } -void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed) { +void init_peft_weight_wrapper(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + DataType dt, + int seed) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2377a4f938..9986fc2274 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -256,16 +256,16 @@ size_t RequestManager::get_num_ssms() { } void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, - LoraLinearConfig const &peft_config) { + LoraLinearConfig const &peft_config) { // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - // peft_configs[peft_model_id] = std::move(peft_config); - peft_configs.emplace(peft_model_id, std::move(peft_config)); + peft_configs[peft_model_id] = LoraLinearConfig::deserialize_from_json_string( + peft_config.serialize_to_json_string()); } -LoraLinearConfig const &RequestManager::get_peft_config( - PEFTModelID const &peft_model_id) { +LoraLinearConfig const & + RequestManager::get_peft_config(PEFTModelID const &peft_model_id) { assert(peft_configs.find(peft_model_id) != peft_configs.end() && "PEFT model ID not found"); return peft_configs[peft_model_id]; @@ -279,13 +279,16 @@ void RequestManager::set_max_concurrent_adapters(int max_concurrent_adapters_) { max_concurrent_adapters = max_concurrent_adapters_; } -int RequestManager::get_max_lora_rank() { return max_lora_rank; } +int RequestManager::get_max_lora_rank() { + return max_lora_rank; +} int RequestManager::get_max_concurrent_adapters() { return max_concurrent_adapters; } -PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { +PEFTModelID * + FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) { @@ -293,16 +296,21 @@ PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) std::cout << peft_config << std::endl; assert(false); } - // go over base_layer_to_peft_layer and check that you can find at least one match - for (int i=0; i 0 && std::string(base_layer.name).find(peft_config.target_modules[0]) != std::string::npos) { + for (auto const &pair : base_layer_to_peft_layer) { + Layer *base_layer = pair.first; + if (base_layer->name != nullptr && strlen(base_layer->name) > 0 && + std::string(base_layer->name).find(peft_config.target_modules[0]) != + std::string::npos) { found = true; break; } } - assert(found && "Attempting to add LoRA to a LLM target module that does not exist or does not support LoRA"); + assert(found && "Attempting to add LoRA to a LLM target module that does " + "not exist or does not support LoRA"); } PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); RequestManager *rm = RequestManager::get_request_manager(); @@ -787,11 +795,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - // new_bc.requestsInfo[i].peft_model_id = - // 
old_bc.requestsInfo[i].peft_model_id; - new_bc.requestsInfo[i].peft_adapters = - old_bc.requestsInfo[i].peft_adapters; - num_concurrent_adapters += new_bc.requestsInfo[i].peft_adapters.size(); + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_model_config = + old_bc.requestsInfo[i].peft_model_config; + if (old_bc.requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) { + num_concurrent_adapters += 1; + } new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; @@ -853,12 +863,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); assert(new_request.req_type == RequestType::REQ_INFERENCE); - - // if the request has peft adapters and we are at capacity, don't add it yet - if (new_request.peft_model_id != PEFTModelID::NO_ID && num_concurrent_adapters == get_max_concurrent_adapters()) { + + // if the request has peft adapters and we are at capacity, don't add it + // yet + if (new_request.peft_model_id != PEFTModelID::NO_ID && + num_concurrent_adapters == get_max_concurrent_adapters()) { break; } - + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ -869,9 +881,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_length = new_request.max_length; - // new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; if (new_request.peft_model_id != PEFTModelID::NO_ID) { - new_bc.requestsInfo[i].peft_adapters[new_request.peft_model_id] = get_peft_config(new_request.peft_model_id).serialize_to_json_string(); + new_bc.requestsInfo[i].peft_model_config = + get_peft_config(new_request.peft_model_id) + .serialize_to_json_string(); } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -1027,7 +1041,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); assert(num_peft_label_tokens == 0); - if (num_peft_tokens > 0 && num_concurrent_adapters < get_max_concurrent_adapters()) { + if (num_peft_tokens > 0 && + num_concurrent_adapters < get_max_concurrent_adapters()) { assert(new_bc.request_completed[inference_batch_size]); // request info new_bc.request_completed[inference_batch_size] = false; @@ -1039,9 +1054,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, num_peft_tokens; new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; - // new_bc.requestsInfo[inference_batch_size].peft_model_id = - // request.peft_model_id; - new_bc.requestsInfo[inference_batch_size].peft_adapters[request.peft_model_id] = get_peft_config(request.peft_model_id).serialize_to_json_string(); + new_bc.requestsInfo[inference_batch_size].peft_model_id = + request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_model_config = + get_peft_config(request.peft_model_id).serialize_to_json_string(); new_bc.requestsInfo[inference_batch_size].peft_bwd = true; set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, @@ -1060,7 +1076,7 
@@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens++; new_bc.num_peft_tokens++; } - num_concurrent_adapters +=1; + num_concurrent_adapters += 1; } } assert(num_concurrent_adapters <= get_max_concurrent_adapters() && From 92c2c374e0d287d105489d9c009c1acc214e8f78 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 04:21:07 +0000 Subject: [PATCH 19/37] fix --- .../ops/kernels/lora_linear_kernels.h | 9 +++-- include/flexflow/ops/lora_linear_params.h | 40 ++++--------------- include/flexflow/request_manager.h | 4 +- inference/peft/peft.cc | 13 +++--- inference/peft/peft_bwd_benchmark.cc | 8 ++-- inference/peft/peft_fwd_benchmark.cc | 8 ++-- inference/peft/req_rate_benchmark.cc | 6 +-- src/ops/kernels/lora_linear_kernels.cu | 5 ++- src/ops/lora_linear_params.cc | 29 ++++++-------- src/runtime/request_manager.cc | 6 +-- 10 files changed, 52 insertions(+), 76 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index dfff2ec5c5..7138f62e90 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -10,6 +10,9 @@ namespace FlexFlow { +using Legion::Context; +using Legion::Runtime; + #ifdef DEADCODE struct LoraLinearModelState { LoraLinearWeight weights; @@ -40,7 +43,7 @@ namespace LoraLinear { bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config); -void init_kernel_wrapper(LoraLinearMeta *m, int seed); +// void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, @@ -53,8 +56,8 @@ void peft_bwd_kernel_wrapper(Context ctx, GenericTensorAccessorR const &output_grad); namespace Internal { -template -void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); +// template +// void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 1dfe5f17bd..46b88c9690 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -19,8 +19,8 @@ class LoraOptimizerConfig { LoraOptimizerConfig(); virtual std::string getType() const = 0; virtual nlohmann::json toJson() const = 0; - static std::unique_ptr fromJson(nlohmann::json const &j); - virtual ~LoraOptimizerConfig() = default; + static LoraOptimizerConfig *fromJson(nlohmann::json const &j); + virtual ~LoraOptimizerConfig() {} }; class LoraSGDOptimizerConfig : public LoraOptimizerConfig { @@ -32,15 +32,11 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig { bool weight_decay_ = 0.0f); friend std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc); - std::string getType() const override { return "SGD"; } - nlohmann::json toJson() const override; - - static std::unique_ptr - fromJson(nlohmann::json const &j); + static LoraSGDOptimizerConfig *fromJson(nlohmann::json const &j); public: double lr = 0.001f; @@ -63,11 +59,8 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { std::string getType() const override { return "Adam"; } - nlohmann::json toJson() const override; - - static std::unique_ptr - fromJson(nlohmann::json const &j); + static LoraAdamOptimizerConfig *fromJson(nlohmann::json const &j); public: // Adam @@ -94,29 +87,11 @@ class LoraLinearConfig { 
std::vector const &target_modules_ = {}); // constructor used to support std::unordered_map LoraLinearConfig(); - - // Method to set optimizer - template - void setOptimizer(T &&opt) { - if constexpr (std::is_base_of_v>) { - optimizer_config = - std::make_unique>(std::forward(opt)); - } else if constexpr (std::is_same_v, - std::remove_reference_t>) { - optimizer_config = std::move(opt); - } else { - static_assert(always_false, "Unsupported optimizer type"); - } - } - // Helper template for static_assert - template - static inline constexpr bool always_false = false; - friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); + std::string serialize_to_json_string(int indent = -1) const; void serialize_to_json_file(std::string const &filename) const; // Deserialization method @@ -138,8 +113,7 @@ class LoraLinearConfig { // whether the weights are trainable (fine-tuning scenario) or not // (inference-only). If set to true, allocate space for the gradients bool trainable = false; - // LoraOptimizerConfig *optimizer_config; - std::unique_ptr optimizer_config; + LoraOptimizerConfig *optimizer_config; // whether to initialize weights randomly (instead of attempting to load them // from file) bool init_lora_weights; @@ -170,4 +144,4 @@ struct hash { }; } // namespace std -#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H \ No newline at end of file diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 39f213752e..3b4e8c4c8d 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -150,8 +150,8 @@ class RequestManager { std::vector eos_token_ids, std::string const &path); void register_output_filepath(std::string const &); - void register_peft_config(PEFTModelID const &peft_model_id, - LoraLinearConfig const &peft_config); + void set_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config); LoraLinearConfig const &get_peft_config(PEFTModelID const &peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 0ab0b62ee8..af9e5743c7 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -320,18 +320,19 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Add PEFT layer + // Start background server + rm->start_background_server(&model); + + // Add PEFT adapter(s) PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); if (enable_peft_finetuning) { - peft_model_id_finetuning = model.add_lora_layer(peft_config_finetuning); + peft_model_id_finetuning = + model.register_peft_adapter(peft_config_finetuning); } } - // Start background server - rm->start_background_server(&model); - // Run workload { std::vector requests; diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 85e97ec4e8..9da4fa1994 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -304,15 +304,15 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Start background server + rm->start_background_server(&model); + // Add PEFT layer 
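// Illustrative sketch (editorial aside, not part of this patch): with the
// reworked API, adapters are registered after rm->start_background_server()
// has been called, and a trainable config carries a raw optimizer pointer.
// The constructor argument order below mirrors deserialize_from_json_string()
// earlier in this series; the cache path, adapter name, precision string, and
// hyperparameter values are placeholders, not values taken from the patch.
LoraSGDOptimizerConfig *sgd = new LoraSGDOptimizerConfig();
sgd->lr = 0.001f; // placeholder learning rate
LoraLinearConfig peft_config_ft(
    /*cache_folder=*/"/tmp/ff_cache",                    // placeholder path
    /*peft_model_id=*/"my-org/my-lora-adapter",          // placeholder name
    /*trainable=*/true,
    /*optimizer_config=*/nullptr,                        // attached just below
    /*init_lora_weights=*/true,
    /*base_model_name_or_path=*/"my-org/my-base-model",  // placeholder
    /*precision=*/"fp16",                                // placeholder
    /*rank=*/16,
    /*lora_alpha=*/16,
    /*lora_dropout=*/0.0f,
    /*target_modules=*/{"down_proj"});
peft_config_ft.optimizer_config = sgd;
PEFTModelID *ft_adapter_id = model.register_peft_adapter(peft_config_ft);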
PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - // Start background server - rm->start_background_server(&model); - // Warmup stage { std::vector requests; diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 87322a42dd..3274f2e535 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -304,15 +304,15 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Start background server + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - // Start background server - rm->start_background_server(&model); - // Run workload { std::vector requests; diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index ffa77478e1..8a94f6e68b 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -366,14 +366,14 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - rm->start_background_server(&model); - // Warmup stage { std::vector requests; diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 28a2d6b23e..eb2a472ee3 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -35,6 +35,7 @@ LoraLinearMeta::~LoraLinearMeta(void) {} namespace Kernels { namespace LoraLinear { +#ifdef DEADCODE void init_kernel_wrapper(LoraLinearMeta *m, int seed) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -47,6 +48,7 @@ void init_kernel_wrapper(LoraLinearMeta *m, int seed) { assert(false && "Unsupported data type"); } } +#endif void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, @@ -314,7 +316,6 @@ void inference_kernel(LoraLinearMeta *m, DT *output_ptr, int in_dim, int out_dim, - int num_shards, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -593,7 +594,7 @@ void peft_bwd_kernel(Context ctx, if (lora_config.optimizer_config->getType() == "SGD") { LoraSGDOptimizerConfig const *sgd_config = static_cast( - lora_config.optimizer_config.get()); + lora_config.optimizer_config); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<< - LoraOptimizerConfig::fromJson(nlohmann::json const &j) { +LoraOptimizerConfig *LoraOptimizerConfig::fromJson(nlohmann::json const &j) { std::string type = j["type"]; if (type == "SGD") { return LoraSGDOptimizerConfig::fromJson(j); @@ -50,9 +49,9 @@ nlohmann::json LoraSGDOptimizerConfig::toJson() const { {"weight_decay", weight_decay}}; } -std::unique_ptr +LoraSGDOptimizerConfig * LoraSGDOptimizerConfig::fromJson(nlohmann::json const &j) { - auto sgd = std::make_unique(); + LoraSGDOptimizerConfig *sgd = new LoraSGDOptimizerConfig(); sgd->lr = j["lr"]; sgd->momentum = j["momentum"]; sgd->nesterov = j["nesterov"]; @@ -89,9 +88,9 @@ nlohmann::json 
LoraAdamOptimizerConfig::toJson() const { {"epsilon", epsilon}}; } -std::unique_ptr<LoraAdamOptimizerConfig> +LoraAdamOptimizerConfig * LoraAdamOptimizerConfig::fromJson(nlohmann::json const &j) { - auto adam = std::make_unique<LoraAdamOptimizerConfig>(); + LoraAdamOptimizerConfig *adam = new LoraAdamOptimizerConfig(); adam->alpha = j["alpha"]; adam->beta1 = j["beta1"]; adam->beta2 = j["beta2"]; @@ -220,12 +219,11 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "trainable: " << llc.trainable << ", "; if (llc.optimizer_config != nullptr) { os << "optimizer_config: "; - if (llc.optimizer_config.get()->getType() == "SGD") { - os << *static_cast<LoraSGDOptimizerConfig const *>( - llc.optimizer_config.get()); - } else if (llc.optimizer_config.get()->getType() == "Adam") { - os << *static_cast<LoraAdamOptimizerConfig const *>( - llc.optimizer_config.get()); + if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + os << *static_cast<LoraSGDOptimizerConfig const *>(llc.optimizer_config); + } else if (typeid(*llc.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + os << *static_cast<LoraAdamOptimizerConfig const *>(llc.optimizer_config); } else { os << "Unknown optimizer config type"; } @@ -248,8 +246,6 @@ std::string LoraLinearConfig::serialize_to_json_string(int indent) const { {"init_lora_weights", init_lora_weights}, {"base_model_name_or_path", base_model_name_or_path}, {"precision", precision}, - // {"optimizer_config", optimizer_config ? - // optimizer_config->toJson() : nullptr} {"optimizer_config", optimizer_config ? nlohmann::json(optimizer_config->toJson()) @@ -282,7 +278,8 @@ LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( j["lora_dropout"].get<float>(), j["target_modules"].get<std::vector<std::string>>()); if (!j["optimizer_config"].is_null()) { - config.setOptimizer(LoraOptimizerConfig::fromJson(j["optimizer_config"])); + config.optimizer_config = + LoraOptimizerConfig::fromJson(j["optimizer_config"]); } return config; } @@ -296,4 +293,4 @@ LoraLinearConfig return deserialize_from_json_string(j); } -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a65be9984c..db8f6b0042 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -263,8 +263,8 @@ size_t RequestManager::get_num_ssms() { return ssm_models.size(); } -void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, - LoraLinearConfig const &peft_config) { +void RequestManager::set_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config) { // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); @@ -322,7 +322,7 @@ PEFTModelID * } PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); RequestManager *rm = RequestManager::get_request_manager(); - rm->register_peft_config(*peft_model_id, peft_config); + rm->set_peft_config(*peft_model_id, peft_config); return peft_model_id; } From fbdf74e0b4e67c905f7cb4acb4067c255d2608b0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 16:15:49 +0000 Subject: [PATCH 20/37] updates --- include/flexflow/config.h | 4 - include/flexflow/request_manager.h | 4 +- inference/python/chat.py | 1 - inference/python/ff_peft.py | 1 - inference/python/incr_decoding.py | 1 - inference/python/peft_demo/demo.ipynb | 2 - inference/python/peft_demo/demo.py | 1 - inference/python/spec_infer.py | 1 - inference/python/streamlit/fastapi_incr.py | 1 - python/flexflow/core/__init__.py | 1 - python/flexflow/serve/__init__.py | 9 -
src/ops/lora_linear.cc | 159 ------------------ src/runtime/model.cc | 7 - src/runtime/model.cu | 30 +--- .../python_test_configs/generate_configs.py | 1 - 15 files changed, 4 insertions(+), 219 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index dd9d657117..37afa0df27 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -104,8 +104,6 @@ struct FFHandler { // PEFT related fields MemoryAllocator *peft_activation_allocator; size_t peft_activation_reserve_space_size; - PEFTWeightAllocator *peft_weight_allocator; - size_t peft_weight_reserve_space_size; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; @@ -118,7 +116,6 @@ struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; size_t peft_activation_reserve_space_size; - size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -179,7 +176,6 @@ class FFConfig { // PEFT related fields bool enable_peft; size_t peft_activation_reserve_space_size; - size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 3b4e8c4c8d..e4a8f57900 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -300,8 +300,8 @@ class RequestManager { // peft std::unordered_map peft_configs; - int max_lora_rank; - int max_concurrent_adapters; + int max_lora_rank=0; + int max_concurrent_adapters=0; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/inference/python/chat.py b/inference/python/chat.py index 13ece116a6..70b8ee0067 100644 --- a/inference/python/chat.py +++ b/inference/python/chat.py @@ -36,7 +36,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 13da7aee20..35338f5227 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": True, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": True, "fusion": False, diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 232ef1699c..4bb6892a6b 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb index dfb5193a1d..d29ad5ad2f 100644 --- a/inference/python/peft_demo/demo.ipynb +++ b/inference/python/peft_demo/demo.ipynb @@ -91,7 +91,6 @@ " \"use_8bit_quantization\": False,\n", " \"enable_peft\": True,\n", " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", - " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", " \"profiling\": False,\n", " \"inference_debugging\": False,\n", " \"fusion\": False,\n", @@ -1773,7 +1772,6 @@ " 
\"use_8bit_quantization\": False,\n", " \"enable_peft\": True,\n", " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", - " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", " \"profiling\": False,\n", " \"inference_debugging\": False,\n", " \"fusion\": False,\n", diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py index 9e01b4645b..34b15b9a76 100644 --- a/inference/python/peft_demo/demo.py +++ b/inference/python/peft_demo/demo.py @@ -47,7 +47,6 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data "use_8bit_quantization": False, "enable_peft": True, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": False, diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 7ae752cffc..8cf96c1eba 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py index 622f50008e..a1095e13dc 100644 --- a/inference/python/streamlit/fastapi_incr.py +++ b/inference/python/streamlit/fastapi_incr.py @@ -91,7 +91,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index b8ed15eaea..52fe331bf3 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -91,7 +91,6 @@ "use_8bit_quantization": "--8bit-quantization", "enable_peft": "-enable-peft", "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", - "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index fd29080a6a..55044d1838 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -55,7 +55,6 @@ def init( use_8bit_quantization: Optional[bool] = None, enable_peft: Optional[bool] = None, peft_activation_reserve_space_size: Optional[int] = None, - peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, @@ -86,7 +85,6 @@ def init( - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - enable_peft: whether to enable the use of PEFT, defaults to False - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB - - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -125,8 +123,6 @@ def init( :type enable_peft: Optional[bool], optional :param peft_activation_reserve_space_size: the space (in MB) to 
reserve on GPU for PEFT activations, default to 1 GB :type peft_activation_reserve_space_size: Optional[int], optional - :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False @@ -158,7 +154,6 @@ def init( use_8bit_quantization is not None, enable_peft is not None, peft_activation_reserve_space_size is not None, - peft_weight_reserve_space_size is not None, profiling is not None, benchmarking is not None, inference_debugging is not None, @@ -187,7 +182,6 @@ def init( "use_8bit_quantization": use_8bit_quantization, "enable_peft": enable_peft, "peft_activation_reserve_space_size": peft_activation_reserve_space_size, - "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "benchmarking": benchmarking, "inference_debugging": inference_debugging, @@ -210,7 +204,6 @@ def init( "pipeline_parallelism_degree", "offload_reserve_space_size", "peft_activation_reserve_space_size", - "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -238,8 +231,6 @@ def init( configs_dict["enable_peft"] = False if configs_dict.get("peft_activation_reserve_space_size", None) is None: configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 - if configs_dict.get("peft_weight_reserve_space_size", None) is None: - configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("benchmarking", None) is None: diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 23e493b6bd..c61e3f94ac 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -492,166 +492,7 @@ OpMeta *LoraLinear::init_task(Task const *task, return m; } -#ifdef DEADCODE -void load_peft_adapters(BatchConfig const *bc) { - for (auto const &kv : bc->peft_configs) { - PEFTModelID const &model_id = kv.first; - LoraLinearConfig const &lora_config = kv.second; - - int rank = lora_config.rank; - - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; - // values below represent total weight sizes before sharding. Lora B is not - // sharded. - int lora_A_num_rows = in_dim * num_shards; - int lora_A_num_cols = rank; - int lora_B_num_rows = rank; - int lora_B_num_cols = out_dim; - int lora_A_num_shards = num_shards; - int lora_B_num_shards = 1; - - LoraLinearWeight weight; - weight.in_dim = in_dim; - weight.out_dim = out_dim; - weight.rank = rank; - weight.num_shards = num_shards; - PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - - if (!lora_config.init_lora_weights) { - // load weights from file - std::string weights_folder_filepath = join_path({ - lora_config.cache_folder, - "weights", - lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_A.weight"}); - std::string w1_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_B.weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else { - assert(false && "Data type not supported"); - } - } else { - // initialize weights - int seed = 0; - init_kernel_wrapper(m, seed); - } - // allocate space for gradients if the LoRA layer is trainable - if (lora_config.trainable) { - // Ensure we have an optimizer - assert(lora_config.optimizer_config != nullptr && "Optimizer not set"); - assert(typeid(*lora_config.optimizer_config) != - typeid(LoraOptimizerConfig) && - "Optimizer config is not a subclass of LoraOptimizerConfig"); - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - // Input is partitioned (no replication) - // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } else { - // Input is replicated - // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } - // allocate space for v_values if needed by optimizer - if (typeid(*lora_config.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - lora_config.optimizer_config); - if (sgd_config->momentum > 0.0f) { - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - weight.w0_v_values_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_v_values_ptr = 
allocator->allocate_sync_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } else { - weight.w0_v_values_ptr = allocator->allocate_sync_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_v_values_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } - } - } else if (typeid(*lora_config.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - assert(false && "Adam optim not yet implemented"); - } else { - assert(false && "Optimizer not supported"); - } - } - assert(m->model_state.find(model_id) == m->model_state.end()); - m->model_state[model_id].weights = weight; - m->model_state[model_id].optimizer_config = lora_config.optimizer_config; - m->model_state[model_id].lora_alpha = lora_config.lora_alpha; - m->model_state[model_id].cache_folder = lora_config.cache_folder; - m->model_state[model_id].peft_model_id = lora_config.peft_model_id; - } -} -#endif void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 417cd2c056..de798890ef 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1550,8 +1550,6 @@ FFRuntime::FFRuntime(FFConfig &config) { config.cpu_offload ? config.offload_reserve_space_size : 0; info.peft_activation_reserve_space_size = config.enable_peft ? config.peft_activation_reserve_space_size : 0; - info.peft_weight_reserve_space_size = - config.enable_peft ? config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -4400,7 +4398,6 @@ FFConfig::FFConfig() { enable_peft = DefaultConfig::enablePeft; peft_activation_reserve_space_size = DefaultConfig::peftActivationReserveSpaceSize; - peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; @@ -4535,10 +4532,6 @@ void FFConfig::parse_args(char **argv, int argc) { peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; continue; } - if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { - peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; - continue; - } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 136ce99edd..6a166835d6 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -168,7 +168,7 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } -#ifdef DEADCODE +// #ifdef DEADCODE if (info->peft_activation_reserve_space_size > 0) { // allocate memory for peft activation reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -182,33 +182,7 @@ FFHandler } else { handle.peft_activation_allocator = nullptr; } - - if (info->peft_weight_reserve_space_size > 0) { - // allocate memory for peft weight reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance workspaceInst; - 
Realm::RegionInstance::create_instance(workspaceInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); - handle.peft_weight_allocator = - new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); - } else { - handle.peft_weight_allocator = nullptr; - } -#endif +// #endif // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 2720304d4f..4f7929e2db 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -19,7 +19,6 @@ "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, From 10fb496f780e24abb248b867ada989e0d1b8f5d5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 20:53:42 +0000 Subject: [PATCH 21/37] fix --- include/flexflow/batch_config.h | 6 +++++- include/flexflow/request_manager.h | 3 ++- inference/peft/peft.cc | 1 + src/ops/kernels/lora_linear_kernels.cu | 10 ++++------ src/ops/lora_linear.cc | 15 ++++++-------- src/ops/lora_linear_params.cc | 27 +++++++++++++++++++------- src/runtime/request_manager.cc | 17 ++++++++-------- tests/peft_test.sh | 4 ++-- 8 files changed, 49 insertions(+), 34 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 44d829a7f7..2fb9413ae9 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,6 +20,8 @@ #include "legion.h" #include #include +#include + // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 @@ -79,6 +81,7 @@ class BatchConfig { static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; + static int const MAX_PEFT_CONFIG_SIZE = 1024; // Set by update @@ -99,6 +102,7 @@ class BatchConfig { batch_config_request_id = -1; peft_bwd = false; optimizer_tasks = {true, false, false, false}; + std::memset(peft_model_config_str, 0, MAX_PEFT_CONFIG_SIZE); } int first_token_depth_in_request; int first_token_offset_in_batch; @@ -111,7 +115,7 @@ class BatchConfig { RequestGuid request_guid; // PEFT fields PEFTModelID peft_model_id; - std::string peft_model_config; + char peft_model_config_str[MAX_PEFT_CONFIG_SIZE]; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index e4a8f57900..d5e67d0c66 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -189,6 +189,7 @@ class RequestManager { bool is_eos_token(int token_id); bool check_inf_req_completion(BatchConfig const &old_bc, int i); void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); + void add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -300,7 +301,7 @@ class RequestManager { // peft std::unordered_map peft_configs; - int max_lora_rank=0; + int max_lora_rank=32; int max_concurrent_adapters=0; // peft benchmarking bool enable_peft_finetuning = false; diff --git 
a/inference/peft/peft.cc b/inference/peft/peft.cc index af9e5743c7..96dd3a0562 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -275,6 +275,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_requests_per_batch( max_requests_per_batch + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_concurrent_adapters(max_requests_per_batch + (int)enable_peft_finetuning); rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index eb2a472ee3..e50805b6ca 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -335,9 +335,8 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -463,9 +462,8 @@ void peft_bwd_kernel(Context ctx, !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index c61e3f94ac..3735aefc01 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -619,9 +619,8 @@ void LoraLinear::inference_task(Task const *task, bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -777,9 +776,8 @@ void lora_inference_debugging(LoraLinearMeta *m, !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -911,9 +909,8 @@ void save_peft_weights_if_needed(LoraLinearMeta *m, !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 21648089da..4eb59bc53f 
100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -235,12 +235,23 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { return os; } +double ToThreeDecimalPlaces(float f) { + double d = static_cast(f); + int i; + if (d >= 0) { + i = static_cast(d * 1000 + 0.5); + } else { + i = static_cast(d * 1000 - 0.5); + } + return (i / 1000.0); +} + std::string LoraLinearConfig::serialize_to_json_string(int indent) const { nlohmann::json j = {{"cache_folder", cache_folder}, {"peft_model_id", peft_model_id}, {"rank", rank}, - {"lora_alpha", lora_alpha}, - {"lora_dropout", lora_dropout}, + {"lora_alpha", ToThreeDecimalPlaces(lora_alpha)}, + {"lora_dropout", ToThreeDecimalPlaces(lora_dropout)}, {"target_modules", target_modules}, {"trainable", trainable}, {"init_lora_weights", init_lora_weights}, @@ -264,12 +275,18 @@ void LoraLinearConfig::serialize_to_json_file( // Deserialization method LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( std::string const &json_string) { + // std::cout << "Attempting to deserialize from JSON string: " << json_string + // << std::endl; nlohmann::json j = nlohmann::json::parse(json_string); + LoraOptimizerConfig *optimizer_config_ = nullptr; + if (!j["optimizer_config"].is_null()) { + optimizer_config_ = LoraOptimizerConfig::fromJson(j["optimizer_config"]); + } LoraLinearConfig config( j["cache_folder"].get(), j["peft_model_id"].get(), j["trainable"].get(), - nullptr, // optimizer_config will be set later if present + optimizer_config_, // optimizer_config will be set later if present j["init_lora_weights"].get(), j["base_model_name_or_path"].get(), j["precision"].get(), @@ -277,10 +294,6 @@ LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( j["lora_alpha"].get(), j["lora_dropout"].get(), j["target_modules"].get>()); - if (!j["optimizer_config"].is_null()) { - config.optimizer_config = - LoraOptimizerConfig::fromJson(j["optimizer_config"]); - } return config; } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index db8f6b0042..0bfbb7f8f4 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -691,6 +691,12 @@ void RequestManager::check_batch(BatchConfig const &old_bc, } } +void RequestManager::add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config) { + std::memset(bc.requestsInfo[req_idx].peft_model_config_str, 0, BatchConfig::MAX_PEFT_CONFIG_SIZE); + std::string peft_config_str = peft_config.serialize_to_json_string(); + std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, peft_config_str.c_str()); +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); @@ -825,8 +831,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; - new_bc.requestsInfo[i].peft_model_config = - old_bc.requestsInfo[i].peft_model_config; + std::strcpy(new_bc.requestsInfo[i].peft_model_config_str, old_bc.requestsInfo[i].peft_model_config_str); if (old_bc.requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) { num_concurrent_adapters += 1; } @@ -911,9 +916,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; if 
(new_request.peft_model_id != PEFTModelID::NO_ID) { - new_bc.requestsInfo[i].peft_model_config = - get_peft_config(new_request.peft_model_id) - .serialize_to_json_string(); + add_peft_config_to_request_info(new_bc, i, get_peft_config(new_request.peft_model_id)); } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -1084,9 +1087,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; - new_bc.requestsInfo[inference_batch_size].peft_model_config = - get_peft_config(request.peft_model_id).serialize_to_json_string(); - new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + add_peft_config_to_request_info(new_bc, inference_batch_size, get_peft_config(request.peft_model_id)); set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, request.max_training_steps, diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 5600d57edf..173fb37fd9 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -38,9 +38,9 @@ python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --sav # Python test echo "Python test" -python ./inference/python/ff_peft.py +# python ./inference/python/ff_peft.py # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 2 +# python ./tests/peft/peft_alignment_test.py -tp 2 # C++ test echo "C++ test" From 79dc3a2b4666020de0d052f2bbc354900cc4e8cd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 21:02:46 +0000 Subject: [PATCH 22/37] fix --- include/flexflow/batch_config.h | 1 - include/flexflow/request_manager.h | 8 +++++--- inference/peft/peft.cc | 3 ++- src/ops/kernels/lora_linear_kernels.cu | 12 ++++++++---- src/ops/lora_linear.cc | 20 ++++++++++++-------- src/runtime/model.cu | 4 ++-- src/runtime/request_manager.cc | 20 ++++++++++++++------ 7 files changed, 43 insertions(+), 25 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 2fb9413ae9..bbcfdb32fc 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -22,7 +22,6 @@ #include #include - // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 // #define BATCH_SIZE 16 diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index d5e67d0c66..c15c0ff8b4 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -189,7 +189,9 @@ class RequestManager { bool is_eos_token(int token_id); bool check_inf_req_completion(BatchConfig const &old_bc, int i); void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); - void add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config); + void add_peft_config_to_request_info(BatchConfig &bc, + int req_idx, + LoraLinearConfig const &peft_config); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -301,8 +303,8 @@ class RequestManager { // peft std::unordered_map peft_configs; - int max_lora_rank=32; - int max_concurrent_adapters=0; + int max_lora_rank = 32; + int max_concurrent_adapters = 0; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 96dd3a0562..da2993187c 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc 
@@ -275,7 +275,8 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_requests_per_batch( max_requests_per_batch + (int)enable_peft_finetuning); // add one slot for finetuning if needed - rm->set_max_concurrent_adapters(max_requests_per_batch + (int)enable_peft_finetuning); + rm->set_max_concurrent_adapters(max_requests_per_batch + + (int)enable_peft_finetuning); rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index e50805b6ca..09d79809a7 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -335,8 +335,10 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -462,8 +464,10 @@ void peft_bwd_kernel(Context ctx, !bc->requestsInfo[i].peft_bwd) { continue; } - std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3735aefc01..f17f69a7c9 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -492,8 +492,6 @@ OpMeta *LoraLinear::init_task(Task const *task, return m; } - - void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); } @@ -619,8 +617,10 @@ void LoraLinear::inference_task(Task const *task, bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } - std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -776,8 +776,10 @@ void lora_inference_debugging(LoraLinearMeta *m, !bc->requestsInfo[i].peft_bwd) { continue; } - std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -909,8 +911,10 @@ void save_peft_weights_if_needed(LoraLinearMeta *m, !bc->requestsInfo[i].peft_bwd) { continue; } - std::string peft_model_config_str = 
std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 6a166835d6..3a250539c7 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -168,7 +168,7 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } -// #ifdef DEADCODE + // #ifdef DEADCODE if (info->peft_activation_reserve_space_size > 0) { // allocate memory for peft activation reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -183,7 +183,7 @@ FFHandler handle.peft_activation_allocator = nullptr; } // #endif - // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); +// checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; #endif diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0bfbb7f8f4..a25677b22e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -691,10 +691,14 @@ void RequestManager::check_batch(BatchConfig const &old_bc, } } -void RequestManager::add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config) { - std::memset(bc.requestsInfo[req_idx].peft_model_config_str, 0, BatchConfig::MAX_PEFT_CONFIG_SIZE); +void RequestManager::add_peft_config_to_request_info( + BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config) { + std::memset(bc.requestsInfo[req_idx].peft_model_config_str, + 0, + BatchConfig::MAX_PEFT_CONFIG_SIZE); std::string peft_config_str = peft_config.serialize_to_json_string(); - std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, peft_config_str.c_str()); + std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, + peft_config_str.c_str()); } BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, @@ -831,7 +835,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; - std::strcpy(new_bc.requestsInfo[i].peft_model_config_str, old_bc.requestsInfo[i].peft_model_config_str); + std::strcpy(new_bc.requestsInfo[i].peft_model_config_str, + old_bc.requestsInfo[i].peft_model_config_str); if (old_bc.requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) { num_concurrent_adapters += 1; } @@ -916,7 +921,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; if (new_request.peft_model_id != PEFTModelID::NO_ID) { - add_peft_config_to_request_info(new_bc, i, get_peft_config(new_request.peft_model_id)); + add_peft_config_to_request_info( + new_bc, i, get_peft_config(new_request.peft_model_id)); } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -1085,9 +1091,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, num_peft_tokens; new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + 
new_bc.requestsInfo[inference_batch_size].peft_bwd = true; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; - add_peft_config_to_request_info(new_bc, inference_batch_size, get_peft_config(request.peft_model_id)); + add_peft_config_to_request_info( + new_bc, inference_batch_size, get_peft_config(request.peft_model_id)); set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, request.max_training_steps, From 42198061fb2970a7e40e8141fa23cc0d228dbe98 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 21:27:53 +0000 Subject: [PATCH 23/37] fix --- src/runtime/peft_weight_allocator.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 81b412e049..2dd9a4711b 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -63,6 +63,7 @@ void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, assert(finetuning_ptr != nullptr && "PEFT Memory Manager finetuning_ptr is null"); *cache_miss = (model_id.id != finetuning_model_id.id); + finetuning_model_id = model_id; } int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, From f542fbb2690778ed8969807ed71abaadea7eada5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 6 Nov 2024 15:39:10 +0000 Subject: [PATCH 24/37] small fix --- tests/inference/inference_alignment_test.py | 2 +- tests/peft/alignment/align_test_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/inference/inference_alignment_test.py b/tests/inference/inference_alignment_test.py index 8dab7ff43b..1fe2bfbaae 100644 --- a/tests/inference/inference_alignment_test.py +++ b/tests/inference/inference_alignment_test.py @@ -361,7 +361,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE)[:,:,-1].squeeze() hf_tensor = hf_tensor.squeeze() - print(hf_tensor.shape, ff_tensor.shape) + # print(hf_tensor.shape, ff_tensor.shape) compare(hf_tensor, ff_tensor, label="LM head input") output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index f5ed8ae65b..a8a9be2f3b 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -430,7 +430,7 @@ def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2): print(f"HF: {hf_tensor}\nFF:{ff_tensor}") print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] - print(mismatches) + # print(mismatches) len_hf_tensor = hf_tensor.flatten().shape[0] assert len(mismatches) <= 0.05 * len_hf_tensor print("Ok!") From 139b643646e3c3ddcf69682a9e9e98b37dec2f0e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 7 Nov 2024 22:03:38 +0000 Subject: [PATCH 25/37] fix --- .../ops/kernels/lora_linear_kernels.h | 2 + .../flexflow/utils/peft_weight_allocator.h | 68 ----- src/ops/fused.cu | 2 + src/ops/kernels/lora_linear_kernels.cu | 192 +++---------- src/ops/lora_linear.cc | 255 +----------------- src/ops/lora_linear_params.cc | 24 +- src/runtime/peft_weight_allocator.cc | 4 +- 
src/runtime/request_manager.cc | 8 +- tests/peft/hf_finetune.py | 2 +- tests/peft/peft_alignment_test.py | 57 +++- tests/peft_test.sh | 8 +- 11 files changed, 120 insertions(+), 502 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 7138f62e90..b17868fb96 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -52,6 +52,7 @@ void peft_bwd_kernel_wrapper(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); @@ -71,6 +72,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index bd8ddb1dce..21ac9bf426 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -23,74 +23,6 @@ namespace FlexFlow { -#ifdef DEADCODE -class PEFTWeightAllocator { -public: - PEFTWeightAllocator(void *_base_ptr, size_t _total_size) - : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), - local_offset(_total_size) {} - - inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, - size_t datalen) { - const std::lock_guard lock(peft_weight_allocator_mutex); - void *ptr = static_cast(base_ptr) + sync_offset; - off_t model_sync_weights_offset = sync_offset; - size_t model_sync_weights_size = datalen; - if (sync_weights.find(peft_model_id) != sync_weights.end()) { - // Assert that sync weights for each PEFT model is consecutive - std::pair offset_and_size = sync_weights[peft_model_id]; - assert(sync_offset == offset_and_size.first + offset_and_size.second); - model_sync_weights_offset = offset_and_size.first; - model_sync_weights_size = offset_and_size.second + datalen; - } - sync_offset += datalen; - assert(sync_offset < local_offset); - sync_weights[peft_model_id] = - std::make_pair(model_sync_weights_offset, model_sync_weights_size); - return ptr; - } - - std::pair - get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { - const std::lock_guard lock(peft_weight_allocator_mutex); - assert(sync_weights.find(peft_model_id) != sync_weights.end()); - std::pair offset_and_size = sync_weights[peft_model_id]; - return std::make_pair(static_cast(base_ptr) + offset_and_size.first, - offset_and_size.second); - } - - inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, - size_t datalen) { - const std::lock_guard lock(peft_weight_allocator_mutex); - local_offset -= datalen; - assert(sync_offset < local_offset); - void *ptr = static_cast(base_ptr) + local_offset; - return ptr; - } - - template - inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, - size_t count) { - return static_cast
( - allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count)); - } - - template - inline DT *allocate_local_weights(PEFTModelID const &peft_model_id, - size_t count) { - return static_cast
( - allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); - } - -public: - void *base_ptr; - size_t total_size; - off_t sync_offset, local_offset; - std::unordered_map> sync_weights; - std::mutex peft_weight_allocator_mutex; -}; -#endif - struct LoraLinearWeight { // weights void *w0_ptr, *w1_ptr; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 62845c0f8e..c615a104d2 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -889,11 +889,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + int shard_id = task->index_point.point_data[0]; Kernels::LoraLinear::peft_bwd_kernel_wrapper( ctx, runtime, m, bc, + shard_id, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 09d79809a7..dabe40c501 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -24,31 +24,34 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) : OpMeta(handler, li) { -#ifdef DEADCODE - allocated_peft_buffer_size1 = 0; - allocated_peft_buffer_size2 = 0; -#endif } LoraLinearMeta::~LoraLinearMeta(void) {} +std::string get_peft_dbg_folder(LoraLinearMeta const *m, + int shard_id, + bool is_fwd) { + std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); + fs::path dst_filepath; + if (is_fwd) { + dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + } else { + dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + } + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + namespace Kernels { namespace LoraLinear { -#ifdef DEADCODE -void init_kernel_wrapper(LoraLinearMeta *m, int seed) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - if (m->input_type[0] == DT_FLOAT) { - Internal::init_kernel(m, seed, stream); - } else if (m->input_type[0] == DT_HALF) { - Internal::init_kernel(m, seed, stream); - } else { - assert(false && "Unsupported data type"); - } -} -#endif void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, @@ -104,6 +107,7 @@ void peft_bwd_kernel_wrapper(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { cudaStream_t stream; @@ -121,6 +125,7 @@ void peft_bwd_kernel_wrapper(Context ctx, runtime, m, bc, + shard_id, input_grad.get_float_ptr(), output_grad.get_float_ptr(), in_dim, @@ -131,6 +136,7 @@ void peft_bwd_kernel_wrapper(Context ctx, runtime, m, bc, + shard_id, input_grad.get_half_ptr(), output_grad.get_half_ptr(), in_dim, @@ -168,146 +174,6 @@ bool lora_applies_to_this_layer(LoraLinearMeta *m, namespace Internal { -#ifdef DEADCODE -template -void inference_kernel(LoraLinearMeta *m, - BatchConfig const *bc, - DT const *input_ptr, - DT *output_ptr, - int in_dim, - int out_dim, - ffStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; - cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); - cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); - cudaDataType_t lr_actv_type = output_type; - assert(input_type == output_type); - cudaDataType_t weight_type = output_type; - cudaDataType_t compute_type = output_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = output_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->input_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - int num_peft_requests = 0; - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - continue; - } - if (bc->requestsInfo[i].peft_bwd) { - num_peft_requests++; - } - } - // Assert that we have at most one request that requires peft_bwd - assert(num_peft_requests <= 1); - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - continue; - } - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_length; - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); - LoraLinearWeight weight = - m->model_state[bc->requestsInfo[i].peft_model_id].weights; - int rank = weight.rank; - void *intermediate_result_ptr = nullptr; - if (bc->requestsInfo[i].peft_bwd) { - size_t activation_size_needed1 = - data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; - size_t activation_size_needed2 = - 
data_type_size(m->input_type[1]) * max_peft_tokens * rank; - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - if (activation_size_needed1 > m->allocated_peft_buffer_size1) { - m->input_activation = - allocator->allocate_instance_untyped(activation_size_needed1); - m->allocated_peft_buffer_size1 = activation_size_needed1; - } - if (activation_size_needed2 > m->allocated_peft_buffer_size2) { - m->low_rank_activation = - allocator->allocate_instance_untyped(activation_size_needed2); - m->allocated_peft_buffer_size2 = activation_size_needed2; - } - // copy input activation - checkCUDA(cudaMemcpyAsync(m->input_activation, - input_ptr + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * - num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - intermediate_result_ptr = m->low_rank_activation; - } else { - // use workspace to save intermediate result - assert(m->handle.workSpaceSize >= - data_type_size(m->input_type[1]) * num_peft_tokens * rank); - intermediate_result_ptr = m->handle.workSpace; - } - // buffer = weight_first * input - // [rank, num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - rank, - num_peft_tokens, - in_dim, - &alpha, - weight.w0_ptr, - weight_type, - in_dim, - input_ptr + first_token_offset * in_dim, - input_type, - in_dim, - &beta, - intermediate_result_ptr, - lr_actv_type, - rank, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // output = weight_second * buffer - // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] - // Note that we use alpha in both places since we do - // an in-place update for LoraLinear - float lora_alpha = - m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; - DT scaling_constant = (DT)(lora_alpha / rank); - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_dim, - num_peft_tokens, - rank, - &scaling_constant, - weight.w1_ptr, - weight_type, - rank, - intermediate_result_ptr, - lr_actv_type, - rank, - &alpha, - output_ptr + first_token_offset * out_dim, - output_type, - out_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } -} -#endif template void inference_kernel(LoraLinearMeta *m, @@ -342,6 +208,8 @@ void inference_kernel(LoraLinearMeta *m, if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + std::cout << "Lora layer activated!" << std::endl; + std::cout << "Lora Config: " << peft_model_config_str << std::endl; assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; @@ -443,6 +311,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, @@ -471,6 +340,8 @@ void peft_bwd_kernel(Context ctx, if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + std::cout << "Lora layer activated!" << std::endl; + std::cout << "Lora Config: " << peft_model_config_str << std::endl; assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); m->peft_memory_manager->check_ft_model_id( @@ -488,6 +359,13 @@ void peft_bwd_kernel(Context ctx, DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) ? 
0.0f : 1.0f; + std::cout << "Lora B gradient computation, beta = " << (float) beta << std::endl; + if (m->inference_debugging) { + // save result to file for checking + std::string filename = get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; + std::cout << "Save low_rank_activation (" << lora_config.rank << ", " << num_peft_tokens << ") to " << filename << std::endl; + save_tensor(static_cast(weight.low_rank_activation), lora_config.rank*num_peft_tokens, filename.c_str()); + } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index f17f69a7c9..5f67709358 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -136,133 +136,6 @@ void FFModel::add_lora_layers(std::vector target_modules) { } } -#ifdef DEADCODE -PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { - assert(config.enable_peft && - "Cannot add a LoRA layer if PEFT mode is not enabled"); - if (peft_config.target_modules.size() == 0) { - printf("PEFT config does not contain any target module\n"); - std::cout << peft_config << std::endl; - assert(false); - } - PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); - peft_configs[*peft_model_id] = peft_config; - - for (std::string target_module_name : peft_config.target_modules) { - assert(target_module_name.length() > 0 && - "LoRA target module name is empty"); - // find target layer - for (auto it = layers.begin(); it != layers.end(); ++it) { - Layer *target_module = *it; - bool match = check_lora_layer_match(target_module, target_module_name); - if (!match) { - continue; - } - - if (base_layer_to_peft_layer.find(target_module) != - base_layer_to_peft_layer.end()) { - // lora linear layer already added, no need to add again - Layer *peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); - } else { - Tensor const input = target_module->inputs[0]; - Tensor const output = target_module->outputs[0]; - assert(input->data_type == output->data_type); - std::string name_ = target_module->name - ? 
std::string(target_module->name) - : std::string(""); - size_t last_underscore = name_.length() - 1; - for (int i = name_.length() - 1; i > 0; i--) { - if (!(std::isdigit(target_module->name[i]) || - target_module->name[i] == '_')) { - break; - } else if (target_module->name[i] == '_') { - last_underscore = i; - } - } - name_.erase(last_underscore); - - name_ += ".lora"; - std::cout << "Adding layer " << name_ << std::endl; - Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name_.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - // fix LoRA layer's transformer layer ID and model ID - peft_layer->layer_guid.transformer_layer_id = - target_module->layer_guid.transformer_layer_id; - peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; - } - peft_layer->outputs[0] = - create_tensor_legion_ordering(numdims, - dims, - output->data_type, - peft_layer, - 0, - true /*create_grad*/); - } - it = layers.insert(it + 1, peft_layer); - ++it; - base_layer_to_peft_layer[target_module] = peft_layer; - peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); - } - } - } - - // save finetuned lora model configs to file - if (peft_config.trainable) { - std::string finetuned_model_folder = join_path({ - peft_config.cache_folder, - "finetuned_models", - peft_config.peft_model_id, - }); - fs::remove_all(finetuned_model_folder); - std::string finetuned_model_config_folder = join_path({ - finetuned_model_folder, - "config", - }); - fs::create_directories(finetuned_model_config_folder); - std::string lora_linear_config_filepath = join_path({ - finetuned_model_config_folder, - "ff_config.json", - }); - serialize_to_json_file(peft_config, lora_linear_config_filepath); - std::string optimizer_config_filepath = join_path({ - finetuned_model_config_folder, - "ff_optimizer_config.json", - }); - if (typeid(*peft_config.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - peft_config.optimizer_config); - serialize_to_json_file(*sgd_config, optimizer_config_filepath); - } else if (typeid(*peft_config.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - LoraAdamOptimizerConfig const *adam_config = - static_cast( - peft_config.optimizer_config); - serialize_to_json_file(*adam_config, optimizer_config_filepath); - } else { - assert(false && "Optimizer not supported"); - } - } - - return peft_model_id; -} -#endif - Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, @@ -272,15 +145,6 @@ Op *LoraLinear::create_operator_from_layer( int max_rank = value; layer->get_int_property("max_concurrent_adapters", value); int max_concurrent_adapters = value; -#ifdef DEADCODE - std::unordered_map _peft_configs; - std::vector const &peft_ids = - model.peft_layer_to_peft_id[(Layer *)layer]; - for (int i = 0; i < peft_ids.size(); i++) { - _peft_configs.emplace( - std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); - } -#endif return new LoraLinear(model, layer->layer_guid, inputs[0], @@ -982,7 +846,7 @@ void LoraLinear::peft_bwd_task(Task const *task, int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(ctx, runtime, m, bc, 
input_grad, output_grad); + peft_bwd_kernel_wrapper(ctx, runtime, m, bc, shard_id, input_grad, output_grad); save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); @@ -1018,14 +882,6 @@ bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { if (lhs.layer_guid == rhs.layer_guid && lhs.max_rank == rhs.max_rank && lhs.max_concurrent_adapters == rhs.max_concurrent_adapters && strcmp(lhs.name, rhs.name) == 0) { -#ifdef DEADCODE - for (auto const &kv : lhs.peft_configs) { - auto it = rhs.peft_configs.find(kv.first); - if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { - return false; - } - } -#endif return true; } return false; @@ -1066,50 +922,6 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->max_rank); sez.serialize(this->max_concurrent_adapters); -#ifdef DEADCODE - sez.serialize(this->op_type); - sez.serialize(this->peft_configs.size()); - for (auto const &kv : this->peft_configs) { - // Serialize PEFTModelID - sez.serialize(kv.first.id); - - // Serialize LoraLinearConfig and OptimizerConfig to tmp folder - // 1. Create tmp dir and serialize it - fs::path unique_temp_dir = create_unique_temp_directory(); - serialize_string(sez, unique_temp_dir.string()); - // 2. Dump LoraLinearConfig to json file in tmp dir - std::string lora_config_filename = std::string("lora_linear_config_") + - std::to_string(kv.first.id) + - std::string(".json"); - fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; - serialize_to_json_file(kv.second, lora_config_json_filepath); - // 3. Dump optimizer to json file in tmp dir, and serialize optimizer type - std::string optimizer_filename = std::string("optimizer_config_") + - std::to_string(kv.first.id) + - std::string(".json"); - fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; - assert((kv.second.trainable) == (kv.second.optimizer_config != nullptr)); - if (kv.second.trainable) { - if (typeid(*kv.second.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - sez.serialize(OPTIMIZER_TYPE_SGD); - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - kv.second.optimizer_config); - serialize_to_json_file(*sgd_config, optim_config_filepath); - } else if (typeid(*kv.second.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - sez.serialize(OPTIMIZER_TYPE_ADAM); - LoraAdamOptimizerConfig const *adam_config = - static_cast( - kv.second.optimizer_config); - serialize_to_json_file(*adam_config, optim_config_filepath); - } else { - assert(false && "Optimizer type not yet supported"); - } - } - } -#endif sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1135,58 +947,6 @@ Node LoraLinear::deserialize(FFModel &ff, dez.deserialize(deserialized_model_id); dez.deserialize(max_rank); dez.deserialize(max_concurrent_adapters); -#ifdef DEADCODE - dez.deserialize(op_type); - dez.deserialize(num_pefts); - for (int i = 0; i < num_pefts; i++) { - // Deserialize PEFTModelID - size_t pid; - dez.deserialize(pid); - PEFTModelID peft_model_id(pid); - // Deserialize tmp folder containing LoraLinearConfig and optimizer config - fs::path unique_temp_dir = fs::path(deserialize_string(dez)); - // 1. 
Deserialize LoraLinearConfig - std::string lora_config_filename = std::string("lora_linear_config_") + - std::to_string(pid) + - std::string(".json"); - fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; - std::unique_ptr lora_linear_config = - deserialize_from_json_file(lora_config_json_filepath); - // 2. Deserialize optimizer if needed - if (lora_linear_config->trainable) { - std::string optimizer_filename = std::string("optimizer_config_") + - std::to_string(pid) + - std::string(".json"); - fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; - OptimizerType type_; - dez.deserialize(type_); - if (type_ == OPTIMIZER_TYPE_SGD) { - std::unique_ptr sgd_optimizer_config = - deserialize_from_json_file( - optim_config_filepath); - lora_linear_config->optimizer_config = - dynamic_cast(sgd_optimizer_config.release()); - } else if (type_ == OPTIMIZER_TYPE_ADAM) { - std::unique_ptr adam_optimizer_config = - deserialize_from_json_file( - optim_config_filepath); - lora_linear_config->optimizer_config = - dynamic_cast( - adam_optimizer_config.release()); - } else { - printf("Optimizer type: %d\n", type_); - assert(false && "Optimizer type not yet supported"); - } - } - try { - fs::remove_all(unique_temp_dir); - } catch (fs::filesystem_error const &e) { - std::cerr << "Error removing tmp directory: " << e.what() << std::endl; - } - params.peft_configs.emplace( - std::make_pair(peft_model_id, *lora_linear_config)); - } -#endif dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); @@ -1236,19 +996,6 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.model_id); hash_combine(key, params.max_rank); hash_combine(key, params.max_concurrent_adapters); -#ifdef DEADCODE - for (auto const &kv : params.peft_configs) { - hash_combine(key, kv.first.id); - hash_combine(key, kv.second.rank); - hash_combine(key, kv.second.trainable); - hash_combine(key, kv.second.cache_folder); - hash_combine(key, kv.second.peft_model_id); - hash_combine(key, kv.second.lora_alpha); - hash_combine(key, kv.second.lora_dropout); - hash_combine(key, kv.second.target_modules); - hash_combine(key, kv.second.init_lora_weights); - } -#endif return key; } }; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 4eb59bc53f..4bc75d17e4 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -282,18 +282,18 @@ LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( if (!j["optimizer_config"].is_null()) { optimizer_config_ = LoraOptimizerConfig::fromJson(j["optimizer_config"]); } - LoraLinearConfig config( - j["cache_folder"].get(), - j["peft_model_id"].get(), - j["trainable"].get(), - optimizer_config_, // optimizer_config will be set later if present - j["init_lora_weights"].get(), - j["base_model_name_or_path"].get(), - j["precision"].get(), - j["rank"].get(), - j["lora_alpha"].get(), - j["lora_dropout"].get(), - j["target_modules"].get>()); + LoraLinearConfig config = LoraLinearConfig::EmptyConfig; + config.cache_folder = j["cache_folder"].get(); + config.peft_model_id = j["peft_model_id"].get(); + config.rank = j["rank"].get(); + config.lora_alpha = j["lora_alpha"].get(); + config.lora_dropout = j["lora_dropout"].get(); + config.target_modules = j["target_modules"].get>(); + config.trainable = j["trainable"].get(); + config.init_lora_weights = j["init_lora_weights"].get(); + config.base_model_name_or_path = 
j["base_model_name_or_path"].get(); + config.precision = j["precision"].get(); + config.optimizer_config = optimizer_config_; return config; } diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 2dd9a4711b..bd33076309 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -23,7 +23,7 @@ using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { // allocate chunk of memory for all the PEFT adapters Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(max_lora_size - 1)); + Realm::Point<1, coord_t>(max_lora_size*max_concurrent_adapters - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); Realm::RegionInstance::create_instance(peftLegionInst, @@ -39,7 +39,7 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size = max_lora_size * 3; // weights, gradients, momentum values ft_size += - max_peft_tokens * (in_dim + max_rank); // input, low-rank activations + max_peft_tokens * (in_dim + max_rank) * data_type_size(dt); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(ft_size - 1)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a25677b22e..7d1e338d8f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -268,8 +268,9 @@ void RequestManager::set_peft_config(PEFTModelID const &peft_model_id, // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - peft_configs[peft_model_id] = LoraLinearConfig::deserialize_from_json_string( - peft_config.serialize_to_json_string()); + // LoraLinearConfig new_config = LoraLinearConfig::deserialize_from_json_string( + // peft_config.serialize_to_json_string()); + peft_configs[peft_model_id] = peft_config; } LoraLinearConfig const & @@ -304,6 +305,7 @@ PEFTModelID * std::cout << peft_config << std::endl; assert(false); } + std::cout << "Registering PEFT adapter" << peft_config.serialize_to_json_string() << std::endl; // go over base_layer_to_peft_layer and check that you can find at least one // match for (int i = 0; i < peft_config.target_modules.size(); i++) { @@ -699,6 +701,8 @@ void RequestManager::add_peft_config_to_request_info( std::string peft_config_str = peft_config.serialize_to_json_string(); std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, peft_config_str.c_str()); + // std::cout << "Added PEFT config to request info: " + // << bc.requestsInfo[req_idx].peft_model_config_str << std::endl; } BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index a2fc5548ab..8a53ef8c9c 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -77,7 +77,7 @@ def main(): if args.save_peft_tensors: make_debug_dirs() register_peft_hooks(model) - save_model_weights(model, target_modules=["lora", "lm_head", "down_proj"]) + save_model_weights(model, target_modules=["lora", "lm_head", "down_proj", "up_proj"]) # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index cc677cd51a..bc9d8d9d24 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ 
-17,7 +17,7 @@ def check_bwd_pass(self): def check_step(self, step_idx, learning_rate=0.001): raise NotImplementedError() -class LllamaAlignmentTest(AlignmentTest): +class LlamaAlignmentTest(AlignmentTest): def __init__(self, model_name, tp_degree=1): self.model_name = model_name self.peft_config = PeftConfig.from_pretrained(model_name) @@ -538,11 +538,47 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print(f"w3 {i} grad output") + print("flexflow tensor shape:", ff_tensor.squeeze().shape) + print(ff_tensor.squeeze()) + print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + print(hf_tensor.squeeze().T) compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") + # print(f"W3 {i} output matches!") + # print(f"FF shape: {ff_tensor.shape}") + # print(f"HF shape: {hf_tensor.shape}") + + # hf_w3_output = hf_tensor.clone() + + # W3 (up_proj) input input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # w3_input_torch = torch.matmul(hf_tensor, torch.transpose(ff_tensor, 0, 1)) + # ff_up_proj_weight_path="/usr/.cache/flexflow/debug/flexflow/weights/step_0/shard_0/layers.11.layers.11.mlp.up_proj.weight_0" + # hf_up_proj_weight_path="/usr/.cache/flexflow/debug/huggingface/weights/step_0/layers.11.mlp.up_proj.weight" + # hf_up_proj_weight = torch.load(hf_up_proj_weight_path, map_location='cpu') + # print(hf_up_proj_weight.shape) + # ff_up_proj_weight = load_ff_tensor(ff_up_proj_weight_path, hf_up_proj_weight.shape[::-1]) + # print(ff_up_proj_weight.shape) + # ff_up_proj_weight = torch.from_numpy(ff_up_proj_weight).to(hf_up_proj_weight.dtype) + # assert torch.allclose(hf_up_proj_weight.T, ff_up_proj_weight, atol=1e-5) + + # print("HF W3 output shape:", hf_w3_output.shape) + # print("HF W3 weight shape:", hf_up_proj_weight.shape) + # print("HF W3 input shape:", hf_tensor.shape) + + # simulated_w3_input = torch.matmul(hf_w3_output.squeeze(), hf_up_proj_weight) + # print("simulated W3 input shape:", simulated_w3_input.T.shape) + # print(simulated_w3_input.T) + print(f"w3 {i} grad input") + print("flexflow tensor shape:", ff_tensor.squeeze().shape) + print(ff_tensor.squeeze()) + print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + print(hf_tensor.squeeze().T) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") # Attn O-proj @@ -695,7 +731,24 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.REPLICATE) + + lora_low_rank_activation_fwd_path = f"/usr/.cache/flexflow/debug/flexflow/fwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + lora_low_rank_activation_bwd_path = 
f"/usr/.cache/flexflow/debug/flexflow/bwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + lora_low_rank_activation_fwd = load_ff_tensor(lora_low_rank_activation_fwd_path, [16, 128])[:,:self.num_tokens] + lora_low_rank_activation_fwd = torch.from_numpy(lora_low_rank_activation_fwd) + lora_low_rank_activation_bwd = load_ff_tensor(lora_low_rank_activation_bwd_path, [16, 24]) + lora_low_rank_activation_bwd = torch.from_numpy(lora_low_rank_activation_bwd) + torch.testing.assert_close(lora_low_rank_activation_fwd, lora_low_rank_activation_bwd, rtol=1.3e-6, atol=1e-5) + + print(f"LoRA_B {i} gradient") + print("FlexFlow shape: ", ff_gradient.shape) + print(ff_gradient) + print("HuggingFace shape: ", hf_gradient.shape) + print(hf_gradient.squeeze().T) compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") + + + # ff_out_gradient_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora.output_gradient_0" # ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") # ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") @@ -737,7 +790,7 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): args = parser.parse_args() if __name__ == "__main__": - llama_alignment = LllamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) + llama_alignment = LlamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) # llama_alignment.check_weights_alignment() for i in range(args.num_steps): llama_alignment.check_fwd_pass(i) diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 173fb37fd9..b7adce8028 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -34,7 +34,7 @@ export LEGION_BACKTRACE=1 python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m # Run PEFT in Huggingface to get ground truth tensors -python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision +python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision -lr 1.0 # Python test echo "Python test" @@ -45,8 +45,8 @@ echo "Python test" # C++ test echo "C++ test" ./build/inference/peft/peft \ - -ll:gpu 2 -ll:cpu 4 -ll:util 4 \ - -tensor-parallelism-degree 2 \ + -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 1 \ -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ./inference/prompt/peft_dataset.json \ @@ -55,7 +55,7 @@ echo "C++ test" --use-full-precision \ --inference-debugging # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 2 +python ./tests/peft/peft_alignment_test.py -tp 1 -lr 1.0 # Print succeess message echo "" From b56ebd3aab4f7eb4ae77890869437258b6bbe150 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 8 Nov 2024 06:19:18 +0000 Subject: [PATCH 26/37] fix reset input grad for non-activated loras --- include/flexflow/operator.h | 2 +- include/flexflow/ops/kernels/linear_kernels.h | 2 + .../ops/kernels/lora_linear_kernels.h | 17 ------- src/ops/fused.cu | 1 + src/ops/kernels/linear_kernels.cu | 45 +++++++++++++++++++ src/ops/kernels/lora_linear_kernels.cu | 34 +++++++------- src/ops/linear.cc | 1 + src/ops/lora_linear.cc | 3 +- src/ops/lora_linear_params.cc | 3 +- src/runtime/inference_manager.cc | 4 +- src/runtime/peft_weight_allocator.cc | 9 ++-- src/runtime/request_manager.cc | 6 ++- tests/peft/peft_alignment_test.py | 43 +++++++++++------- 13 files changed, 111 insertions(+), 
59 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 007314797a..c108740ef3 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -280,7 +280,7 @@ class Op { // get operator name and print it std::string op_name_without_uid = get_op_name_without_uid(m); std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid - << std::endl; + << (before_kernel ? " (before kernel)" : "") << std::endl; // build the path to save the tensor fs::path dst_filepath; if (fwd_pass) { diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 90e50a0c9a..aaa845db23 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -61,6 +61,7 @@ void inference_kernel_wrapper(LinearMeta *m, int out_dim, int batch_size); void peft_bwd_kernel_wrapper(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, @@ -94,6 +95,7 @@ void forward_kernel(LinearMeta const *m, ffStream_t stream); template void peft_bwd_kernel(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index b17868fb96..fd86dc68c0 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -13,27 +13,10 @@ namespace FlexFlow { using Legion::Context; using Legion::Runtime; -#ifdef DEADCODE -struct LoraLinearModelState { - LoraLinearWeight weights; - LoraOptimizerConfig const *optimizer_config; - float lora_alpha; - std::string cache_folder; - // Huggingface model ID (for download and/or upload) - std::string peft_model_id; -}; -#endif - class LoraLinearMeta : public OpMeta { public: LoraLinearMeta(FFHandler handle, LoraLinear const *li); ~LoraLinearMeta(void); - // PEFT related fields - // void *low_rank_activation; - // void *input_activation; - // std::unordeded_map model_state; - // std::unordered_map model_state; - // size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; PEFTMemoryManager *peft_memory_manager; }; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index c615a104d2..8635fd6a87 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -862,6 +862,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); Kernels::Linear::peft_bwd_kernel_wrapper(m, + bc, my_input_grad_accessor[0].ptr, my_output_grad_accessor[0].ptr, my_weight_accessor[0].ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 3832428c64..51954597d7 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -16,6 +16,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/lora_linear_params.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -73,6 +74,17 @@ LinearMeta::~LinearMeta(void) { } } +bool lora_applies_to_this_layer(LinearMeta const *m, + LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + namespace Kernels { namespace Linear { @@ -285,6 +297,7 @@ 
void inference_kernel_wrapper(LinearMeta *m, } void peft_bwd_kernel_wrapper(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *weight_ptr, @@ -302,6 +315,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, } if (m->input_type[0] == DT_FLOAT) { Internal::peft_bwd_kernel(m, + bc, input_grad_ptr, output_grad_ptr, weight_ptr, @@ -312,6 +326,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, stream); } else if (m->input_type[0] == DT_HALF) { Internal::peft_bwd_kernel(m, + bc, input_grad_ptr, output_grad_ptr, weight_ptr, @@ -568,6 +583,7 @@ void forward_kernel(LinearMeta const *m, template void peft_bwd_kernel(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, @@ -611,6 +627,35 @@ void peft_bwd_kernel(LinearMeta const *m, // NOTE: we use beta=1 for input_grad to accumulate gradients when needed DT alpha = 1.0f; DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + + // ensure that we only have one finetuning request, with a single lora + int num_peft_requests = 0; + bool lora_applies = false; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + continue; + } + num_peft_requests++; + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + lora_applies = true; + } + assert(num_peft_requests == 1 && + "Exactly one PEFT finetuning request is required"); + // if the request does not have any active lora in the current layer, reset + // beta to 0 std::cout << m->op_name << " original beta: " << (float)beta << " + // lora_applies: " << lora_applies << std::endl; + if (lora_applies) { + beta = 1.0f; + } + if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index dabe40c501..40095484b5 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -23,14 +23,12 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) - : OpMeta(handler, li) { -} + : OpMeta(handler, li) {} LoraLinearMeta::~LoraLinearMeta(void) {} -std::string get_peft_dbg_folder(LoraLinearMeta const *m, - int shard_id, - bool is_fwd) { +std::string + get_peft_dbg_folder(LoraLinearMeta const *m, int shard_id, bool is_fwd) { std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); fs::path dst_filepath; if (is_fwd) { @@ -51,8 +49,6 @@ std::string get_peft_dbg_folder(LoraLinearMeta const *m, namespace Kernels { namespace LoraLinear { - - void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, @@ -174,7 +170,6 @@ bool lora_applies_to_this_layer(LoraLinearMeta *m, namespace Internal { - template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -208,8 +203,8 @@ void inference_kernel(LoraLinearMeta *m, if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - std::cout << "Lora layer activated!" << std::endl; - std::cout << "Lora Config: " << peft_model_config_str << std::endl; + // std::cout << "Lora layer activated!" 
<< std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; @@ -311,7 +306,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, - int shard_id, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, @@ -340,8 +335,8 @@ void peft_bwd_kernel(Context ctx, if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - std::cout << "Lora layer activated!" << std::endl; - std::cout << "Lora Config: " << peft_model_config_str << std::endl; + // std::cout << "Lora layer activated!" << std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); m->peft_memory_manager->check_ft_model_id( @@ -359,12 +354,17 @@ void peft_bwd_kernel(Context ctx, DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) ? 0.0f : 1.0f; - std::cout << "Lora B gradient computation, beta = " << (float) beta << std::endl; + // std::cout << "Lora B gradient computation, beta = " << (float) beta << + // std::endl; if (m->inference_debugging) { // save result to file for checking - std::string filename = get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; - std::cout << "Save low_rank_activation (" << lora_config.rank << ", " << num_peft_tokens << ") to " << filename << std::endl; - save_tensor(static_cast(weight.low_rank_activation), lora_config.rank*num_peft_tokens, filename.c_str()); + std::string filename = + get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; + std::cout << "Save low_rank_activation (" << lora_config.rank << ", " + << num_peft_tokens << ") to " << filename << std::endl; + save_tensor(static_cast(weight.low_rank_activation), + lora_config.rank * num_peft_tokens, + filename.c_str()); } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 09170d3c28..8c2120e283 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -769,6 +769,7 @@ void Linear::peft_bwd_task(Task const *task, num_peft_tokens); } peft_bwd_kernel_wrapper(m, + bc, input_grad.ptr, output_grad.ptr, weight.ptr, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 5f67709358..68605160a5 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -846,7 +846,8 @@ void LoraLinear::peft_bwd_task(Task const *task, int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(ctx, runtime, m, bc, shard_id, input_grad, output_grad); + peft_bwd_kernel_wrapper( + ctx, runtime, m, bc, shard_id, input_grad, output_grad); save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 4bc75d17e4..69c0081ec9 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -291,7 +291,8 @@ LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( config.target_modules = j["target_modules"].get>(); config.trainable = j["trainable"].get(); config.init_lora_weights = j["init_lora_weights"].get(); - config.base_model_name_or_path = j["base_model_name_or_path"].get(); + config.base_model_name_or_path = + j["base_model_name_or_path"].get(); 
config.precision = j["precision"].get(); config.optimizer_config = optimizer_config_; return config; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index f39ea91f28..45b6ba0db8 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -273,7 +273,9 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } reset_inputs.insert(op->inputs[i]->region); } else { - reset_inputs.insert(op->inputs[i]->region); + if (op->op_type != OP_LORA) { + reset_inputs.insert(op->inputs[i]->region); + } } } } diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index bd33076309..1fcef3678e 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -22,8 +22,9 @@ using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { // allocate chunk of memory for all the PEFT adapters - Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(max_lora_size*max_concurrent_adapters - 1)); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size * max_concurrent_adapters - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); Realm::RegionInstance::create_instance(peftLegionInst, @@ -38,8 +39,8 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size = max_lora_size * 3; // weights, gradients, momentum values - ft_size += - max_peft_tokens * (in_dim + max_rank) * data_type_size(dt); // input, low-rank activations + ft_size += max_peft_tokens * (in_dim + max_rank) * + data_type_size(dt); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(ft_size - 1)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7d1e338d8f..798da75b01 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -268,7 +268,8 @@ void RequestManager::set_peft_config(PEFTModelID const &peft_model_id, // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - // LoraLinearConfig new_config = LoraLinearConfig::deserialize_from_json_string( + // LoraLinearConfig new_config = + // LoraLinearConfig::deserialize_from_json_string( // peft_config.serialize_to_json_string()); peft_configs[peft_model_id] = peft_config; } @@ -305,7 +306,8 @@ PEFTModelID * std::cout << peft_config << std::endl; assert(false); } - std::cout << "Registering PEFT adapter" << peft_config.serialize_to_json_string() << std::endl; + std::cout << "Registering PEFT adapter" + << peft_config.serialize_to_json_string() << std::endl; // go over base_layer_to_peft_layer and check that you can find at least one // match for (int i = 0; i < peft_config.target_modules.size(); i++) { diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index bc9d8d9d24..ee82b298e0 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -485,12 +485,16 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient output") + 
down_proj_grad_output_pre = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE, pre=True) + down_proj_grad_output = ff_tensor.clone() + compare_loaded_tensors(down_proj_grad_output, down_proj_grad_output_pre) # LoRA_B hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + lora_grad_output = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) * self.lora_scaling_factor compare(hf_tensor, ff_tensor, label=f"LoRA_B {i} gradient output") @@ -501,6 +505,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} gradient input") + lora_a_grad_input = ff_tensor.clone() # W2 (down_proj) input hf_tensor_name = f"layers.{i}.mlp.down_proj" @@ -508,7 +513,15 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + down_proj_grad_input_pre = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, pre=True) compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient input") + + # down proj output (before/after kernel) should match output of lora_b + compare_loaded_tensors(down_proj_grad_output, lora_grad_output) + # down proj input (before kernel) should match input of lora_a + compare_loaded_tensors(down_proj_grad_input_pre, lora_a_grad_input) + # compare_loaded_tensors(down_proj_grad_input_pre.squeeze(), ff_tensor.squeeze()) + # W2 input (HF) and SigmoidSiluMulti output (FF) hf_w2_input = hf_tensor.clone() @@ -538,11 +551,11 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) - print(f"w3 {i} grad output") - print("flexflow tensor shape:", ff_tensor.squeeze().shape) - print(ff_tensor.squeeze()) - print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) - print(hf_tensor.squeeze().T) + # print(f"w3 {i} grad output") + # print("flexflow tensor shape:", ff_tensor.squeeze().shape) + # print(ff_tensor.squeeze()) + # print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + # print(hf_tensor.squeeze().T) compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") # print(f"W3 {i} output matches!") # print(f"FF shape: {ff_tensor.shape}") @@ -573,11 +586,11 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # simulated_w3_input = torch.matmul(hf_w3_output.squeeze(), hf_up_proj_weight) # 
print("simulated W3 input shape:", simulated_w3_input.T.shape) # print(simulated_w3_input.T) - print(f"w3 {i} grad input") - print("flexflow tensor shape:", ff_tensor.squeeze().shape) - print(ff_tensor.squeeze()) - print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) - print(hf_tensor.squeeze().T) + # print(f"w3 {i} grad input") + # print("flexflow tensor shape:", ff_tensor.squeeze().shape) + # print(ff_tensor.squeeze()) + # print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + # print(hf_tensor.squeeze().T) compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") @@ -740,11 +753,11 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): lora_low_rank_activation_bwd = torch.from_numpy(lora_low_rank_activation_bwd) torch.testing.assert_close(lora_low_rank_activation_fwd, lora_low_rank_activation_bwd, rtol=1.3e-6, atol=1e-5) - print(f"LoRA_B {i} gradient") - print("FlexFlow shape: ", ff_gradient.shape) - print(ff_gradient) - print("HuggingFace shape: ", hf_gradient.shape) - print(hf_gradient.squeeze().T) + # print(f"LoRA_B {i} gradient") + # print("FlexFlow shape: ", ff_gradient.shape) + # print(ff_gradient) + # print("HuggingFace shape: ", hf_gradient.shape) + # print(hf_gradient.squeeze().T) compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") From 3632754422355eb3cd7c630e6dcdaa6944530972 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 8 Nov 2024 16:35:37 +0000 Subject: [PATCH 27/37] fix --- src/runtime/model.cc | 57 +++---------------------------- tests/peft/peft_alignment_test.py | 3 +- tests/peft_test.sh | 6 ++-- 3 files changed, 10 insertions(+), 56 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index de798890ef..465ee21fc9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3420,63 +3420,16 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ( - // l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - // l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (std::string(l->name).find("attn.o_proj") != std::string::npos) || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_LINEAR && std::string(l->name).find("attn.o_proj") != std::string::npos) || + is_mlp_block(layer_idx) || + (l->op_type == OP_LINEAR && std::string(l->name).find("mlp.down_proj") != std::string::npos) + )) { return true; } return false; } -#ifdef DEADCODE -bool FFModel::need_to_add_parallel_identity(int layer_idx) const { - auto const &l = layers[layer_idx]; - // add 
parallel identity (allreduce in the backward pass) before the lm head - // we find the lm head by looking for the linear layer right after a residual - // rms norm / layer norm, and before a softmax, followed by - // argmax/argtopk/sampling - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ((l->op_type == OP_RESIDUAL_RMS_NORM || - l->op_type == OP_RESIDUAL_LAYERNORM) && - // there are at least 2 layers before the norm, and at least 3 following - // the norm - layer_idx >= 2 && layer_idx < layers.size() - 3 && - // norm is followed by linear layer (lm head) - layers[layer_idx + 1]->op_type == OP_LINEAR && - // lm head is followed by softmax - layers[layer_idx + 2]->op_type == OP_SOFTMAX && - // softmax is followed by argmax/argtopk/sampling - (layers[layer_idx + 3]->op_type == OP_ARG_TOPK || - layers[layer_idx + 3]->op_type == OP_SAMPLING || - layers[layer_idx + 3]->op_type == OP_ARGMAX || - layers[layer_idx + 3]->op_type == OP_SCALAR_TRUE_DIV))) { - return true; - } - return false; -} -#endif bool FFModel::need_to_add_parallel_identity(int layer_idx) const { auto const &l = layers[layer_idx]; // add parallel identity (allreduce in the backward pass) before the lm head diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index ee82b298e0..c4db87c099 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -655,7 +655,8 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance ff_tensor_name = f"layers.{i}.layers.{i}.input_layernorm" _output_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=1) input_layernorm_out1 = get_ff_tensor(ff_tensor_name, _output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) - torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) + compare_loaded_tensors(attn_input, input_layernorm_out1, tolerance=1e-5) + # torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) # Input layernorm diff --git a/tests/peft_test.sh b/tests/peft_test.sh index b7adce8028..6152844f5e 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -45,8 +45,8 @@ echo "Python test" # C++ test echo "C++ test" ./build/inference/peft/peft \ - -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ - -tensor-parallelism-degree 1 \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ./inference/prompt/peft_dataset.json \ @@ -55,7 +55,7 @@ echo "C++ test" --use-full-precision \ --inference-debugging # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 1 -lr 1.0 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 1.0 # Print succeess message echo "" From fca3d95db5b23da604734dd7705a1be33f32e2fa Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 8 Nov 2024 21:14:55 +0000 Subject: [PATCH 28/37] update --- inference/python/streamlit/app.py | 21 +++++---------- inference/python/streamlit/fastapi_incr.py | 31 +++++++++++++--------- python/flexflow/core/flexflow_cffi.py | 1 + 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py index 4d8633e167..9788765a3a 100644 --- a/inference/python/streamlit/app.py +++ b/inference/python/streamlit/app.py @@ -8,7 +8,7 @@ st.set_page_config(page_title="🚀💻 FlexLLM Server", layout="wide") # FastAPI server URL 
-FASTAPI_URL = "http://localhost:8000/generate/" # Adjust the port if necessary +FASTAPI_URL = "http://localhost:8000/chat/completions" # Adjust the port if necessary FINETUNE_URL = "http://localhost:8000/finetuning" # Initialize session state variables @@ -30,18 +30,11 @@ def clear_chat_history(): st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] # Function for generating LLaMA2 response -def generate_llama2_response(prompt_input): - string_dialogue = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'." - for dict_message in st.session_state.messages: - if dict_message["role"] == "user": - string_dialogue += "User: " + dict_message["content"] + "\n\n" - else: - string_dialogue += "Assistant: " + dict_message["content"] + "\n\n" - - full_prompt = f"{string_dialogue} {prompt_input} Assistant: " +def generate_llama3_response(prompt_input): + system_prompt="You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." # Send request to FastAPI server - response = requests.post(FASTAPI_URL, json={"prompt": full_prompt}) + response = requests.post(FASTAPI_URL, json={"max_new_tokens": 1024, "messages": [{"role": "system", "content": system_prompt}] + st.session_state.messages + [{"role": "user", "content": prompt_input}]}) if response.status_code == 200: return response.json()["response"] @@ -58,7 +51,7 @@ def generate_llama2_response(prompt_input): st.sidebar.button('Clear Chat History', on_click=clear_chat_history) st.subheader('Generation parameters') - max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8) + max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=2048, value=1024, step=8) # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') @@ -181,8 +174,8 @@ def generate_llama2_response(prompt_input): # Generate a new response if last message is not from assistant if st.session_state.messages[-1]["role"] != "assistant": with st.chat_message("assistant"): - with st.spinner("Thinking..."): - response = generate_llama2_response(prompt) + with st.spinner("Running..."): + response = generate_llama3_response(prompt) placeholder = st.empty() full_response = '' for item in response: diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py index a1095e13dc..6ac7f4149a 100644 --- a/inference/python/streamlit/fastapi_incr.py +++ b/inference/python/streamlit/fastapi_incr.py @@ -46,12 +46,16 @@ class Message(BaseModel): content: str +# class ChatCompletionRequest(BaseModel): +# model: Optional[str] = "mock-gpt-model" +# messages: List[Message] +# max_tokens: Optional[int] = 512 +# temperature: Optional[float] = 0.1 +# stream: Optional[bool] = False + class ChatCompletionRequest(BaseModel): - model: Optional[str] = "mock-gpt-model" + max_new_tokens: Optional[int] = 1024 messages: List[Message] - max_tokens: Optional[int] = 512 - temperature: Optional[float] = 0.1 - stream: Optional[bool] = False # Global variable to 
store the LLM model llm = None @@ -76,12 +80,12 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 4, + "num_gpus": 8, "memory_per_gpu": 20000, "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, - "legion_utility_processors": 4, + "legion_utility_processors": 8, "data_parallelism_degree": 1, "tensor_parallelism_degree": 4, "pipeline_parallelism_degree": 1, @@ -98,7 +102,7 @@ def get_configs(): } llm_configs = { # required parameters - "llm_model": "meta-llama/Meta-Llama-3.1-8B", + "llm_model": "meta-llama/Llama-3.1-8B-Instruct", # optional parameters "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, @@ -139,7 +143,7 @@ async def startup_event(): generation_config, max_requests_per_batch=16, max_seq_length=2048, - max_tokens_per_batch=64, + max_tokens_per_batch=1024, ) llm.start_server() @@ -171,11 +175,12 @@ async def chat_completions(request: ChatCompletionRequest): if llm is None: raise HTTPException(status_code=503, detail="LLM model is not initialized.") - if request.messages and request.messages[0].role == 'user': - resp_content = "As a mock AI Assitant, I can only echo your last message:" + request.messages[-1].content - else: - resp_content = "As a mock AI Assitant, I can only echo your last message, but there were no messages!" - + print("received request:", request) + result = llm.generate([message.dict() for message in request.messages], max_new_tokens=request.max_new_tokens)[0].output_text.decode('utf-8') + print("returning response:", result) + return { + "response": result + } return { "id": "1337", "object": "chat.completion", diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 151b01b873..4ff8348f46 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -4759,6 +4759,7 @@ def generate(self, requests_list: List[Request]): finetuning_losses=finetuning_losses, ) ) + return results def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) From 9a1eae589ab2283d8583bc59190394db4b840a21 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 8 Nov 2024 22:03:33 +0000 Subject: [PATCH 29/37] demo fixes & readme --- inference/python/streamlit/README.md | 18 ++++++++++++++++++ python/flexflow/serve/serve.py | 16 ++++++++++++---- src/runtime/request_manager.cc | 4 ++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/inference/python/streamlit/README.md b/inference/python/streamlit/README.md index e69de29bb2..86a15e2d6d 100644 --- a/inference/python/streamlit/README.md +++ b/inference/python/streamlit/README.md @@ -0,0 +1,18 @@ +# Streamlit demo + +## Instructions + +1. Build and install FlexFlow, or build and run `source ./set_python_envs.sh` from the build folder +2. Edit the FlexFlow/inference/python/streamlit/fastapi_incr.py to configure the model to run and the system configs (num gpus, amount of memory, etc) +3. In one terminal, launch the LLM engine with the commands below, and wait until the model's weights loading completes +``` +cd FlexFlow/inference/python/streamlit +python fastapi_incr.py +``` +4. In another terminal, launch the streamlit app: +``` +cd FlexFlow/inference/python/streamlit +streamlit run app.py +``` +5. Open the URL printed to the terminal, e.g. 
`http://localhost:8501` and interact with the app via browser + diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index d06d59b8c9..9d3fa19706 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -521,7 +521,7 @@ def compile( atexit.register(self.rm.stop_server) - def _generate(self, requests: List[Request]): + def _generate(self, requests: List[Request]) -> List[GenerationResult]: if len(requests) == 0: return [] for req in requests: @@ -554,7 +554,7 @@ def _generate(self, requests: List[Request]): ) return self.model.ffmodel.generate(requests) - def __chat2prompt(self, messages: List[dict]): + def __chat2prompt(self, messages: List[dict]) -> str: """Convert a list of messages to a single prompt string :param messages: The list of messages to convert @@ -573,6 +573,12 @@ def __chat2prompt(self, messages: List[dict]): if self.tokenizer.chat_template is None: raise ValueError(f"Model {self.model_name} does not support chat completion") return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + + def __output2chat_response(self, requests: List[Request], outputs: List[GenerationResult]) -> List[GenerationResult]: + assert(len(requests) == len(outputs)) + for i in range(len(outputs)): + outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt):] + return outputs def generate( self, @@ -626,7 +632,8 @@ def generate( max_new_tokens=max_new_tokens, add_special_tokens=False, ) - return self._generate([request]) + outputs = self._generate([request]) + return self.__output2chat_response([request], outputs) elif type(requests_or_prompts[0]) == list: prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts] requests = [ @@ -639,7 +646,8 @@ def generate( ) for prompt in prompts ] - return self._generate(requests) + outputs = self._generate(requests) + return self.__output2chat_response(requests, outputs) elif type(requests_or_prompts[0]) == Request: print(requests_or_prompts) return self._generate(requests_or_prompts) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 798da75b01..d98d327dba 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -765,6 +765,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(processed_tokens < request.tokens.size()); bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { + if (is_eos_token(request.tokens.back())) { + // remove the EOS token + request.tokens.pop_back(); + } std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token From c71c6b319d1d71bb4e0da16da9aebc05d1a160f8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 03:38:47 +0000 Subject: [PATCH 30/37] load weights in parallel --- include/flexflow/model.h | 3 + include/flexflow/utils/file_loader.h | 28 +++++++++ inference/python/chat.py | 22 ++++--- src/c/flexflow_c.cc | 5 +- src/mapper/mapper.cc | 6 ++ src/runtime/file_loader.cc | 91 ++++++++++++++++++++++++++++ src/runtime/model.cc | 57 +++++++++++++++-- src/runtime/request_manager.cc | 6 +- 8 files changed, 200 insertions(+), 18 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index e50c5f9578..3a80aa6b12 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -278,6 +278,9 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, 
RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, + LOAD_FLOAT_WEIGHT_TASK_ID, + LOAD_HALF_WEIGHT_TASK_ID, + LOAD_QUANT_WEIGHT_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 646eb18da2..44cb15d10f 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -39,7 +39,26 @@ class FileDataLoader { void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); +#ifdef DEADCODE void load_weights(FFModel *ff); +#endif + + static void + load_float_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void + load_half_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void + load_quant_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime); void load_positions(FFModel *ff, Tensor pt, @@ -54,3 +73,12 @@ class FileDataLoader { std::string weights_folder; bool use_full_precision; }; + +struct WeightLoadTaskArgs { + FFModel *ff; + FileDataLoader *loader; + Layer *layer; + int weight_idx; + WeightLoadTaskArgs(FFModel *_ff, FileDataLoader *_loader, Layer *_l, int _idx) + : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx) {} +}; diff --git a/inference/python/chat.py b/inference/python/chat.py index 70b8ee0067..95132443a2 100644 --- a/inference/python/chat.py +++ b/inference/python/chat.py @@ -21,14 +21,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 1, - "memory_per_gpu": 30000, - "zero_copy_memory_per_node": 60000, + "num_gpus": 8, + "memory_per_gpu": 34000, + "zero_copy_memory_per_node": 200000, # optional parameters - "num_cpus": 4, - "legion_utility_processors": 4, + "num_cpus": 16, + "legion_utility_processors": 16, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 1, + "tensor_parallelism_degree": 8, "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB @@ -43,7 +43,7 @@ def get_configs(): } llm_configs = { # required parameters - "llm_model": "meta-llama/Meta-Llama-3-8B-Instruct", + "llm_model": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", # optional parameters "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, @@ -85,11 +85,15 @@ def main(): llm.start_server() + nemotron_system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." + llama_generic_system = "You are a helpful an honest programming assistant." 
+ + messages=[ - {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "system", "content": nemotron_system}, {"role": "user", "content": "Is Rust better than Python?"}, ] - llm.generate(messages, max_new_tokens=256) + llm.generate(messages, max_new_tokens=1024) llm.stop_server() diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 837608c9f6..b4056960f4 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2929,7 +2929,10 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - handle->load_weights(model); + // handle->load_weights(model); + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + handle->load_weights_parallel(model, ctx, runtime); } // // ----------------------------------------------------------------------- diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d7b9a5e99d..e79bf5e371 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -288,6 +288,12 @@ void FFMapper::select_task_options(const MapperContext ctx, output.initial_proc = all_cpus[0]; return; } + if ((task.task_id == LOAD_FLOAT_WEIGHT_TASK_ID) || + (task.task_id == LOAD_HALF_WEIGHT_TASK_ID) || + (task.task_id == LOAD_QUANT_WEIGHT_TASK_ID)) { + output.initial_proc = all_cpus[0]; + return; + } if (task.task_id == TOP_LEVEL_TASK_ID) { output.initial_proc = all_cpus[0]; // control replicate top level task diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index e73893475c..1c1dba32c8 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -16,6 +16,7 @@ #include "flexflow/utils/file_loader.h" #include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/model.h" #include using namespace std; @@ -851,6 +852,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, delete data; } +#ifdef DEADCODE void FileDataLoader::load_weights(FFModel *ff) { for (Layer *l : ff->layers) { if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { @@ -883,3 +885,92 @@ void FileDataLoader::load_weights(FFModel *ff) { } } } +#endif + +void FileDataLoader::load_float_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); +} + +void FileDataLoader::load_half_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); +} + +void FileDataLoader::load_quant_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + args->loader->load_quantization_weight( + args->ff, args->layer, args->weight_idx); +} + +void FileDataLoader::load_weights_parallel(FFModel *ff, + Context ctx, + Runtime *runtime) { + std::vector futures; + + for (Layer *l : ff->layers) { + if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { + continue; + } + + for (int i = 0; i < l->numWeights; i++) { + Tensor weight = 
l->weights[i]; + if (weight == NULL) { + continue; + } + + if (l->op_type == OP_LORA) { + continue; + } + + // Create task arguments + WeightLoadTaskArgs args(ff, this, l, i); + + switch (weight->data_type) { + case DT_HALF: { + TaskLauncher launcher( + LOAD_HALF_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); + break; + } + case DT_FLOAT: { + TaskLauncher launcher( + LOAD_FLOAT_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); + break; + } + case DT_INT4: + case DT_INT8: { + TaskLauncher launcher( + LOAD_QUANT_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); + break; + } + default: + assert(false && "Unsupported data type"); + } + } + } + + // Wait for all tasks to complete + for (Future &f : futures) { + f.get_void_result(); + } +} diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 465ee21fc9..6bb11b6fa5 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3420,11 +3420,13 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; - if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && - ((l->op_type == OP_LINEAR && std::string(l->name).find("attn.o_proj") != std::string::npos) || - is_mlp_block(layer_idx) || - (l->op_type == OP_LINEAR && std::string(l->name).find("mlp.down_proj") != std::string::npos) - )) { + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_LINEAR && + std::string(l->name).find("attn.o_proj") != std::string::npos) || + is_mlp_block(layer_idx) || + (l->op_type == OP_LINEAR && + std::string(l->name).find("mlp.down_proj") != std::string::npos))) { return true; } return false; @@ -4798,6 +4800,51 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(LOAD_FLOAT_WEIGHT_TASK_ID, + "load_float_weight_task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "load_float_weight_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LOAD_HALF_WEIGHT_TASK_ID, + "load_half_weight_task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "load_half_weight_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LOAD_QUANT_WEIGHT_TASK_ID, + "load_quant_weight_task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "load_quant_weight_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } #endif // ElementUnary task { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d98d327dba..fddaae09ce 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -3025,7 +3025,7 @@ void 
RequestManager::serve_incr_decoding(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); // Legion futures for inc_decoding and spec_infer @@ -3087,7 +3087,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); } @@ -3098,7 +3098,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[ssm]->load_weights(ssm); + im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime); // init operators im->init_operators_inference(ssm); } From d54fcf292c6a59204b3c4a8f36098f1c29e74b1f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 15:21:23 +0000 Subject: [PATCH 31/37] cleanup --- include/flexflow/model.h | 4 +- include/flexflow/utils/file_loader.h | 31 +++----- src/c/flexflow_c.cc | 1 - src/mapper/mapper.cc | 4 +- src/runtime/file_loader.cc | 115 +++++++-------------------- src/runtime/model.cc | 39 +-------- 6 files changed, 48 insertions(+), 146 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 3a80aa6b12..e352159af0 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -278,9 +278,7 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, - LOAD_FLOAT_WEIGHT_TASK_ID, - LOAD_HALF_WEIGHT_TASK_ID, - LOAD_QUANT_WEIGHT_TASK_ID, + LOAD_WEIGHT_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 44cb15d10f..8735f23571 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -39,25 +39,12 @@ class FileDataLoader { void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); -#ifdef DEADCODE - void load_weights(FFModel *ff); -#endif static void - load_float_weight_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - static void - load_half_weight_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - static void - load_quant_weight_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); + load_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime); void load_positions(FFModel *ff, @@ -79,6 +66,12 @@ struct WeightLoadTaskArgs { FileDataLoader *loader; Layer *layer; int weight_idx; - WeightLoadTaskArgs(FFModel *_ff, FileDataLoader *_loader, Layer *_l, int _idx) - : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx) {} + DataType data_type; + WeightLoadTaskArgs(FFModel *_ff, + FileDataLoader *_loader, + Layer *_l, + int _idx, + DataType _data_type) + : ff(_ff), loader(_loader), 
layer(_l), weight_idx(_idx), + data_type(_data_type) {} }; diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index b4056960f4..4094fb7b44 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2929,7 +2929,6 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - // handle->load_weights(model); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; handle->load_weights_parallel(model, ctx, runtime); diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index e79bf5e371..c02f70f752 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -288,9 +288,7 @@ void FFMapper::select_task_options(const MapperContext ctx, output.initial_proc = all_cpus[0]; return; } - if ((task.task_id == LOAD_FLOAT_WEIGHT_TASK_ID) || - (task.task_id == LOAD_HALF_WEIGHT_TASK_ID) || - (task.task_id == LOAD_QUANT_WEIGHT_TASK_ID)) { + if (task.task_id == LOAD_WEIGHT_TASK_ID) { output.initial_proc = all_cpus[0]; return; } diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 1c1dba32c8..3ebe6cf095 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -852,69 +852,33 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, delete data; } -#ifdef DEADCODE -void FileDataLoader::load_weights(FFModel *ff) { - for (Layer *l : ff->layers) { - if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { - continue; - } - for (int i = 0; i < l->numWeights; i++) { - Tensor weight = l->weights[i]; - if (weight == NULL) { - continue; - } - // TODO: currently skip Lora layers - if (l->op_type == OP_LORA) { - continue; - } - switch (weight->data_type) { - case DT_HALF: - load_single_weight_tensor(ff, l, i); - break; - case DT_FLOAT: - load_single_weight_tensor(ff, l, i); - break; - case DT_INT4: - case DT_INT8: - // load weights in quantization - load_quantization_weight(ff, l, i); - break; - default: - assert(false && "Unsupported data type"); - } - } - } -} -#endif - -void FileDataLoader::load_float_weight_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime) { - WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; - args->loader->load_single_weight_tensor( - args->ff, args->layer, args->weight_idx); -} - -void FileDataLoader::load_half_weight_task( +void FileDataLoader::load_weight_task( Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime) { WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; - args->loader->load_single_weight_tensor( - args->ff, args->layer, args->weight_idx); -} -void FileDataLoader::load_quant_weight_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime) { - WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; - args->loader->load_quantization_weight( - args->ff, args->layer, args->weight_idx); + switch (args->data_type) { + case DT_HALF: { + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); + break; + } + case DT_FLOAT: { + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); + break; + } + case DT_INT4: + case DT_INT8: { + args->loader->load_quantization_weight( + args->ff, args->layer, args->weight_idx); + break; + } + default: + assert(false && 
"Unsupported data type"); + } } void FileDataLoader::load_weights_parallel(FFModel *ff, @@ -937,35 +901,16 @@ void FileDataLoader::load_weights_parallel(FFModel *ff, continue; } - // Create task arguments - WeightLoadTaskArgs args(ff, this, l, i); - - switch (weight->data_type) { - case DT_HALF: { - TaskLauncher launcher( - LOAD_HALF_WEIGHT_TASK_ID, - TaskArgument(&args, sizeof(WeightLoadTaskArgs))); - futures.push_back(runtime->execute_task(ctx, launcher)); - break; - } - case DT_FLOAT: { - TaskLauncher launcher( - LOAD_FLOAT_WEIGHT_TASK_ID, - TaskArgument(&args, sizeof(WeightLoadTaskArgs))); - futures.push_back(runtime->execute_task(ctx, launcher)); - break; - } - case DT_INT4: - case DT_INT8: { - TaskLauncher launcher( - LOAD_QUANT_WEIGHT_TASK_ID, - TaskArgument(&args, sizeof(WeightLoadTaskArgs))); - futures.push_back(runtime->execute_task(ctx, launcher)); - break; - } - default: - assert(false && "Unsupported data type"); + if (weight->data_type != DT_FLOAT && weight->data_type != DT_HALF && + weight->data_type != DT_INT4 && weight->data_type != DT_INT8) { + assert(false && "Unsupported data type"); } + + // Create task arguments + WeightLoadTaskArgs args(ff, this, l, i, weight->data_type); + TaskLauncher launcher(LOAD_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); } } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 6bb11b6fa5..ca947039d0 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4801,47 +4801,16 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(LOAD_FLOAT_WEIGHT_TASK_ID, - "load_float_weight_task"); + TaskVariantRegistrar registrar(LOAD_WEIGHT_TASK_ID, "load_weight_task"); registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "load_float_weight_task"); + Runtime::preregister_task_variant( + registrar, "load_weight_task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant( - registrar); - } - } - { - TaskVariantRegistrar registrar(LOAD_HALF_WEIGHT_TASK_ID, - "load_half_weight_task"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "load_half_weight_task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( - registrar); - } - } - { - TaskVariantRegistrar registrar(LOAD_QUANT_WEIGHT_TASK_ID, - "load_quant_weight_task"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "load_quant_weight_task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( + runtime->register_task_variant( registrar); } } From f7485151e75bc8244f45920723d40b0fd965503b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:00:54 +0000 Subject: [PATCH 32/37] cleanup --- include/flexflow/flexflow_c.h | 3 + inference/peft/peft.cc | 2 +- inference/python/ff_peft.py | 50 +-- inference/python/peft_demo/INSTRUCTIONS.md | 2 +- inference/python/peft_demo/demo.ipynb | 4 +- inference/python/peft_demo/demo.py | 4 +- inference/utils/download_peft_model.py | 32 +- python/flexflow/core/flexflow_cffi.py | 5 + python/flexflow/serve/serve.py | 459 ++++++++++----------- 
src/c/flexflow_c.cc | 8 + tests/peft_test.sh | 10 +- 11 files changed, 291 insertions(+), 288 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 906cacb920..677f9915cd 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -1029,6 +1029,9 @@ void flexflow_request_manager_set_max_sequence_length( int flexflow_request_manager_get_max_sequence_length( flexflow_request_manager_t handle_); +void flexflow_request_manager_set_max_concurrent_adapters( + flexflow_request_manager_t handle_, int max_concurrent_adapters); + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index da2993187c..4f2d47055a 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -256,7 +256,7 @@ void FlexFlow::top_level_task(Task const *task, LoraOptimizerConfig *optim_config = nullptr; if (enable_peft_finetuning) { // float sgd_learning_rate = 2e-1; - float sgd_learning_rate = 1.0f; + float sgd_learning_rate = 0.001f; optim_config = new LoraSGDOptimizerConfig(sgd_learning_rate); } LoraLinearConfig peft_config_finetuning = diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 35338f5227..0167cecebc 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -41,14 +41,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 2, + "num_gpus": 4, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 10000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 2, + "tensor_parallelism_degree": 4, "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB @@ -102,6 +102,23 @@ def main(): refresh_cache=configs.refresh_cache, output_file=configs.output_file, ) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + enable_peft_finetuning = len(configs.finetuning_dataset) > 0 + llm.compile( + generation_config, + max_requests_per_batch=1 if not enable_peft_finetuning else 2, + max_seq_length=256, + max_tokens_per_batch=128, + max_concurrent_adapters=1 if not enable_peft_finetuning else 2, + enable_peft_finetuning=enable_peft_finetuning, + ) + + llm.start_server() + # Add inference and/or finetuning lora lora_inference_config = None lora_finetuning_config = None @@ -111,18 +128,8 @@ def main(): configs.inference_peft_model_id, base_model_name_or_path=configs.base_model, ) - llm.add_peft(lora_inference_config) + llm.register_peft_adapter(lora_inference_config) if len(configs.finetuning_dataset) > 0: - # lora_finetuning_config = ff.LoraLinearConfig( - # llm.cache_path, - # configs.finetuning_peft_model_id, - # target_modules=["down_proj"], - # rank=16, - # lora_alpha=16, - # trainable=True, - # init_lora_weights=True, - # optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, - # ) lora_finetuning_config = ff.LoraLinearConfig( llm.cache_path, configs.inference_peft_model_id, @@ -136,22 +143,7 @@ def main(): "nesterov": False, }, ) - llm.add_peft(lora_finetuning_config) - - # Compile the LLM for inference and load the weights into memory - generation_config = ff.GenerationConfig( - do_sample=False, temperature=0.9, topp=0.8, topk=1 - ) - enable_peft_finetuning = len(configs.finetuning_dataset) > 0 - llm.compile( 
- generation_config, - enable_peft_finetuning=enable_peft_finetuning, - max_requests_per_batch=1 if not enable_peft_finetuning else 2, - max_seq_length=256, - max_tokens_per_batch=128, - ) - - llm.start_server() + llm.register_peft_adapter(lora_finetuning_config) requests = [] # Serving diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md index 9b2a7a53b2..0f78efdea9 100644 --- a/inference/python/peft_demo/INSTRUCTIONS.md +++ b/inference/python/peft_demo/INSTRUCTIONS.md @@ -13,7 +13,7 @@ * `export HUGGINGFACE_TOKEN="[Your token]"` * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` - * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full"` * Run the demo ``` diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb index d29ad5ad2f..ea2b8417b6 100644 --- a/inference/python/peft_demo/demo.ipynb +++ b/inference/python/peft_demo/demo.ipynb @@ -194,7 +194,7 @@ } ], "source": [ - "args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]\n", + "args = [configs.inference_peft_model_id]\n", "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)" ] }, @@ -1813,7 +1813,7 @@ "configs = SimpleNamespace(**configs_dict)\n", "\n", "\n", - "args = [configs.finetuning_peft_model_id+\"-dolly\", '--base_model_name', configs.base_model]\n", + "args = [configs.finetuning_peft_model_id+\"-dolly\"]\n", "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)\n", "\n", "# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs\n", diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py index 34b15b9a76..b70f3c8966 100644 --- a/inference/python/peft_demo/demo.py +++ b/inference/python/peft_demo/demo.py @@ -98,7 +98,7 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data file.write('') # Download base and peft inference models -args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model] +args = [configs.inference_peft_model_id] # hf_token = input("Please enter your HuggingFace personal access token: ") # subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) subprocess.run(['python', '../../utils/download_peft_model.py'] + args) @@ -206,7 +206,7 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data ) llm.add_peft(lora_inference_config) -args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model] +args = [configs.finetuning_peft_model_id] #hf_token = input("Please enter your HuggingFace personal access token: ") # subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) # subprocess.run(['python', '../../utils/download_peft_model.py'] + args) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 38dd577574..2ee63b10bc 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -1,13 +1,11 @@ #!/usr/bin/env python import flexflow.serve as ff import argparse, os +from peft import PeftConfig def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--base_model_name", type=str, help="Name of the model to download" - ) parser.add_argument( "peft_model_ids", type=str, @@ -48,19 +46,21 @@ def main(args): else: data_types = 
(ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) - for data_type in data_types: - llm = ff.LLM( - args.base_model_name, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - for peft_model_id in args.peft_model_ids: - lora_config = ff.LoraLinearConfig(llm.cache_path, peft_model_id) - llm.add_peft(lora_config) - llm.download_hf_weights_if_needed() - llm.download_hf_config() - llm.download_hf_tokenizer_if_needed() + for peft_model_id in args.peft_model_ids: + hf_config = PeftConfig.from_pretrained(peft_model_id) + for data_type in data_types: + llm = ff.LLM( + hf_config.base_model_name_or_path, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + # Download base model config, weights and tokenizer + llm.download_hf_config() + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + # Download PEFT adapter + llm.download_peft_adapter_if_needed(peft_model_id) if __name__ == "__main__": diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 4ff8348f46..02eff0ca76 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1633,6 +1633,11 @@ def set_max_sequence_length(self, max_length): def get_max_sequence_length(self): return ffc().flexflow_request_manager_get_max_sequence_length(self.handle) + + def set_max_concurrent_adapters(self, max_adapters): + return ffc().flexflow_request_manager_set_max_concurrent_adapters( + self.handle, max_adapters + ) def set_enable_peft_finetuning(self, enable_peft_finetuning): return ffc().flexflow_request_manager_set_enable_peft_finetuning( diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 9d3fa19706..7932441c81 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -31,9 +31,17 @@ from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi import torch, shutil, hashlib, json, gc -from typing import Union, List +from typing import Union, List, Tuple +from safetensors import safe_open from huggingface_hub import snapshot_download +from enum import Enum + + +class CachedResourceType(Enum): + TOKENIZER = "tokenizer" + WEIGHTS = "weights" + class _SupportedModels: def __init__( @@ -104,14 +112,14 @@ def __init__( self.output_file = output_file self.rm = None self.pefts = {} - self.tokenizer=None + self.tokenizer = None def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() - def add_peft(self, lora_config: LoraLinearConfig): + def register_peft_adapter(self, lora_config: LoraLinearConfig): """Add a PEFT adapter to the LLM""" if lora_config is None: raise ValueError("lora_config cannot be None") @@ -145,9 +153,12 @@ def add_peft(self, lora_config: LoraLinearConfig): f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" ) + lora_config.ff_compile() + self.pefts[lora_config] = { "peft_config": peft_config, "peft_type": peft_config.peft_type, + "ff_peft_model_id": self.model.ffmodel.register_peft_adapter(lora_config), } def get_ff_peft_id(self, lora_config: LoraLinearConfig) -> PEFTModelID: @@ -175,34 +186,33 @@ def download_hf_config(self): os.makedirs(config_dir, exist_ok=True) print(f"Creating directory {config_dir} (if it doesn't exist)...") print(f"Saving {self.model_name} configs to file {config_path}...") - self.hf_config.to_json_file(config_path) - - # 
Save PEFT configs if the LLM has any registered PEFTs - for ff_peft_config, peft_dict in self.pefts.items(): - peft_config = peft_dict["peft_config"] - peft_model_id = ff_peft_config.peft_model_id - peft_config_dir = os.path.join( - os.path.expanduser(self.cache_path), "configs", peft_model_id.lower() - ) - os.makedirs(peft_config_dir, exist_ok=True) - peft_config_path = os.path.join(peft_config_dir, "config.json") - print(f"Saving {peft_model_id} configs to file {peft_config_path}...") - with open(peft_config_path, "w") as json_file: - - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return super().default(obj) - - json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - def __get_revision_hashes(self, model_name: str, folder: str): + # self.hf_config.to_json_file(config_path) + src_folder = snapshot_download( + repo_id=self.model_name, allow_patterns="config.json" + ) + src_path = os.path.join(src_folder, "config.json") + if os.path.exists(src_path): + shutil.copy(src_path, config_path) + + def __get_revision_hashes( + self, model_name: str, folder: str + ) -> Tuple[Union[str, None], str, str]: + """Return the commit hash of the object (weight, tokenizer, etc) cached by FlexFlow and the latest commit hash of the object from HuggingFace (or other source) + + Args: + model_name (str): Name of the model cached by FlexFlow + folder (str): Folder where the cached object is stored + + Returns: + ff_revision: Commit hash of the object cached by FlexFlow + ff_revision_filepath: Path to the file containing the commit hash of the object cached by FlexFlow + latest_revision: Latest commit hash of the object from HuggingFace (or other source) + """ ff_revision = None - ff_revision_file = os.path.join(folder, "rev_sha.txt") + ff_revision_filepath = os.path.join(folder, "rev_sha.txt") - if os.path.exists(ff_revision_file): - ff_revision = "".join(open(ff_revision_file).read().split()) + if os.path.exists(ff_revision_filepath): + ff_revision = "".join(open(ff_revision_filepath).read().split()) if os.path.exists(model_name) and os.path.isdir(model_name): # Local model @@ -215,16 +225,21 @@ def __get_revision_hashes(self, model_name: str, folder: str): # Remote HuggingFace model hf_api = HfApi() latest_revision = hf_api.model_info(self.model_name).sha - return ff_revision, ff_revision_file, latest_revision + return ff_revision, latest_revision - def download_hf_weights_if_needed(self): - """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. - If not, or if the refresh_cache parameter is set to True, download new weights. + def __get_resource_path( + self, model_name: str, resource_type: CachedResourceType + ) -> str: + """Returns the path to the folder where the model weights or tokenizer files are stored - If any PEFT adapter is registered, perform the same operation for PEFT. 
- """ + Args: + model_name (str): Name of the model + resource_type (CachedResourceType): Whether to get the path to the weights or the tokenizer - def get_weights_path(model_name): + Returns: + str: Path to the folder where the model weights or tokenizer files are stored + """ + if resource_type == CachedResourceType.WEIGHTS: return os.path.join( os.path.expanduser(self.cache_path), "weights", @@ -235,19 +250,56 @@ def get_weights_path(model_name): else "half-precision" ), ) + elif resource_type == CachedResourceType.TOKENIZER: + return os.path.join( + os.path.expanduser(self.cache_path), "tokenizers", model_name.lower() + ) + else: + raise ValueError(f"Invalid resource type {resource_type}") - def refresh_cache_if_needed(model_name): - weights_path = get_weights_path(model_name) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." - ) - if os.path.exists(weights_path): - shutil.rmtree(weights_path) - os.makedirs(weights_path, exist_ok=True) + def __need_cache_refresh( + self, model_name: str, resource_type: CachedResourceType + ) -> bool: + """Check whether the model weights or tokenizer files are available and up to date. + If they need a refresh, create the folder for the resource, save the new commit hash to the rev_sha.txt file, delete any existing files, and return true. - def get_hf_llm(model_name): - return AutoModelForCausalLM.from_pretrained( + Args: + model_name (str): Name of the model to check + resource_type (CachedResourceType): Whether to check the weights or the tokenizer + + Returns: + bool: True if the weights or tokenizer need a refresh, False otherwise + """ + need_refresh = False + resource_path = self.__get_resource_path(model_name, resource_type) + if self.refresh_cache or not os.path.exists(resource_path): + need_refresh = True + else: + ff_revision, latest_revision = self.__get_revision_hashes( + self.model_name, resource_path + ) + if ff_revision != latest_revision: + need_refresh = True + if need_refresh: + print( + f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ..." + ) + if os.path.exists(resource_path): + shutil.rmtree(resource_path) + os.makedirs(resource_path, exist_ok=True) + ff_revision_file = os.path.join(resource_path, "rev_sha.txt") + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + return need_refresh + + def download_hf_weights_if_needed(self) -> None: + """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights and convert them. + """ + + # TODO: edit this to download the weights using snapshot_download and convert them to FlexFlow format without loading them to GPU + def download_and_convert_llm_weights(model_name): + hf_model = AutoModelForCausalLM.from_pretrained( model_name, trust_remote_code=True, torch_dtype=( @@ -256,73 +308,26 @@ def get_hf_llm(model_name): else torch.float16 ), ) - - def download_llm_weights(): - refresh_cache_if_needed(self.model_name) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, self.weights_path + # Convert the model to FlexFlow format + weights_path = self.__get_resource_path( + model_name, CachedResourceType.WEIGHTS ) - if ff_revision != latest_revision: - print( - f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." 
- ) - hf_model = get_hf_llm(self.model_name) - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print(f"Done converting the weights for model {self.model_name}") - # Deallocate hf model - del hf_model - gc.collect() - torch.cuda.empty_cache() - - def convert_peft_model(hf_peft_model, peft_type, weights_path): - for name, params in hf_peft_model.named_parameters(): - if peft_type.lower() in name: - name = name.replace("base_model.model.model.", "").replace( - ".default", "" - ) - name = self.model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - - def download_peft_weights(): - for ff_peft_config, peft_dict in self.pefts.items(): - if not ff_peft_config.init_lora_weights: - peft_config = peft_dict["peft_config"] - peft_type = peft_dict["peft_type"] - peft_model_id = ff_peft_config.peft_model_id - - weights_path = get_weights_path(peft_model_id) - refresh_cache_if_needed(peft_model_id) - ff_revision, ff_revision_file, latest_revision = ( - self.__get_revision_hashes(peft_model_id, weights_path) - ) - - if ff_revision != latest_revision: - print( - f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." - ) - hf_model = get_hf_llm(peft_model_id) - hf_peft_model = PeftModel.from_pretrained( - hf_model, peft_model_id, config=peft_config - ) - # Convert the model to FlexFlow format - convert_peft_model(hf_peft_model, peft_type, weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print(f"Done converting the weights for model {peft_model_id}") - # Deallocate hf model - del hf_peft_model - del hf_model - gc.collect() - torch.cuda.empty_cache() - - self.weights_path = get_weights_path(self.model_name) - download_llm_weights() - download_peft_weights() + self.model_class.convert_hf_model(hf_model, weights_path) + # Save new revision hash to file + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + need_refresh = self.__need_cache_refresh( + self.model_name, CachedResourceType.WEIGHTS + ) + if need_refresh: + print( + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." + ) + download_and_convert_llm_weights(self.model_name) def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -331,25 +336,10 @@ def download_hf_tokenizer_if_needed(self): print("Loading tokenizer...") # Use local cache, or download new version - self.tokenizer_path = os.path.join( - os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower() + need_refresh = self.__need_cache_refresh( + self.model_name, CachedResourceType.TOKENIZER ) - if self.refresh_cache: - print( - f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." 
- ) - if os.path.exists(self.tokenizer_path): - shutil.rmtree(self.tokenizer_path) - if not os.path.exists(self.tokenizer_path): - print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") - os.makedirs(self.tokenizer_path, exist_ok=True) - - # Get local revision SHA, check if it matches latest one on huggingface - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, self.tokenizer_path - ) - - if ff_revision != latest_revision: + if need_refresh: print( f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." ) @@ -367,15 +357,76 @@ def download_hf_tokenizer_if_needed(self): hf_tokenizer_path = snapshot_download( repo_id=self.model_name, allow_patterns=target_tokenizer_files ) + tokenizer_path = self.__get_resource_path( + self.model_name, CachedResourceType.TOKENIZER + ) for file in target_tokenizer_files: src_path = os.path.join(hf_tokenizer_path, file) - dst_path = os.path.join(self.tokenizer_path, file) + dst_path = os.path.join(tokenizer_path, file) if os.path.exists(src_path): shutil.copy(src_path, dst_path) print("Done updating HF tokenizer.") - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) + + def download_peft_adapter_if_needed(self, hf_peft_model_id: str): + """Check in the folder specified by the cache_path whether the PEFT model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights and convert them. + """ + + def download_and_convert_peft_model(hf_peft_model_id: str): + if ( + self.data_type != DataType.DT_FLOAT + and self.data_type != DataType.DT_HALF + ): + raise ValueError( + "data_type must be either DataType.DT_FLOAT or DataType.DT_HALF" + ) + + # Save peft config to file + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", hf_peft_model_id.lower() + ) + dst_path = os.path.join(peft_config_dir, "config.json") + os.makedirs(peft_config_dir, exist_ok=True) + print(f"Saving {hf_peft_model_id} configs to file {dst_path}...") + config_path = snapshot_download( + repo_id=hf_peft_model_id, allow_patterns="adapter_config.json" + ) + src_path = os.path.join(config_path, "adapter_config.json") + if os.path.exists(src_path): + shutil.copy(src_path, dst_path) + + # Save peft weights to file + adapter_path = snapshot_download( + repo_id=hf_peft_model_id, allow_patterns="adapter_model.safetensors" + ) + weights_path = self.__get_resource_path( + hf_peft_model_id.lower(), CachedResourceType.WEIGHTS + ) + with safe_open(adapter_path, framework="pt", device="cpu") as f: + for tensor_name in f.keys(): + tensor = f.get_tensor(tensor_name) + if self.data_type == DataType.DT_HALF: + tensor = tensor.half() + else: + tensor = tensor.float() + tensor_name = tensor_name.replace( + "base_model.model.model.", "" + ).replace(".default", "") + print(tensor_name) + + tensor_name = self.model_class.convert_hf_weight_name(tensor_name) + tensor.detach().cpu().numpy().tofile( + f"{weights_path}/{tensor_name}" + ) + + need_refresh = self.__need_cache_refresh( + hf_peft_model_id, CachedResourceType.WEIGHTS + ) + if need_refresh: + print( + f"'{hf_peft_model_id}' local model weights need updating! Downloading/converting new weights now..." 
+ ) + download_and_convert_peft_model(hf_peft_model_id) def compile( self, @@ -383,10 +434,8 @@ def compile( max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - model_specific_data_parallelism_degree: int = None, - model_specific_tensor_parallelism_degree: int = None, - model_specific_pipeline_parallelism_degree: int = None, ssms: list = [], ): """Compile the LLM for inference and load the weights into memory @@ -399,14 +448,10 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1 + :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_data_parallelism_degree: int, optional - :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_tensor_parallelism_degree: int, optional - :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the LLM a different pipeline parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_pipeline_parallelism_degree: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ @@ -422,20 +467,6 @@ def compile( assert type(self) == LLM mode = InferenceMode.INC_DECODING_MODE - # Apply model-specific parallelism degrees, if needed - if model_specific_data_parallelism_degree: - self.ffconfig.data_parallelism_degree = ( - model_specific_data_parallelism_degree - ) - if model_specific_tensor_parallelism_degree: - self.ffconfig.tensor_parallelism_degree = ( - model_specific_tensor_parallelism_degree - ) - if model_specific_pipeline_parallelism_degree: - self.ffconfig.pipeline_parallelism_degree = ( - model_specific_pipeline_parallelism_degree - ) - self.max_seq_length = max_seq_length # Create request manager and set serving configuration @@ -443,6 +474,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_max_concurrent_adapters(max_concurrent_adapters) self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate the relevant model @@ -473,8 +505,11 @@ def compile( else 20 ) + weights_path = self.__get_resource_path( + self.model_name, CachedResourceType.WEIGHTS + ) self.fileloader = FileDataLoader( - self.weights_path, + weights_path, model_configs.num_attention_heads, model_configs.num_key_value_heads, model_configs.hidden_size, @@ -498,21 +533,17 @@ def compile( eos_token_id = [eos_token_id] elif type(eos_token_id) != list: raise ValueError("eos_token_id must be an integer or a list of integers") + tokenizer_path = self.__get_resource_path( + self.model_name, CachedResourceType.TOKENIZER + ) 
self.rm.register_tokenizer( - self.model_type, bos_token_id, eos_token_id, self.tokenizer_path + self.model_type, bos_token_id, eos_token_id, tokenizer_path ) self.rm.register_output_filepath(self.output_file) for ssm in self.ssms: self.rm.register_ssm_model(ssm.model.ffmodel) - # Add PEFT layer if registered - for ff_peft_config, peft_dict in self.pefts.items(): - ff_peft_config.ff_compile() - ff_peft_model_id = self.model.ffmodel.register_peft_adapter(ff_peft_config) - peft_dict["ff_peft_model_id"] = ff_peft_model_id - - # start background server if (mode == InferenceMode.TREE_VERIFY_MODE) or ( mode == InferenceMode.INC_DECODING_MODE @@ -528,7 +559,7 @@ def _generate(self, requests: List[Request]) -> List[GenerationResult]: if req.req_type == RequestType.REQ_INFERENCE: # check max_length and max_new_tokens parameters if req.max_length == -1 and req.max_new_tokens == -1: - req.max_length = self.max_seq_length -1 + req.max_length = self.max_seq_length - 1 elif req.max_length != -1 and req.max_new_tokens != -1: warnings.warn( f"Both `max_new_tokens` (={req.max_new_tokens}) and `max_length`(={req.max_length}) seem to have been set. `max_new_tokens` will take precedence." @@ -547,7 +578,7 @@ def _generate(self, requests: List[Request]) -> List[GenerationResult]: f"max_new_tokens ({req.max_new_tokens}) is not allowed for finetuning requests." ) if req.max_length == -1: - req.max_length = self.max_seq_length -1 + req.max_length = self.max_seq_length - 1 if req.max_length >= self.max_seq_length: raise ValueError( f"max_length ({req.max_length}) exceeds the maximum sequence length ({self.max_seq_length})" @@ -564,20 +595,30 @@ def __chat2prompt(self, messages: List[dict]) -> str: """ # ensure that each element is a dictionary, containing the "role" and "content" keys for message in messages: - if type(message) != dict or "role" not in message or "content" not in message: + if ( + type(message) != dict + or "role" not in message + or "content" not in message + ): raise ValueError( "Each element in the list must be a dictionary with the keys 'role' and 'content'" ) if self.tokenizer is None: self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) if self.tokenizer.chat_template is None: - raise ValueError(f"Model {self.model_name} does not support chat completion") - return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - - def __output2chat_response(self, requests: List[Request], outputs: List[GenerationResult]) -> List[GenerationResult]: - assert(len(requests) == len(outputs)) + raise ValueError( + f"Model {self.model_name} does not support chat completion" + ) + return self.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + def __output2chat_response( + self, requests: List[Request], outputs: List[GenerationResult] + ) -> List[GenerationResult]: + assert len(requests) == len(outputs) for i in range(len(outputs)): - outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt):] + outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt) :] return outputs def generate( @@ -635,7 +676,9 @@ def generate( outputs = self._generate([request]) return self.__output2chat_response([request], outputs) elif type(requests_or_prompts[0]) == list: - prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts] + prompts = [ + self.__chat2prompt(messages) for messages in requests_or_prompts + ] requests = [ Request( req_type=RequestType.REQ_INFERENCE, @@ -652,7 +695,9 @@ def 
generate( print(requests_or_prompts) return self._generate(requests_or_prompts) else: - assert False, "Please pass a string, list of strings, Request, or list of Requests" + assert ( + False + ), "Please pass a string, list of strings, Request, or list of Requests" def start_server(self): self.rm.start_server(self.model.ffmodel) @@ -695,10 +740,8 @@ def compile( max_requests_per_batch: int = 16, max_seq_length: int = 256, max_tokens_per_batch: int = 2048, + max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - model_specific_data_parallelism_degree: int = 1, - model_specific_tensor_parallelism_degree: int = 1, - model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory @@ -710,14 +753,10 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 2048 :type max_tokens_per_batch: int, optional + :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1 + :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 - :type model_specific_data_parallelism_degree: int, optional - :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults to 1 - :type model_specific_tensor_parallelism_degree: int, optional - :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the SSM a different pipeline parallelism degree than the default one, defaults to 1 - :type model_specific_pipeline_parallelism_degree: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ @@ -726,51 +765,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + max_concurrent_adapters, enable_peft_finetuning, - model_specific_data_parallelism_degree, - model_specific_tensor_parallelism_degree, - model_specific_pipeline_parallelism_degree, ssms, ) - -from safetensors import safe_open -from huggingface_hub import hf_hub_download -def download_and_convert_peft_model(peft_model_id: str, data_type: DataType = DataType.DT_HALF, cache_path: str = "", refresh_cache: bool = False): - if data_type != DataType.DT_FLOAT and data_type != DataType.DT_HALF: - raise ValueError("data_type must be either DataType.DT_FLOAT or DataType.DT_HALF") - adapter_path = hf_hub_download(repo_id=peft_model_id, filename="adapter_model.safetensors") - peft_config = PeftConfig.from_pretrained(peft_model_id) - base_model_name_or_path = peft_config.base_model_name_or_path - llm = LLM(base_model_name_or_path, data_type, cache_path, refresh_cache) - - # Save peft config to file - peft_config_dir = os.path.join( - os.path.expanduser(llm.cache_path), "configs", peft_model_id.lower() - ) - os.makedirs(peft_config_dir, exist_ok=True) - peft_config_path = os.path.join(peft_config_dir, "config.json") - print(f"Saving {peft_model_id} configs to file {peft_config_path}...") - with open(peft_config_path, "w") as json_file: - - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - 
return list(obj) - return super().default(obj) - - json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - # Save peft weights to file - with safe_open(adapter_path, framework="pt", device="cpu") as f: - for tensor_name in f.keys(): - tensor = f.get_tensor(tensor_name) - if data_type == DataType.DT_HALF: - tensor = tensor.half() - else: - tensor = tensor.float() - tensor_name = tensor_name.replace("base_model.model.model.", "").replace(".default", "") - print(tensor_name) - - tensor_name = llm.model_class.convert_hf_weight_name(tensor_name) - tensor.detach().cpu().numpy().tofile(f"{llm.weights_path}/{tensor_name}") - diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 4094fb7b44..e16b0e87bd 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2785,6 +2785,14 @@ int flexflow_request_manager_get_max_sequence_length( return handle->get_max_sequence_length(); } +void flexflow_request_manager_set_max_concurrent_adapters( + flexflow_request_manager_t handle_, int max_concurrent_adapters) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_concurrent_adapters(max_concurrent_adapters); + DEBUG_PRINT("[RequestManager] set max_concurrent_adapters %d", + max_concurrent_adapters); +} + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 6152844f5e..e497d4224e 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -31,16 +31,16 @@ mkdir -p ./inference/output export LEGION_BACKTRACE=1 # Download test model -python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m +python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora # Run PEFT in Huggingface to get ground truth tensors -python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision -lr 1.0 +python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision -lr 0.001 # Python test echo "Python test" -# python ./inference/python/ff_peft.py +python ./inference/python/ff_peft.py # Check alignment -# python ./tests/peft/peft_alignment_test.py -tp 2 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 0.001 # C++ test echo "C++ test" @@ -55,7 +55,7 @@ echo "C++ test" --use-full-precision \ --inference-debugging # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 4 -lr 1.0 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 0.001 # Print succeess message echo "" From 266a1edd990d100b59bfc618c00b200d6b00d857 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:06:25 +0000 Subject: [PATCH 33/37] load weights faster in inference test --- tests/inference/python_test_configs/generate_configs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 4f7929e2db..637198f6ff 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -4,12 +4,12 @@ # Base configs dictionaries ff_init_configs = { # required parameters - "num_gpus": 4, + "num_gpus": 8, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters - "num_cpus": 4, - "legion_utility_processors": 4, + "num_cpus": 
8, + "legion_utility_processors": 8, "data_parallelism_degree": 1, "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, From d771f6bb1b767dd87ac1836d682a8d8eddeea7bc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:10:47 +0000 Subject: [PATCH 34/37] fix --- tests/inference/python_test_configs/generate_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 637198f6ff..2d6f115542 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -4,7 +4,7 @@ # Base configs dictionaries ff_init_configs = { # required parameters - "num_gpus": 8, + "num_gpus": 4, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters From fc626c67a6a88e29b7eb36653ea63c523b549857 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:38:25 +0000 Subject: [PATCH 35/37] cleanup and fixes --- inference/models/opt.cc | 3 --- python/flexflow/serve/serve.py | 3 +++ src/runtime/model.cc | 26 ++++++++++++++----- .../python_test_configs/generate_configs.py | 7 +++-- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 03bb6600de..cb3d5290cf 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -243,9 +243,6 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers." + std::to_string(i) + ".fc2").c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("fc2"), std::string("layers." + - // std::to_string(i) + ".fc2.lora").c_str()); } // final diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 7932441c81..498fb4b616 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -463,6 +463,9 @@ def compile( mode = InferenceMode.TREE_VERIFY_MODE elif type(self) == SSM: mode = InferenceMode.BEAM_SEARCH_MODE + self.ffconfig.data_parallelism_degree = 1 + self.ffconfig.tensor_parallelism_degree = 1 + self.ffconfig.pipeline_parallelism_degree = 1 else: assert type(self) == LLM mode = InferenceMode.INC_DECODING_MODE diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ca947039d0..2a76415818 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3420,13 +3420,25 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ((l->op_type == OP_LINEAR && - std::string(l->name).find("attn.o_proj") != std::string::npos) || - is_mlp_block(layer_idx) || - (l->op_type == OP_LINEAR && - std::string(l->name).find("mlp.down_proj") != std::string::npos))) { + if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && l->op_type == OP_LINEAR && + ( /*llama/mpt attention*/ + (std::string(l->name).find("attn.o_proj") != std::string::npos) || + /*opt/starcoder attention*/ + (std::string(l->name).find("self_attn.o_proj") != std::string::npos) || + /*falcon attention*/ + (std::string(l->name).find("self_attention.o_proj") != std::string::npos) || + /*llama mlp*/ + (std::string(l->name).find("mlp.down_proj") != std::string::npos) || + /*opt mlp*/ + (std::string(l->name).find("fc2") != std::string::npos) || + /*falcon mlp*/ + 
(std::string(l->name).find("mlp.dense_4h_to_h") != std::string::npos) || + /*mpt mlp*/ + (std::string(l->name).find("ffn.down_proj") != std::string::npos) || + /*starcoder mlp*/ + (std::string(l->name).find("mlp.c_proj") != std::string::npos) + ) + ) { return true; } return false; diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 2d6f115542..afb7ffb9a7 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -62,15 +62,14 @@ # starcoder_models = ["bigcode/starcoderbase-7b",] parallelism_settings = [(1, 4), (2, 2), (4, 1)] -# The paths below should be with respect to the folder from which the tests are launched (FF_HOME/tests/inference) -prompt_file = "../../inference/prompt/test.json" -output_folder = "../../inference/output" - # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) os.chdir(dname) +prompt_file = os.path.abspath("../../../inference/prompt/test.json") +output_folder = os.path.abspath("../../../inference/output") + # Generate incremental decoding configs all_models = llama_models + opt_models + falcon_models + mpt_models From ab5aa4bb638aad62d6593a512a450f8f806a446e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:42:16 +0000 Subject: [PATCH 36/37] linting --- src/runtime/model.cc | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2a76415818..2a95caf6cb 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3420,25 +3420,25 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; - if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && l->op_type == OP_LINEAR && - ( /*llama/mpt attention*/ - (std::string(l->name).find("attn.o_proj") != std::string::npos) || - /*opt/starcoder attention*/ - (std::string(l->name).find("self_attn.o_proj") != std::string::npos) || - /*falcon attention*/ - (std::string(l->name).find("self_attention.o_proj") != std::string::npos) || - /*llama mlp*/ - (std::string(l->name).find("mlp.down_proj") != std::string::npos) || - /*opt mlp*/ - (std::string(l->name).find("fc2") != std::string::npos) || - /*falcon mlp*/ - (std::string(l->name).find("mlp.dense_4h_to_h") != std::string::npos) || - /*mpt mlp*/ - (std::string(l->name).find("ffn.down_proj") != std::string::npos) || - /*starcoder mlp*/ - (std::string(l->name).find("mlp.c_proj") != std::string::npos) - ) - ) { + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && l->op_type == OP_LINEAR && + (/*llama/mpt attention*/ + (std::string(l->name).find("attn.o_proj") != std::string::npos) || + /*opt/starcoder attention*/ + (std::string(l->name).find("self_attn.o_proj") != std::string::npos) || + /*falcon attention*/ + (std::string(l->name).find("self_attention.o_proj") != + std::string::npos) || + /*llama mlp*/ + (std::string(l->name).find("mlp.down_proj") != std::string::npos) || + /*opt mlp*/ + (std::string(l->name).find("fc2") != std::string::npos) || + /*falcon mlp*/ + (std::string(l->name).find("mlp.dense_4h_to_h") != std::string::npos) || + /*mpt mlp*/ + (std::string(l->name).find("ffn.down_proj") != std::string::npos) || + /*starcoder mlp*/ + 
(std::string(l->name).find("mlp.c_proj") != std::string::npos))) { return true; } return false; From 7d99cf777f0bcf15e00ee2b59b849fb3771dc61d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 11 Nov 2024 02:11:14 +0000 Subject: [PATCH 37/37] fix --- python/flexflow/serve/serve.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 498fb4b616..c2804b6966 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -270,27 +270,20 @@ def __need_cache_refresh( Returns: bool: True if the weights or tokenizer need a refresh, False otherwise """ - need_refresh = False resource_path = self.__get_resource_path(model_name, resource_type) - if self.refresh_cache or not os.path.exists(resource_path): - need_refresh = True - else: - ff_revision, latest_revision = self.__get_revision_hashes( - self.model_name, resource_path - ) - if ff_revision != latest_revision: - need_refresh = True - if need_refresh: + ff_revision, latest_revision = self.__get_revision_hashes(self.model_name, resource_path) + if self.refresh_cache or not os.path.exists(resource_path) or ff_revision != latest_revision: print( f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ..." ) if os.path.exists(resource_path): shutil.rmtree(resource_path) - os.makedirs(resource_path, exist_ok=True) - ff_revision_file = os.path.join(resource_path, "rev_sha.txt") - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - return need_refresh + os.makedirs(resource_path, exist_ok=True) + ff_revision_file = os.path.join(resource_path, "rev_sha.txt") + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + return True + return False def download_hf_weights_if_needed(self) -> None: """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date.
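
Note on the chat-completion path: the serve.py hunks above let generate() accept a list of {"role", "content"} message dicts (or a list of such lists); __chat2prompt() applies the tokenizer's chat template and __output2chat_response() strips the echoed prompt from each GenerationResult. Below is a minimal usage sketch of that path, not part of the patch: the init values, model name, and messages are placeholders, and it assumes the flexflow.serve entry points (ff.init, LLM, compile, start_server, generate, stop_server) keep the signatures shown in this series.

    # Minimal sketch (not part of the patch) of the chat-completion path in serve.py.
    # Config values, model name, and messages are illustrative placeholders.
    import flexflow.serve as ff

    # Same dict-style init used by the Python entrypoints; values are placeholders.
    ff.init(
        {
            "num_gpus": 4,
            "memory_per_gpu": 20000,
            "zero_copy_memory_per_node": 40000,
            "tensor_parallelism_degree": 4,
            "pipeline_parallelism_degree": 1,
        }
    )

    llm = ff.LLM("meta-llama/Meta-Llama-3.1-8B")
    llm.compile(
        max_requests_per_batch=16,
        max_seq_length=256,
        max_tokens_per_batch=128,
        max_concurrent_adapters=1,  # parameter introduced by this series
    )
    llm.start_server()

    # A single conversation: each element must be a dict with "role" and "content",
    # exactly as __chat2prompt() validates.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Give a one-line summary of FlexFlow Serve."},
    ]

    # The chat template is applied internally, and the prompt prefix is removed
    # from the returned output_text by __output2chat_response().
    results = llm.generate(messages)
    print(results[0].output_text)

    llm.stop_server()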
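
Note on the model.cc change: FFModel::need_to_add_allreduce() is generalized from the old is_mlp_block() check to explicit layer-name matching, so the tensor-parallel AllReduce lands after the attention output projection and the MLP down-projection for every supported model family (LLaMA, OPT, Falcon, MPT, StarCoder). The sketch below only restates that predicate for readability; the suffix list is copied from the diff and is illustrative, not an exhaustive or authoritative API.

    # Hedged restatement (not part of the patch) of the layer-name predicate in
    # FFModel::need_to_add_allreduce(); suffixes are copied from the diff above.
    ALLREDUCE_LINEAR_SUFFIXES = [
        "attn.o_proj",            # llama / mpt attention output projection
        "self_attn.o_proj",       # opt / starcoder attention output projection
        "self_attention.o_proj",  # falcon attention output projection
        "mlp.down_proj",          # llama MLP down-projection
        "fc2",                    # opt MLP second linear
        "mlp.dense_4h_to_h",      # falcon MLP down-projection
        "ffn.down_proj",          # mpt MLP down-projection
        "mlp.c_proj",             # starcoder MLP down-projection
    ]


    def needs_allreduce(layer_name: str, is_linear: bool,
                        tensor_parallelism_degree: int, inference_mode: bool) -> bool:
        """AllReduce is added only for inference, only under tensor parallelism,
        and only after the row-parallel linear that closes an attention or MLP block."""
        return (
            inference_mode
            and tensor_parallelism_degree > 1
            and is_linear
            and any(suffix in layer_name for suffix in ALLREDUCE_LINEAR_SUFFIXES)
        )


    # Example: under TP=4, the OPT feed-forward output layer triggers an AllReduce,
    # while the first feed-forward linear does not.
    assert needs_allreduce("layers.0.fc2", True, 4, True)
    assert not needs_allreduce("layers.0.fc1", True, 4, True)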