From 470a40fcb974050ee656571fb15373908e76fb51 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 02:19:19 +0000 Subject: [PATCH 01/37] init --- docker/flexflow-environment/Dockerfile | 1 + docker/run.sh | 7 +- inference/python/entrypoint/fastapi_incr.py | 24 ++-- inference/python/streamlit/README.md | 0 inference/python/streamlit/app.py | 122 ++++++++++++++++++++ python/flexflow/core/flexflow_cffi.py | 4 +- python/flexflow/serve/serve.py | 6 +- 7 files changed, 149 insertions(+), 15 deletions(-) create mode 100644 inference/python/streamlit/README.md create mode 100644 inference/python/streamlit/app.py diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index ee13a07375..4f41482ee5 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -112,6 +112,7 @@ RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook # PEFT-related RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft +RUN pip3 install streamlit # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/docker/run.sh b/docker/run.sh index cdf9383052..3e7417a3cc 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -17,6 +17,11 @@ hip_version=${hip_version:-"empty"} ATTACH_GPUS=${ATTACH_GPUS:-true} gpu_arg="" if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi +FORWARD_STREAMLIT_PORT=${FORWARD_STREAMLIT_PORT:-true} +port_forward_arg="" +if $FORWARD_STREAMLIT_PORT ; then + port_forward_arg+="-p 8501:8501" +fi # Amount of shared memory to give the Docker container access to @@ -120,4 +125,4 @@ if [ -f "$hf_token_path" ]; then hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${port_forward_arg}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" diff --git a/inference/python/entrypoint/fastapi_incr.py b/inference/python/entrypoint/fastapi_incr.py index 34f61739fb..f2830e6e5e 100644 --- a/inference/python/entrypoint/fastapi_incr.py +++ b/inference/python/entrypoint/fastapi_incr.py @@ -60,28 +60,32 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 2, - "memory_per_gpu": 14000, + "num_gpus": 4, + "memory_per_gpu": 20000, "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 1, - "pipeline_parallelism_degree": 2, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } llm_configs = { # required parameters - "llm_model": "tiiuae/falcon-7b", + "llm_model": "meta-llama/Meta-Llama-3.1-8B", # optional parameters - "cache_path": "", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": False, "prompt": "", @@ -102,7 +106,9 @@ async def startup_event(): configs = SimpleNamespace(**configs_dict) ff.init(configs_dict) - 
ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) llm = ff.LLM( configs.llm_model, data_type=ff_data_type, @@ -117,7 +123,7 @@ async def startup_event(): llm.compile( generation_config, max_requests_per_batch=1, - max_seq_length=256, + max_seq_length=2048, max_tokens_per_batch=64, ) llm.start_server() diff --git a/inference/python/streamlit/README.md b/inference/python/streamlit/README.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py new file mode 100644 index 0000000000..564a9b6c5a --- /dev/null +++ b/inference/python/streamlit/app.py @@ -0,0 +1,122 @@ +import streamlit as st +import requests +import os +from huggingface_hub import model_info + + +# App title +st.set_page_config(page_title="🦙💬 FlexLLM Llama Server") + +# FastAPI server URL +FASTAPI_URL = "http://localhost:8000/generate/" # Adjust the port if necessary + +# Initialize session state variables +if 'added_adapters' not in st.session_state: + st.session_state.added_adapters = [] + +def check_model_availability(model_name): + try: + info = model_info(model_name) + return True + except Exception: + return False + +# Store LLM generated responses +if "messages" not in st.session_state.keys(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + +# Display or clear chat messages +for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.write(message["content"]) + +def clear_chat_history(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + + +# App title and description +with st.sidebar: + st.title('🦙💬 FlexLLM Llama Server') + # st.success('Using local FastAPI server', icon='✅') + st.sidebar.button('Clear Chat History', on_click=clear_chat_history) + + st.subheader('Generation parameters') + max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8) + # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') + decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') + temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + top_p = st.sidebar.slider('top_p', min_value=0.01, max_value=1.0, value=0.9, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + + # lora_adapter = st.sidebar.text_input('Lora adapter', placeholder='None') + st.subheader("LoRA Adapters (optional)") + # Text input for PEFT model ID + peft_id = st.text_input("Add a LoRA Adapter", placeholder="Enter the Huggingface PEFT model ID") + # Button to load the adapter + if st.button("Load Adapter"): + if peft_id: + with st.spinner("Checking PEFT availability..."): + is_available = check_model_availability(peft_id) + if is_available: + if peft_id not in st.session_state.added_adapters: + st.session_state.added_adapters.append(peft_id) + st.success(f"Successfully added PEFT: {peft_id}") + else: + st.warning(f"PEFT {peft_id} is already in the list.") + else: + st.error(f"PEFT {peft_id} is not available on Hugging Face. 
Please check the ID and try again.") + else: + st.warning("Please enter a PEFT Model ID.") + # Button to remove all adapters + if st.button("Remove All Adapters"): + st.session_state.added_adapters = [] + st.success("All adapters have been removed.") + # Display the list of added adapters + st.markdown("**Added Adapters:**") + if st.session_state.added_adapters: + for adapter in st.session_state.added_adapters: + st.write(f"- {adapter}") + else: + st.write("No adapters added yet.") + + # st.markdown('📖 Learn how to build this app in this [blog](https://blog.streamlit.io/how-to-build-a-llama-2-chatbot/)!') + + + +# Function for generating LLaMA2 response +def generate_llama2_response(prompt_input): + string_dialogue = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'." + for dict_message in st.session_state.messages: + if dict_message["role"] == "user": + string_dialogue += "User: " + dict_message["content"] + "\n\n" + else: + string_dialogue += "Assistant: " + dict_message["content"] + "\n\n" + + full_prompt = f"{string_dialogue} {prompt_input} Assistant: " + + # Send request to FastAPI server + response = requests.post(FASTAPI_URL, json={"prompt": full_prompt}) + + if response.status_code == 200: + return response.json()["response"] + else: + return f"Error: {response.status_code} - {response.text}" + +# User-provided prompt +if prompt := st.chat_input(): + st.session_state.messages.append({"role": "user", "content": prompt}) + with st.chat_message("user"): + st.write(prompt) + +# Generate a new response if last message is not from assistant +if st.session_state.messages[-1]["role"] != "assistant": + with st.chat_message("assistant"): + with st.spinner("Thinking..."): + response = generate_llama2_response(prompt) + placeholder = st.empty() + full_response = '' + for item in response: + full_response += item + placeholder.markdown(full_response) + placeholder.markdown(full_response) + message = {"role": "assistant", "content": full_response} + st.session_state.messages.append(message) \ No newline at end of file diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 7692ccb88f..d065398f87 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2057,7 +2057,7 @@ def __init__( self, req_type: RequestType, prompt: str = None, - max_sequence_length: int = 128, + max_sequence_length: int = 2048, peft_model_id: PEFTModelID = None, dataset_filepath: str = None, max_training_steps: int = 1, @@ -4665,7 +4665,7 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 128): + def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2048): assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] max_num_chars = 5 * (max_sequence_length + 100) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 132c50995b..988789bab4 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -498,7 +498,7 @@ def compile( def generate( self, requests_or_prompts: Union[str, List[str], Request, List[Request]], - max_length: int = 128, + max_length: int = 2048, ): """Generate tokens based on the input prompt(s) @@ -568,7 +568,7 @@ def compile( generation_config: GenerationConfig = GenerationConfig(), max_requests_per_batch: int = 16, 
max_seq_length: int = 256, - max_tokens_per_batch: int = 128, + max_tokens_per_batch: int = 2048, enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, @@ -582,7 +582,7 @@ def compile( :type max_requests_per_batch: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 :type max_seq_length: int, optional - :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 + :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 2048 :type max_tokens_per_batch: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional From 7f23188772c5a32fa0e5586673d9f666f8cd5190 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 03:10:11 +0000 Subject: [PATCH 02/37] update --- inference/python/streamlit/app.py | 204 ++++++++++++++++++------------ 1 file changed, 126 insertions(+), 78 deletions(-) diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py index 564a9b6c5a..c264930e7d 100644 --- a/inference/python/streamlit/app.py +++ b/inference/python/streamlit/app.py @@ -1,19 +1,24 @@ import streamlit as st import requests -import os +import os, json from huggingface_hub import model_info # App title -st.set_page_config(page_title="🦙💬 FlexLLM Llama Server") +st.set_page_config(page_title="🚀💻 FlexLLM Server", layout="wide") # FastAPI server URL FASTAPI_URL = "http://localhost:8000/generate/" # Adjust the port if necessary +FINETUNE_URL = "http://localhost:8000/finetuning" # Initialize session state variables if 'added_adapters' not in st.session_state: st.session_state.added_adapters = [] +# Store LLM generated responses +if "messages" not in st.session_state.keys(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + def check_model_availability(model_name): try: info = model_info(model_name) @@ -21,67 +26,9 @@ def check_model_availability(model_name): except Exception: return False -# Store LLM generated responses -if "messages" not in st.session_state.keys(): - st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] - -# Display or clear chat messages -for message in st.session_state.messages: - with st.chat_message(message["role"]): - st.write(message["content"]) - def clear_chat_history(): st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] - -# App title and description -with st.sidebar: - st.title('🦙💬 FlexLLM Llama Server') - # st.success('Using local FastAPI server', icon='✅') - st.sidebar.button('Clear Chat History', on_click=clear_chat_history) - - st.subheader('Generation parameters') - max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8) - # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') - decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') - temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') - top_p = st.sidebar.slider('top_p', min_value=0.01, max_value=1.0, value=0.9, step=0.01, 
disabled=decoding_method == 'Greedy decoding (default)') - - # lora_adapter = st.sidebar.text_input('Lora adapter', placeholder='None') - st.subheader("LoRA Adapters (optional)") - # Text input for PEFT model ID - peft_id = st.text_input("Add a LoRA Adapter", placeholder="Enter the Huggingface PEFT model ID") - # Button to load the adapter - if st.button("Load Adapter"): - if peft_id: - with st.spinner("Checking PEFT availability..."): - is_available = check_model_availability(peft_id) - if is_available: - if peft_id not in st.session_state.added_adapters: - st.session_state.added_adapters.append(peft_id) - st.success(f"Successfully added PEFT: {peft_id}") - else: - st.warning(f"PEFT {peft_id} is already in the list.") - else: - st.error(f"PEFT {peft_id} is not available on Hugging Face. Please check the ID and try again.") - else: - st.warning("Please enter a PEFT Model ID.") - # Button to remove all adapters - if st.button("Remove All Adapters"): - st.session_state.added_adapters = [] - st.success("All adapters have been removed.") - # Display the list of added adapters - st.markdown("**Added Adapters:**") - if st.session_state.added_adapters: - for adapter in st.session_state.added_adapters: - st.write(f"- {adapter}") - else: - st.write("No adapters added yet.") - - # st.markdown('📖 Learn how to build this app in this [blog](https://blog.streamlit.io/how-to-build-a-llama-2-chatbot/)!') - - - # Function for generating LLaMA2 response def generate_llama2_response(prompt_input): string_dialogue = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'." @@ -101,22 +48,123 @@ def generate_llama2_response(prompt_input): else: return f"Error: {response.status_code} - {response.text}" -# User-provided prompt -if prompt := st.chat_input(): - st.session_state.messages.append({"role": "user", "content": prompt}) - with st.chat_message("user"): - st.write(prompt) - -# Generate a new response if last message is not from assistant -if st.session_state.messages[-1]["role"] != "assistant": - with st.chat_message("assistant"): - with st.spinner("Thinking..."): - response = generate_llama2_response(prompt) - placeholder = st.empty() - full_response = '' - for item in response: - full_response += item +# Sidebar +with st.sidebar: + st.title('🚀 FlexLLM Server') + page = st.radio("Choose a page", ["Chat", "Finetune"]) + if page == "Chat": + st.header('🦙 Llama Chatbot') + # st.success('Using local FastAPI server', icon='✅') + st.sidebar.button('Clear Chat History', on_click=clear_chat_history) + + st.subheader('Generation parameters') + max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8) + # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') + decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') + temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + top_p = st.sidebar.slider('top_p', min_value=0.01, max_value=1.0, value=0.9, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + + # lora_adapter = st.sidebar.text_input('Lora adapter', placeholder='None') + st.subheader("LoRA Adapters (optional)") + # Text input for PEFT model ID + peft_id = st.text_input("Add a LoRA Adapter", placeholder="Enter the Huggingface PEFT model ID") 
+ # Button to load the adapter + if st.button("Load Adapter"): + if peft_id: + with st.spinner("Checking PEFT availability..."): + is_available = check_model_availability(peft_id) + if is_available: + if peft_id not in st.session_state.added_adapters: + st.session_state.added_adapters.append(peft_id) + st.success(f"Successfully added PEFT: {peft_id}") + else: + st.warning(f"PEFT {peft_id} is already in the list.") + else: + st.error(f"PEFT {peft_id} is not available on Hugging Face. Please check the ID and try again.") + else: + st.warning("Please enter a PEFT Model ID.") + # Button to remove all adapters + if st.button("Remove All Adapters"): + st.session_state.added_adapters = [] + st.success("All adapters have been removed.") + # Display the list of added adapters + st.markdown("**Added Adapters:**") + if st.session_state.added_adapters: + for adapter in st.session_state.added_adapters: + st.write(f"- {adapter}") + else: + st.write("No adapters added yet.") + # st.markdown('📖 Learn how to build this app in this [blog](https://blog.streamlit.io/how-to-build-a-llama-2-chatbot/)!') + elif page == "Finetune": + st.header("🏋️‍♂️ LoRA Finetuning") + + # Hugging Face token input + hf_token = st.text_input("Enter your Hugging Face token:", type="password") + + # Dataset selection + dataset_option = st.radio("Choose dataset source:", ["Upload JSON", "Hugging Face Dataset"]) + + if dataset_option == "Upload JSON": + uploaded_file = st.file_uploader("Upload JSON dataset", type="json") + if uploaded_file is not None: + dataset = json.load(uploaded_file) + st.success("Dataset uploaded successfully!") + else: + dataset_name = st.text_input("Enter Hugging Face dataset name:") + + # Start finetuning button + if st.button("Start Finetuning"): + if not hf_token: + st.error("Please enter your Hugging Face token.") + elif dataset_option == "Upload JSON" and uploaded_file is None: + st.error("Please upload a JSON dataset.") + elif dataset_option == "Hugging Face Dataset" and not dataset_name: + st.error("Please enter a Hugging Face dataset name.") + else: + # Prepare the request data + request_data = { + "token": hf_token, + "dataset_source": dataset_option, + } + + if dataset_option == "Upload JSON": + request_data["dataset"] = dataset + else: + request_data["dataset_name"] = dataset_name + + # Send finetuning request to FastAPI server + with st.spinner("Finetuning in progress..."): + response = requests.post(FINETUNE_URL, json=request_data) + + if response.status_code == 200: + st.success("Finetuning completed successfully!") + else: + st.error(f"Finetuning failed. 
Error: {response.status_code} - {response.text}") + +if page == "Chat": + # Display or clear chat messages + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.write(message["content"]) + + # User-provided prompt + if prompt := st.chat_input(): + st.session_state.messages.append({"role": "user", "content": prompt}) + with st.chat_message("user"): + st.write(prompt) + + # Generate a new response if last message is not from assistant + if st.session_state.messages[-1]["role"] != "assistant": + with st.chat_message("assistant"): + with st.spinner("Thinking..."): + response = generate_llama2_response(prompt) + placeholder = st.empty() + full_response = '' + for item in response: + full_response += item + placeholder.markdown(full_response) placeholder.markdown(full_response) - placeholder.markdown(full_response) - message = {"role": "assistant", "content": full_response} - st.session_state.messages.append(message) \ No newline at end of file + message = {"role": "assistant", "content": full_response} + st.session_state.messages.append(message) +elif page == "Finetune": + st.write("Use the sidebar to configure and start finetuning.") \ No newline at end of file From a2d2ac0d5896916808eec81b50bae54099e06663 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 14:15:23 +0000 Subject: [PATCH 03/37] update --- inference/python/streamlit/app.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py index c264930e7d..5ed56148c9 100644 --- a/inference/python/streamlit/app.py +++ b/inference/python/streamlit/app.py @@ -99,7 +99,18 @@ def generate_llama2_response(prompt_input): st.header("🏋️‍♂️ LoRA Finetuning") # Hugging Face token input - hf_token = st.text_input("Enter your Hugging Face token:", type="password") + # hf_token = st.text_input("Enter your Hugging Face token:", type="password") + if 'hf_token' in st.session_state.keys(): + st.success('HF token already provided!', icon='✅') + hf_token = st.session_state.hf_token + print(hf_token) + else: + hf_token = st.text_input('Enter your Hugging Face token:', type='password') + if not (hf_token.startswith('hf_') and len(hf_token)==37): + st.warning('Please enter valid credentials!', icon='⚠️') + else: + st.success('Proceed to finetuning your model!', icon='👉') + st.session_state.hf_token = hf_token # Dataset selection dataset_option = st.radio("Choose dataset source:", ["Upload JSON", "Hugging Face Dataset"]) From f8c90e64ae070cbcb4fee81080f31a00a758284f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 17:18:07 +0000 Subject: [PATCH 04/37] update --- inference/python/streamlit/app.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py index 5ed56148c9..4d8633e167 100644 --- a/inference/python/streamlit/app.py +++ b/inference/python/streamlit/app.py @@ -103,15 +103,17 @@ def generate_llama2_response(prompt_input): if 'hf_token' in st.session_state.keys(): st.success('HF token already provided!', icon='✅') hf_token = st.session_state.hf_token - print(hf_token) else: hf_token = st.text_input('Enter your Hugging Face token:', type='password') if not (hf_token.startswith('hf_') and len(hf_token)==37): - st.warning('Please enter valid credentials!', icon='⚠️') + st.warning('please enter a valid token', icon='⚠️') else: st.success('Proceed to finetuning your model!', icon='👉') 
st.session_state.hf_token = hf_token + # PEFT model name + peft_model_name = st.text_input("Enter the PEFT model name:", help="The name of the PEFT model should start with the username associated with the provided HF token, followed by '/'ß. E.g. 'username/peft-base-uncased'") + # Dataset selection dataset_option = st.radio("Choose dataset source:", ["Upload JSON", "Hugging Face Dataset"]) @@ -123,6 +125,18 @@ def generate_llama2_response(prompt_input): else: dataset_name = st.text_input("Enter Hugging Face dataset name:") + # Finetuning parameters + st.subheader("Finetuning parameters") + lora_rank = st.number_input("LoRA rank", min_value=2, max_value=64, value=16, step=2) + lora_alpha = st.number_input("LoRA alpha", min_value=2, max_value=64, value=16, step=2) + target_modules = st.multiselect("Target modules", ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"], default=["down_proj"]) + learning_rate = st.number_input("Learning rate", min_value=1e-6, max_value=1e-3, value=1e-5, step=1e-6) + optimizer_type = st.selectbox("Optimizer type", ["SGD", "Adam", "AdamW", "Adagrad", "Adadelta", "Adamax", "RMSprop"]) + momentum = st.number_input("Momentum", min_value=0.0, max_value=1.0, value=0.0, step=0.01) + weight_decay = st.number_input("Weight decay", min_value=0.0, max_value=1.0, value=0.0, step=0.01) + nesterov = st.checkbox("Nesterov") + max_steps = st.number_input("Max steps", min_value=1000, max_value=100000, value=10000, step=1000) + # Start finetuning button if st.button("Start Finetuning"): if not hf_token: From 2906e57272ecf8b02e1fac790f9117491b44001b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 26 Sep 2024 13:21:06 +0000 Subject: [PATCH 05/37] update --- inference/python/streamlit/fastapi_incr.py | 203 ++++++++++++++++++ .../inference/huggingface_inference_simple.py | 51 +++++ tests/inference/huggingface_pipeline.py | 33 +++ 3 files changed, 287 insertions(+) create mode 100644 inference/python/streamlit/fastapi_incr.py create mode 100644 tests/inference/huggingface_inference_simple.py create mode 100644 tests/inference/huggingface_pipeline.py diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py new file mode 100644 index 0000000000..0bc20f3b0a --- /dev/null +++ b/inference/python/streamlit/fastapi_incr.py @@ -0,0 +1,203 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Running Instructions: +- To run this FastAPI application, make sure you have FastAPI and Uvicorn installed. +- Save this script as 'fastapi_incr.py'. +- Run the application using the command: `uvicorn fastapi_incr:app --reload --port PORT_NUMBER` +- The server will start on `http://localhost:PORT_NUMBER`. Use this base URL to make API requests. +- Go to `http://localhost:PORT_NUMBER/docs` for API documentation. 
+""" + + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field +import flexflow.serve as ff +import uvicorn +import json, os, argparse +from types import SimpleNamespace +from typing import Optional, List +import time + + +# Initialize FastAPI application +app = FastAPI() + +# Define the request model +class PromptRequest(BaseModel): + prompt: str + +# data models +class Message(BaseModel): + role: str + content: str + + +class ChatCompletionRequest(BaseModel): + model: Optional[str] = "mock-gpt-model" + messages: List[Message] + max_tokens: Optional[int] = 512 + temperature: Optional[float] = 0.1 + stream: Optional[bool] = False + +# Global variable to store the LLM model +llm = None + + +def get_configs(): + + # Fetch configuration file path from environment variable + config_file = os.getenv("CONFIG_FILE", "") + + # Load configs from JSON file (if specified) + if config_file: + if not os.path.isfile(config_file): + raise FileNotFoundError(f"Config file {config_file} not found.") + try: + with open(config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 4, + "memory_per_gpu": 20000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "benchmarking": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "meta-llama/Meta-Llama-3.1-8B", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# Initialize model on startup +@app.on_event("startup") +async def startup_event(): + global llm + + # Initialize your LLM model configuration here + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + ff.init(configs_dict) + + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=2048, + max_tokens_per_batch=64, + ) + llm.start_server() + +# API endpoint to generate response +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + # Call the model to generate a response + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + + # Separate the prompt and response + split_output = full_output.split('\n', 1) + if len(split_output) > 1: + response_text = split_output[1] + else: + response_text = "" + + # Return the prompt and the response in JSON format + 
return { + "prompt": prompt_request.prompt, + "response": response_text + } + +@app.post("/chat/completions") +async def chat_completions(request: ChatCompletionRequest): + + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + if request.messages and request.messages[0].role == 'user': + resp_content = "As a mock AI Assitant, I can only echo your last message:" + request.messages[-1].content + else: + resp_content = "As a mock AI Assitant, I can only echo your last message, but there were no messages!" + + return { + "id": "1337", + "object": "chat.completion", + "created": time.time(), + "model": request.model, + "choices": [{"message": Message(role="assistant", content=resp_content)}], + } + +# Shutdown event to stop the model server +@app.on_event("shutdown") +async def shutdown_event(): + global llm + if llm is not None: + llm.stop_server() + +# Main function to run Uvicorn server +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) + +# Running within the entrypoint folder: +# uvicorn fastapi_incr:app --reload --port + +# Running within the python folder: +# uvicorn entrypoint.fastapi_incr:app --reload --port 3000 diff --git a/tests/inference/huggingface_inference_simple.py b/tests/inference/huggingface_inference_simple.py new file mode 100644 index 0000000000..f1cf8450b7 --- /dev/null +++ b/tests/inference/huggingface_inference_simple.py @@ -0,0 +1,51 @@ +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + GenerationConfig, +) + +model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" +do_sample = False +max_length = 128 +model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto",) +hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +generation_config = GenerationConfig.from_pretrained(model_name) +print(generation_config.do_sample) +generation_config.do_sample = do_sample +generation_config.num_beams=1 +generation_config.temperature = None +generation_config.top_p = None + + +def run_text_completion(): + prompt = "Help me plan a 1-week trip to Dubai" + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + + generated = model.generate( + batch["input_ids"], + max_new_tokens=max_length, + generation_config=generation_config, + ) + out = tokenizer.decode(generated[0]) + print(out) + +def run_chat_completion(): + messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + batch = tokenizer(tokenized_chat, return_tensors="pt") + + generated = model.generate( + batch["input_ids"], + max_new_tokens=max_length, + generation_config=generation_config, + ) + out = tokenizer.decode(generated[0], skip_special_tokens=True, clean_up_tokenization_spaces=True) + prompt_length = len(tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True, clean_up_tokenization_spaces=True)) + all_text = out[prompt_length:] + print(all_text) +run_chat_completion() \ No newline at end of file diff --git a/tests/inference/huggingface_pipeline.py b/tests/inference/huggingface_pipeline.py new file mode 100644 index 0000000000..95388e0a4b --- /dev/null +++ b/tests/inference/huggingface_pipeline.py @@ -0,0 +1,33 @@ +import transformers +from transformers 
import GenerationConfig + +model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" +do_sample = False + +generation_config = GenerationConfig.from_pretrained(model_id) +generation_config.do_sample = do_sample +generation_config.num_beams=1 +# generation_config.max_length = 128 +generation_config.temperature = None +generation_config.top_p = None +print(generation_config) + +pipeline = transformers.pipeline( + "text-generation", + model=model_id, + # model_kwargs={"torch_dtype": torch.bfloat16}, + device_map="auto", +) + +messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + +# messages="Help me plan a 1-week trip to Dubai" +outputs = pipeline( + messages, + max_new_tokens=128, + generation_config=generation_config, +) +print(outputs[0]["generated_text"][-1]['content']) \ No newline at end of file From d62d9beb020113047454b56e306d99625abb413b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 04:41:28 +0000 Subject: [PATCH 06/37] add max new tokens parameter --- include/flexflow/batch_config.h | 4 +- include/flexflow/flexflow_c.h | 3 +- include/flexflow/request_manager.h | 3 +- inference/incr_decoding/incr_decoding.cc | 2 +- inference/peft/peft.cc | 2 +- inference/peft/peft_bwd_benchmark.cc | 6 +- inference/peft/peft_fwd_benchmark.cc | 2 +- inference/peft/req_rate_benchmark.cc | 8 +- inference/python/entrypoint/fastapi_incr.py | 24 ++--- inference/python/streamlit/fastapi_incr.py | 2 +- inference/spec_infer/spec_infer.cc | 2 +- python/flexflow/core/flexflow_cffi.py | 59 +++++------ python/flexflow/serve/serve.py | 11 ++- src/c/flexflow_c.cc | 32 ++++-- src/ops/add_bias_residual_layer_norm.cpp | 2 +- src/ops/add_bias_residual_layer_norm.cu | 2 +- src/ops/inc_multihead_self_attention.cpp | 2 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/kernels/linear_kernels.cpp | 2 +- src/ops/kernels/linear_kernels.cu | 2 +- src/ops/kernels/lora_linear_kernels.cpp | 2 +- src/ops/kernels/lora_linear_kernels.cu | 2 +- src/ops/kernels/residual_rms_norm_kernels.cpp | 2 +- src/ops/kernels/residual_rms_norm_kernels.cu | 2 +- src/ops/kernels/rms_norm_kernels.cpp | 2 +- src/ops/kernels/rms_norm_kernels.cu | 2 +- src/ops/layer_norm.cpp | 2 +- src/ops/layer_norm.cu | 2 +- src/ops/residual_layer_norm.cpp | 2 +- src/ops/residual_layer_norm.cu | 2 +- src/ops/sigmoid_silu_multi.cpp | 2 +- src/ops/sigmoid_silu_multi.cu | 2 +- src/runtime/batch_config.cc | 4 +- src/runtime/beam_search_batch_config.cc | 4 +- src/runtime/request_manager.cc | 97 ++++++++++++------- src/runtime/tree_verify_batch_config.cc | 4 +- 36 files changed, 176 insertions(+), 129 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 873fed0bdb..a509af765c 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -87,7 +87,7 @@ class BatchConfig { first_token_depth_in_request = 0; first_token_offset_in_batch = 0; num_tokens_in_batch = 0; - max_sequence_length = 0; + max_length = 0; request_guid = 0; prompt_phase = false; batch_config_request_id = -1; @@ -98,7 +98,7 @@ class BatchConfig { int first_token_depth_in_request; int first_token_offset_in_batch; int num_tokens_in_batch; - int max_sequence_length; + int max_length; // request id in batch config: int batch_config_request_id = -1; diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 52b4b3d362..5aa2fdd551 100644 --- a/include/flexflow/flexflow_c.h +++ 
b/include/flexflow/flexflow_c.h @@ -627,7 +627,8 @@ void flexflow_model_generate(flexflow_model_t handle_, enum RequestType *request_types, char const **input_texts, char **output_texts, - int *max_seq_lengths, + int *max_lengths, + int *max_new_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f0fab957ee..36a56012fc 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -67,7 +67,8 @@ struct Request { }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; - int max_sequence_length = 128; + int max_length = -1; + int max_new_tokens = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c9ffff5c07..f8e16f24fa 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -271,7 +271,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; requests.push_back(inference_req); total_num_requests++; } diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index c55f2c0bfd..ee5bd1b460 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 86d6d8cbbf..df9a1e35db 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -308,7 +308,7 @@ void FlexFlow::top_level_task(Task const *task, for (int i = 0; i < 100; i++) { Request inference_req; inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; + inference_req.max_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -317,7 +317,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.warmup = true; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -361,7 +361,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = lengths[i]; - fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.max_length = lengths[i]; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1; diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 9ff042c157..9b020f5954 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -333,7 +333,7 @@ void FlexFlow::top_level_task(Task const *task, // sequence_length); Request inference_req; inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.max_length = prompt.second + prompt.first; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index 43008e74fe..cde3b1c02e 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -369,7 +369,7 @@ void FlexFlow::top_level_task(Task const *task, for (int i = 0; i < 100; i++) { Request inference_req; inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; + inference_req.max_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -379,7 +379,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.warmup = true; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -443,7 +443,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1000000000; @@ -473,7 +473,7 @@ void FlexFlow::top_level_task(Task const *task, // sequence_length); Request inference_req; inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.max_length = prompt.second + prompt.first; inference_req.peft_model_id = (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/python/entrypoint/fastapi_incr.py b/inference/python/entrypoint/fastapi_incr.py index f2830e6e5e..34f61739fb 100644 --- a/inference/python/entrypoint/fastapi_incr.py +++ b/inference/python/entrypoint/fastapi_incr.py @@ -60,32 +60,28 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 4, - "memory_per_gpu": 20000, + "num_gpus": 2, + "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 4, - "pipeline_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 8 * 1024, # 8GB + "offload_reserve_space_size": 1024**2, "use_4bit_quantization": False, "use_8bit_quantization": False, - "enable_peft": False, - "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, - "benchmarking": False, "inference_debugging": False, "fusion": True, } llm_configs = { # required parameters - "llm_model": "meta-llama/Meta-Llama-3.1-8B", + "llm_model": "tiiuae/falcon-7b", # optional parameters - "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "cache_path": "", "refresh_cache": False, "full_precision": False, "prompt": "", @@ -106,9 +102,7 @@ async def startup_event(): configs = SimpleNamespace(**configs_dict) ff.init(configs_dict) - ff_data_type = ( - ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF - ) + ff_data_type = ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF llm = ff.LLM( configs.llm_model, data_type=ff_data_type, @@ -123,7 +117,7 @@ async def startup_event(): llm.compile( generation_config, max_requests_per_batch=1, - max_seq_length=2048, + max_seq_length=256, max_tokens_per_batch=64, ) llm.start_server() diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py index 0bc20f3b0a..622f50008e 100644 --- a/inference/python/streamlit/fastapi_incr.py +++ b/inference/python/streamlit/fastapi_incr.py @@ -138,7 +138,7 @@ async def startup_event(): ) llm.compile( generation_config, - max_requests_per_batch=1, + max_requests_per_batch=16, max_seq_length=2048, max_tokens_per_batch=64, ) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 9689080825..134ae70c4a 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -421,7 +421,7 @@ void FlexFlow::top_level_task(Task const *task, // Add inference request Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; requests.push_back(inference_req); total_num_requests++; } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index d065398f87..ec07ee9a5f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -38,9 +38,10 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library -from typing import Union, List +from typing import Union, List, Optional +from dataclasses import dataclass from peft import LoraConfig -import json +import json, math def ffc(): @@ -2049,25 +2050,16 @@ def no_id_handle(): # Request # ----------------------------------------------------------------------- - +@dataclass class Request: """A 
class to record the metadata of an inference or finetuning request.""" - - def __init__( - self, - req_type: RequestType, - prompt: str = None, - max_sequence_length: int = 2048, - peft_model_id: PEFTModelID = None, - dataset_filepath: str = None, - max_training_steps: int = 1, - ): - self.req_type = req_type - self.prompt = prompt - self.max_sequence_length = max_sequence_length - self.peft_model_id = peft_model_id - self.dataset_filepath = dataset_filepath - self.max_training_steps = max_training_steps + req_type: RequestType + prompt: Optional[str] = None + max_length: int = -1 + max_new_tokens: int = 128 + peft_model_id: Optional[PEFTModelID] = None + dataset_filepath: Optional[str] = None + max_training_steps: int = 1 # ----------------------------------------------------------------------- @@ -4665,19 +4657,23 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2048): + def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_new_tokens: int = 128): + if max_length != -1 and max_new_tokens != -1: + warnings.warn(f"Both `max_new_tokens` (={self.max_new_tokens}) and `max_length`(={self.max_length}) seem to have been set. `max_new_tokens` will take precedence.") assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - max_num_chars = 5 * (max_sequence_length + 100) + estimated_max_tokens = math.ceil(max_new_tokens + max([len(prompt.split()) for prompt in prompt_list])*1.5) if max_new_tokens != -1 else max_length + max_num_chars = 5 * (estimated_max_tokens + 100) c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] c_output_length_and_tokens = [ - ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list + ffi.new("int[]", estimated_max_tokens + 100) for prompt in prompt_list ] c_request_types = [ enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list ] - max_sequence_lengths = [max_sequence_length for prompt in prompt_list] + max_lengths = [max_length for prompt in prompt_list] + max_new_tokens_ = [max_new_tokens for prompt in prompt_list] peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list] dataset_filepaths = [ffi.NULL for prompt in prompt_list] training_steps = [0 for prompt in prompt_list] @@ -4689,7 +4685,8 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 2 c_request_types, c_input_texts, c_output_texts, - max_sequence_lengths, + max_lengths, + max_new_tokens_, peft_model_ids, dataset_filepaths, training_steps, @@ -4726,9 +4723,16 @@ def generate(self, requests_list: List[Request]): c_request_types = [ enum_to_int(RequestType, request.req_type) for request in requests_list ] - max_sequence_lengths = [ - request.max_sequence_length for request in requests_list + max_lengths = [ + request.max_length for request in requests_list + ] + max_new_tokens_ = [ + request.max_new_tokens for request in requests_list ] + for i in range(len(requests_list)): + if max_lengths[i] != -1 and max_new_tokens_[i] != -1: + warnings.warn(f"Both `max_new_tokens` (={max_new_tokens_[i]}) and `max_length`(={max_lengths[i]}) seem to have been set. 
`max_new_tokens` will take precedence.") + peft_model_ids = [ ( request.peft_model_id @@ -4752,7 +4756,8 @@ def generate(self, requests_list: List[Request]): c_request_types, c_input_texts, c_output_texts, - max_sequence_lengths, + max_lengths, + max_new_tokens_, peft_model_ids, dataset_filepaths, training_steps, diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 988789bab4..32e8e49453 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -498,12 +498,17 @@ def compile( def generate( self, requests_or_prompts: Union[str, List[str], Request, List[Request]], - max_length: int = 2048, + max_length: int = -1, + max_new_tokens: int = 128, ): """Generate tokens based on the input prompt(s) :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests :type requests_or_prompts: Union[str, List[str], Request, List[Request]] + :param max_length: The maximum length in tokens of the prompt + generated sequence, defaults to -1 (no maximum length) + :type max_length: int, optional + :param max_new_tokens: The maximum number of new tokens (excluding the prompt) to generate, defaults to 128 + :type max_new_tokens: int, optional :return: the generation results :rtype: GenerationResult """ @@ -511,7 +516,7 @@ def generate( if len(requests_or_prompts) == 0: return None return self.model.ffmodel.generate_inf_only( - [requests_or_prompts], max_length + [requests_or_prompts], max_length, max_new_tokens ) elif type(requests_or_prompts) == Request: return self.model.ffmodel.generate(requests_or_prompts) @@ -520,7 +525,7 @@ def generate( return [] if type(requests_or_prompts[0]) == str: return self.model.ffmodel.generate_inf_only( - requests_or_prompts, max_length + requests_or_prompts, max_length, max_new_tokens ) else: print(requests_or_prompts) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e39cb29037..e6b246597f 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1622,7 +1622,8 @@ void flexflow_model_generate(flexflow_model_t handle_, enum RequestType *request_types, char const **input_texts, char **output_texts, - int *max_seq_lengths, + int *max_lengths, + int *max_new_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, @@ -1637,21 +1638,24 @@ void flexflow_model_generate(flexflow_model_t handle_, std::string const text_str(input_texts[i]); Request inference_req; inference_req.prompt = text_str; - inference_req.max_sequence_length = max_seq_lengths[i]; + inference_req.max_length = max_lengths[i]; + inference_req.max_new_tokens = max_new_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { inference_req.peft_model_id = *peft_model_id; } requests.push_back(inference_req); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", + DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", i, handle, text_str.c_str(), - max_seq_lengths[i]); + max_lengths[i], + max_new_tokens_[i]); } else if (request_types[i] == RequestType::REQ_FINETUNING) { Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + fine_tuning_req.max_length = max_lengths[i]; + fine_tuning_req.max_new_tokens = max_new_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { fine_tuning_req.peft_model_id = *peft_model_id; @@ -1660,11 +1664,12 @@ void 
flexflow_model_generate(flexflow_model_t handle_, fine_tuning_req.dataset_filepath = dataset_fp; fine_tuning_req.max_training_steps = training_steps[i]; requests.push_back(fine_tuning_req); - DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i", + DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i %i", i, handle, dataset_fp.c_str(), - max_seq_lengths[i], + max_lengths[i], + max_new_tokens[i], training_steps[i]); } else { assert(false && "Unknown request type"); @@ -1678,8 +1683,17 @@ void flexflow_model_generate(flexflow_model_t handle_, // If the prompt exceeds max seq len, check that we return the prompt with // no additional token. Otherwise, check that the output does not exceed // the max sequence length. - assert(results[i].output_tokens.size() <= max_seq_lengths[i] || - results[i].output_tokens.size() == results[i].input_tokens.size()); + int total_tokens = results[i].output_tokens.size(); + int num_output_tokens = total_tokens - results[i].input_tokens.size(); + if (max_new_tokens_[i] >= 0) { + assert(num_output_tokens <= max_new_tokens_[i]); + } + if (max_lengths[i] >= 0) { + assert(total_tokens <= max_lengths[i] || num_output_tokens == 0); + } + // assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + // results[i].output_tokens.size() == + // results[i].input_tokens.size()); output_length_and_tokens[i][0] = results[i].output_tokens.size(); std::copy(results[i].output_tokens.begin(), results[i].output_tokens.end(), diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 681f55c998..cb140e0c75 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -224,7 +224,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index bcca1ba2c6..2d2707f10b 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -222,7 +222,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 826fea4347..92cfdef5a6 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -1526,7 +1526,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { diff --git 
a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b278611b60..39c7397f6b 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1492,7 +1492,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index a36d6719c9..6b371b840e 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -238,7 +238,7 @@ void inference_kernel_wrapper(LinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; if (bc->requestsInfo[i].peft_bwd) { size_t activation_size_needed = diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index d4f930db6c..ffd2c66c9b 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -239,7 +239,7 @@ void inference_kernel_wrapper(LinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; if (bc->requestsInfo[i].peft_bwd) { size_t activation_size_needed = diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp index c3c2cce3cf..eab8899167 100644 --- a/src/ops/kernels/lora_linear_kernels.cpp +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -249,7 +249,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != m->model_state.end()); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 5f130782aa..93e5820f9c 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -248,7 +248,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != m->model_state.end()); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 016364edfd..cbdb8ee153 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -273,7 +273,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = 
bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 0d44f0260a..285a5a5b8f 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -270,7 +270,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 4158628005..551cb72022 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -227,7 +227,7 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index dd6ada864d..8f59d65ea7 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -225,7 +225,7 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 27d314e21e..2fe4a85905 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -256,7 +256,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 0801d11617..b08b23819c 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -255,7 +255,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/residual_layer_norm.cpp 
b/src/ops/residual_layer_norm.cpp index 582e0752ef..7f6b0b370d 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -280,7 +280,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 8cdf87a92c..6caf6b436d 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -278,7 +278,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index ceaa1a7788..50a358beab 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -130,7 +130,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { size_t input_tensor_size = diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 929d557a17..ca0168a59d 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -129,7 +129,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { size_t input_tensor_size = diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 4c339750c7..a4bf960a2c 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -162,8 +162,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " BatchConfig Req ID: " << bc.requestsInfo[i].batch_config_request_id << std::endl; os << " Prompt phase: " << bc.requestsInfo[i].prompt_phase diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index b10f8e82ab..83e4390993 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -141,8 +141,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << 
std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; os << " Beam Search Specific: " << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 31a32dd3c8..44b181fcb3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -54,7 +54,8 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { os << "Request {\n"; os << " guid: " << req.guid << "\n"; os << " peft_model_id: " << req.peft_model_id << "\n"; - os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " max_length: " << req.max_length << "\n"; + os << " max_new_tokens: " << req.max_new_tokens << "\n"; os << " initial_len: " << req.initial_len << "\n"; os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; os << " llm_cache_size: " << req.llm_cache_size << "\n"; @@ -261,24 +262,45 @@ RequestManager::RequestGuid Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = request_.max_sequence_length; + request.max_length = request_.max_length; + request.max_new_tokens = request_.max_new_tokens; + if (request.max_length != -1 && request.max_new_tokens != -1) { + std::cout + << "Both `max_new_tokens` (=" << request.max_new_tokens + << ") and `max_length`(=" << request.max_length + << ") seem to have been set. `max_new_tokens` will take precedence."; + } request.peft_model_id = request_.peft_model_id; request.warmup = request_.warmup; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens < get_max_sequence_length()); + assert(request_.benchmarking_tokens < get_max_sequence_length() && + "Benchmarking tokens exceed max sequence length"); request.benchmarking_tokens = request_.benchmarking_tokens; request.tokens.insert(request.tokens.end(), request_.benchmarking_tokens, 15); // insert random number } else { std::vector tokens = this->tokenizer_->Encode(request_.prompt); + // from here on, we will only use the max_length parameter + if (request.max_new_tokens != -1) { + request.max_length = tokens.size() + request.max_new_tokens; + } + // check that max sequence length is not exceeded + // 1. prompt itself should be less than max sequence length if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; + std::cout << "Error: prompt (" << tokens.size() + << " tokens) exceeds max sequence length of " + << get_max_sequence_length() << ".\n"; + return INVALID_GUID; + } + // 2. 
max_length should not exceed the max_sequence_length + if (request.max_length >= get_max_sequence_length()) { + std::cout << "Error: max_length (" << request.max_length + << ") exceeds max sequence length of " + << get_max_sequence_length() << ".\n"; return INVALID_GUID; } for (int i = 0; i < tokens.size(); i++) { @@ -341,7 +363,18 @@ RequestManager::RequestGuid request.status = Request::PENDING; request.guid = next_available_guid++; request.initial_len = 0; - request.max_sequence_length = request_.max_sequence_length; + request.max_length = request_.max_length; + request.max_new_tokens = request_.max_new_tokens; + if (request.max_length != -1) { + std::cout << "Warning: max_length is set for PEFT finetuning, but it will " + "be ignored." + << std::endl; + } + if (request.max_new_tokens != -1) { + std::cout << "Warning: max_new_tokens is set for PEFT finetuning, but " + "it will be ignored." + << std::endl; + } request.peft_model_id = request_.peft_model_id; request.req_type = RequestType::REQ_FINETUNING; request.completed_training_steps = 0; @@ -352,7 +385,8 @@ RequestManager::RequestGuid // Load dataset if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens <= get_max_sequence_length()); + assert(request_.benchmarking_tokens <= get_max_sequence_length() && + "Benchmarking tokens exceed max sequence length"); request.benchmarking_tokens = request_.benchmarking_tokens; std::vector input_tokens; std::vector output_tokens; @@ -385,9 +419,10 @@ RequestManager::RequestGuid this->tokenizer_->Encode(output_text); if (input_tokens.size() + output_tokens.size() > get_max_sequence_length()) { - std::cout << "Warning: too many tokens in sample, only load up to " - << get_max_sequence_length() << " tokens, but got " - << input_tokens.size() + output_tokens.size() << ".\n"; + std::cout << "Error: sample in training dataset is " + << input_tokens.size() + output_tokens.size() + << " tokens long, exceeding the maximum sequence length of " + << get_max_sequence_length() << " tokens.\n"; return INVALID_GUID; } else { request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); @@ -515,7 +550,7 @@ bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; bool request_completed = false; // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + if (request.tokens.size() >= old_bc.requestsInfo[i].max_length) { request_completed = true; } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id @@ -698,8 +733,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == @@ -765,8 +799,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; + new_bc.requestsInfo[i].max_length = new_request.max_length; 
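The precedence and bounds checks above reduce to a small resolution step; a sketch of that logic, assuming the helper name and the -1 "unset" convention used in this file:

    // Resolve max_new_tokens into an absolute max_length and validate both limits.
    bool resolve_and_check_limits(Request &req, int prompt_tokens, int max_sequence_length) {
      if (req.max_new_tokens != -1) {
        req.max_length = prompt_tokens + req.max_new_tokens;  // max_new_tokens wins
      }
      if (prompt_tokens >= max_sequence_length) {
        return false;  // prompt alone already exceeds the limit
      }
      if (req.max_length >= max_sequence_length) {
        return false;  // requested budget exceeds the limit (-1 means "no cap" and passes)
      }
      return true;  // a false return maps to INVALID_GUID in register_new_request
    }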
new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -932,8 +965,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_active_infr_tokens(); new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[inference_batch_size].max_sequence_length = - request.max_sequence_length; + new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; @@ -1076,10 +1108,10 @@ BeamSearchBatchConfig verified_tokens.size()); // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= - request.max_sequence_length) { + request.max_length) { // Append all verified tokens to the request for (auto const &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { + if (token_pair.second < request.max_length) { request.tokens.push_back(token_pair.first); } } @@ -1171,14 +1203,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = - new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].max_length - new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; @@ -1254,8 +1285,7 @@ BeamSearchBatchConfig request.ssm_cache_size; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; @@ -1307,8 +1337,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; + new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request @@ -1484,8 +1513,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; profiling_requests[request.guid].ssm_decoding_steps += 1; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata @@ -1613,8 +1641,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; 
new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata @@ -1816,8 +1843,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = + old_batches.at(0).requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // copy bitmask to verify batchconfig @@ -1958,8 +1985,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = + old_batches.at(0).requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index a71b1070b2..f8ac6089fe 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -58,8 +58,8 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; } From 85797e091cebad393649525fc5623ab32b03fe11 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 07:50:51 +0000 Subject: [PATCH 07/37] backup --- include/flexflow/model.h | 7 +- include/flexflow/ops/lora_linear.h | 8 +- include/flexflow/ops/lora_linear_params.h | 4 +- python/flexflow/serve/serve.py | 42 +++++++++ src/ops/lora_linear.cc | 105 ++++++++++++++++++++-- src/runtime/inference_manager.cc | 1 + 6 files changed, 154 insertions(+), 13 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4ad735ef7d..5ac91d5b81 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -845,7 +845,8 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== - PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); +// PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + void add_lora_layers(std::vector target_modules, int max_rank, int max_concurrent_adapters); // ======================================== // Inference APIs // ======================================== @@ -1180,8 +1181,8 @@ class FFModel { std::vector parameters; // PEFT related std::unordered_map base_layer_to_peft_layer; - std::unordered_map> peft_layer_to_peft_id; - 
std::unordered_map peft_configs; +// std::unordered_map> peft_layer_to_peft_id; +// std::unordered_map peft_configs; // std::vector peft_operators; FFHandler handlers[MAX_NUM_WORKERS]; diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 9e83c3f90e..8d37be0c64 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -23,7 +23,9 @@ class LoraLinear : public Op { OperatorType type, ParallelTensor const input, ParallelTensor const output, - std::unordered_map const &_peft_configs, + int max_rank, + int max_concurrent_adapters, + // std::unordered_map const &_peft_configs, char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, @@ -91,7 +93,9 @@ class LoraLinear : public Op { // size_t get_params_hash() const override; LoraLinearParams get_params() const; - std::unordered_map peft_configs; + // std::unordered_map peft_configs; + int max_rank; + int max_concurrent_adapters; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 70539271f2..1cdeb65aa2 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -129,7 +129,9 @@ class LoraLinearParams { public: LayerID layer_guid; OperatorType type; - std::unordered_map peft_configs; + // std::unordered_map peft_configs; + int max_rank; + int max_concurrent_adapters; char name[MAX_OPNAME]; bool is_valid(std::pair const diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 32e8e49453..794f1babb3 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -611,3 +611,45 @@ def compile( model_specific_pipeline_parallelism_degree, ssms, ) + +from safetensors import safe_open +from huggingface_hub import hf_hub_download +def download_and_convert_peft_model(peft_model_id: str, data_type: DataType = DataType.DT_HALF, cache_path: str = "", refresh_cache: bool = False): + if data_type != DataType.DT_FLOAT and data_type != DataType.DT_HALF: + raise ValueError("data_type must be either DataType.DT_FLOAT or DataType.DT_HALF") + adapter_path = hf_hub_download(repo_id=peft_model_id, filename="adapter_model.safetensors") + peft_config = PeftConfig.from_pretrained(peft_model_id) + base_model_name_or_path = peft_config.base_model_name_or_path + llm = LLM(base_model_name_or_path, data_type, cache_path, refresh_cache) + + # Save peft config to file + peft_config_dir = os.path.join( + os.path.expanduser(llm.cache_path), "configs", peft_model_id.lower() + ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) + + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + # Save peft weights to file + with safe_open(adapter_path, framework="pt", device="cpu") as f: + for tensor_name in f.keys(): + tensor = f.get_tensor(tensor_name) + if data_type == DataType.DT_HALF: + tensor = tensor.half() + else: + tensor = tensor.float() + tensor_name = tensor_name.replace("base_model.model.model.", "").replace(".default", "") + print(tensor_name) + + tensor_name = llm.model_class.convert_hf_weight_name(tensor_name) + 
tensor.detach().cpu().numpy().tofile(f"{llm.weights_path}/{tensor_name}") + diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index fde6bc2b28..e97087ea68 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -51,6 +51,82 @@ bool check_lora_layer_match(Layer *potential_target, return false; } +void FFmodel::add_lora_layers(std::vector target_modules, int max_rank, int max_concurrent_adapters) { + assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); + assert(target_modules.size() > 0 && "LoRA target module name is empty"); + assrt(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); + assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); + + for (std::string target_module_name : target_modules) { + assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) { + continue; + } + assert(base_layer_to_peft_layer.find(target_module) == base_layer_to_peft_layer.end() && "LoRA layer already added, attempting to add again"); + // Get input and output tensors from target module + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + // Compute OP_LORA layer name, based on target module name + std::string name_ = target_module->name + ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; + } + } + name_.erase(last_underscore); + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + // Create OP_LORA layer given input, output and name + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + // fix LoRA layer's transformer layer ID and model ID (to be the same as target module) + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; + // set up output tensor for OP_LORA layer + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); + } + // pass max_rank and max_concurrent_adapters to OP_LORA layer + peft_layer->add_int_property("max_rank", max_rank); + peft_layer->add_int_property("max_concurrent_adapters", max_concurrent_adapters); + it = layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + } + } +} + +#ifdef DEADCODE PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); @@ -175,11 +251,18 @@ PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { return peft_model_id; } +#endif Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { + long long value; + 
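With the old per-adapter add_lora_layer() path fenced off, LoRA capacity is now reserved up front for a set of target modules. A hypothetical call site using the signature introduced in this patch (later patches in this series move max_rank and max_concurrent_adapters into the RequestManager):

    // Reserve LoRA slots on every layer whose name matches a target module.
    std::vector<std::string> target_modules = {"down_proj"};
    int max_rank = 16;                // upper bound on the rank of any adapter loaded later
    int max_concurrent_adapters = 4;  // adapter slots kept resident per target layer
    model.add_lora_layers(target_modules, max_rank, max_concurrent_adapters);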
layer->get_int_property("max_rank", value); + int max_rank = value; + layer->get_int_property("max_concurrent_adapters", max_concurrent_adapters); + int max_concurrent_adapters = value; +#ifdef DEADCODE std::unordered_map _peft_configs; std::vector const &peft_ids = model.peft_layer_to_peft_id[(Layer *)layer]; @@ -187,12 +270,14 @@ Op *LoraLinear::create_operator_from_layer( _peft_configs.emplace( std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); } +#endif return new LoraLinear(model, layer->layer_guid, layer->op_type, inputs[0], inputs[1], - _peft_configs, + max_rank, + max_concurrent_adapters, layer->name); } @@ -205,7 +290,8 @@ LoraLinear::LoraLinear(FFModel &model, other.op_type, input, output, - other.peft_configs, + other.max_rank, + other.max_concurrent_adapters, other.name) {} LoraLinear::LoraLinear(FFModel &model, @@ -217,7 +303,8 @@ LoraLinear::LoraLinear(FFModel &model, params.type, inputs.first, inputs.second, - params.peft_configs, + params.max_rank, + params.max_concurrent_adapters, params.name) {} LoraLinear::LoraLinear( @@ -226,7 +313,9 @@ LoraLinear::LoraLinear( OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, - std::unordered_map const &_peft_configs, + int _max_rank, + int _max_concurrent_adapters, + // std::unordered_map const &_peft_configs, char const *name) : Op(model, _op_type, @@ -256,9 +345,11 @@ LoraLinear::LoraLinear( outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } - for (auto const &kv : _peft_configs) { - peft_configs.insert(kv); - } + // for (auto const &kv : _peft_configs) { + // peft_configs.insert(kv); + // } + max_rank = _max_rank; + max_concurrent_adapters = _max_concurrent_adapters; // assert(check_output_input_weight_parallel_dims(allocate_weights)); } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 1b65dfd869..20b2a5b963 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -837,4 +837,5 @@ std::string join_path(std::vector const &paths) { return joined; } + }; // namespace FlexFlow From bb08d695c127d5a0639c6399db2f7ba6c9fec315 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 18:01:18 +0000 Subject: [PATCH 08/37] update --- include/flexflow/utils/peft_weight_allocator.h | 15 +++++++++++++++ src/runtime/model.cu | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index dae46a8af1..0d43d4722b 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -21,6 +21,7 @@ namespace FlexFlow { +#ifdef DEACODE class PEFTWeightAllocator { public: PEFTWeightAllocator(void *_base_ptr, size_t _total_size) @@ -86,6 +87,20 @@ class PEFTWeightAllocator { std::unordered_map> sync_weights; std::mutex peft_weight_allocator_mutex; }; +#endif + +class PEFTMemoryManager { +public: + PEFTMemoryManager(int max_rank_, int max_concurrent_adapters_, int lora_in_dim, int lora_out_dim) : max_rank(max_rank_), max_concurrent_adapters(max_concurrent_adapters_), lora_in_dim(lora_in_dim), lora_out_dim(lora_out_dim) {} + + void allocate_memory(); + void register_peft_model(PEFTModelID const &model_id); + + + + int max_rank, max_concurrent_adapters; + int lora_in_dim, lora_out_dim; +} }; // namespace FlexFlow diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 5dab73e1a4..136ce99edd 100644 --- a/src/runtime/model.cu 
+++ b/src/runtime/model.cu @@ -168,7 +168,7 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } - +#ifdef DEADCODE if (info->peft_activation_reserve_space_size > 0) { // allocate memory for peft activation reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -208,6 +208,7 @@ FFHandler } else { handle.peft_weight_allocator = nullptr; } +#endif // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; From 62275c22aa37428a711bdfceacafd1477b4294dd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 2 Oct 2024 04:13:24 +0000 Subject: [PATCH 09/37] backup --- .../ops/kernels/lora_linear_kernels.h | 5 +- .../flexflow/utils/peft_weight_allocator.h | 78 +++++++++++++++++-- src/ops/lora_linear.cc | 5 ++ 3 files changed, 79 insertions(+), 9 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 5360b5f8ea..2fde38728a 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -35,8 +35,9 @@ class LoraLinearMeta : public OpMeta { // PEFT related fields void *low_rank_activation; void *input_activation; - std::unordered_map model_state; - size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; + // std::unordered_map model_state; + // size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; + PEFTMemoryManager *peft_memory_manager; }; namespace Kernels { diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 0d43d4722b..9028656949 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -91,15 +91,79 @@ class PEFTWeightAllocator { class PEFTMemoryManager { public: - PEFTMemoryManager(int max_rank_, int max_concurrent_adapters_, int lora_in_dim, int lora_out_dim) : max_rank(max_rank_), max_concurrent_adapters(max_concurrent_adapters_), lora_in_dim(lora_in_dim), lora_out_dim(lora_out_dim) {} - - void allocate_memory(); - void register_peft_model(PEFTModelID const &model_id); + PEFTMemoryManager(size_t max_lora_size_, int max_concurrent_adapters_) + : max_concurrent_adapters(max_concurrent_adapters_), max_lora_size(max_lora_size_), base_ptr(nullptr) {} + + // allocate memory for all the PEFT adapters for a given layer on a given shard + void allocate_memory(Memory gpu_mem) { + // allocate chunk of memory for all the PEFT adapters + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); + } + // Returns the slot in memory where the peft model weights are/will be stored. + // If the model is not in memory (cache miss), set the cache_miss flag to true. 
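The handle lookup described in the comment above (and defined just below) behaves like a fixed-capacity LRU cache over per-layer adapter slots. A sketch of how an op kernel might consume it, with names assumed from this header and from the LoraLinearMeta change in this patch:

    // Fetch (or assign) the weight slot for this request's adapter.
    bool cache_miss = false;
    void *slot = m->peft_memory_manager->get_peft_model_handle(model_id, &cache_miss);
    if (cache_miss) {
      // The slot may still hold an evicted adapter's weights: (re)load this
      // adapter's A/B matrices into `slot` before launching the kernel.
    }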
+ void *get_peft_model_handle(PEFTModelID const &model_id, bool *cache_miss) { + assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); + assert(lru_hashtable.size() == lru_list.size() && + lru_list.size() == peft2mem_slot.size() && + "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out of sync"); + // check for cache hit + if (lru_hashtable.find(model_id) != lru_hashtable.end()) { + int lru_list_index = lru_hashtable[model_id]; + assert(lru_list[lru_list_index] == model_id && + "PEFT Memory Manager LRU hashtable/list are out of sync"); + // move the model to the end of the LRU list + lru_list.erase(lru_list.begin() + lru_list_index); + lru_list.push_back(model_id); + // update the LRU hashtable + lru_hashtable[model_id] = lru_list.size() - 1; + // get memory slot + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); + *cache_miss = false; + } else { + // cache miss + // check if you need to evict + bool need_to_evict = lru_list.size() == max_concurrent_adapters; + int mem_slot = -1; + if (need_to_evict) { + // evict the least recently used model + PEFTModelID lru_model_id = lru_list[0]; + lru_list.erase(lru_list.begin()); + lru_hashtable.erase(lru_model_id); + mem_slot = peft2mem_slot[lru_model_id]; + peft2mem_slot.erase(lru_model_id); + } else { + mem_slot = lru_list.size(); + } + // update the LRU list and hashtable + lru_list.push_back(model_id); + lru_hashtable[model_id] = lru_list.size() - 1; + // update the memory slot + peft2mem_slot[model_id] = mem_slot; + *cache_miss = true; + } + return static_cast(base_ptr) + peft2mem_slot[model_id]*max_lora_size; + } - - int max_rank, max_concurrent_adapters; - int lora_in_dim, lora_out_dim; + int max_concurrent_adapters; + size_t max_lora_size; + Realm::RegionInstance peftLegionInst; + void *base_ptr; + std::unordered_map lru_hashtable; + std::vector lru_list; // head = least recently used, tail=most recently used + std::unordered_map peft2mem_slot; } }; // namespace FlexFlow diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index e97087ea68..0277c008cc 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -518,6 +518,11 @@ OpMeta *LoraLinear::init_task(Task const *task, } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); + + size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); + m->peft_memory_manager = new PEFTMemoryManager(max_lora_size, lora->max_concurrent_adapters); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); + m->peft_memory_manager->allocate_memory(gpu_mem); for (auto const &kv : lora->peft_configs) { PEFTModelID const &model_id = kv.first; From 88d60ca294f36ef4ba54fc5c2369058f6a7210d4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 2 Oct 2024 07:47:35 +0000 Subject: [PATCH 10/37] lora configs serialize / deserialize into single file --- include/flexflow/batch_config.h | 6 + include/flexflow/ops/lora_linear_params.h | 122 ++++++++++++++---- .../flexflow/utils/peft_weight_allocator.h | 7 +- src/ops/lora_linear.cc | 10 +- src/ops/lora_linear_params.cc | 32 ----- 5 files changed, 114 insertions(+), 63 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index a509af765c..29915bf2d9 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -44,6 +44,11 @@ struct OptimizerTasks { bool save_updated_weights = false; }; +struct 
NewPeftModelPath { + PEFTModelID peft_model_id; + std::string filepath; +}; + void set_optimizer_tasks(OptimizerTasks &tasks, int max_training_steps, int completed_training_steps, @@ -135,6 +140,7 @@ class BatchConfig { PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; PerTokenInfo labelsInfo[MAX_NUM_TOKENS]; + NewPeftModelPath new_peft_model_paths[MAX_NUM_REQUESTS]; bool request_completed[MAX_NUM_REQUESTS]; bool request_running[MAX_NUM_REQUESTS]; diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 1cdeb65aa2..2d8e5360dd 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -17,7 +17,10 @@ namespace FlexFlow { class LoraOptimizerConfig { public: LoraOptimizerConfig(); - virtual ~LoraOptimizerConfig() {} + virtual std::string getType() const = 0; + virtual nlohmann::json toJson() const = 0; + static std::unique_ptr fromJson(const nlohmann::json& j); + virtual ~LoraOptimizerConfig() = default; }; class LoraSGDOptimizerConfig : public LoraOptimizerConfig { @@ -29,9 +32,25 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig { bool weight_decay_ = 0.0f); friend std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc); - - NLOHMANN_DEFINE_TYPE_INTRUSIVE( - LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay) + + std::string getType() const override { return "SGD"; } + + nlohmann::json toJson() const override { + return {{"type", "SGD"}, + {"lr", lr}, + {"momentum", momentum}, + {"nesterov", nesterov}, + {"weight_decay", weight_decay}}; + } + + static std::unique_ptr fromJson(const nlohmann::json& j) { + auto sgd = std::make_unique(); + sgd->lr = j["lr"]; + sgd->momentum = j["momentum"]; + sgd->nesterov = j["nesterov"]; + sgd->weight_decay = j["weight_decay"]; + return sgd; + } public: double lr = 0.001f; @@ -50,9 +69,27 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { double epsilon_ = 1e-8); friend std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc); - - NLOHMANN_DEFINE_TYPE_INTRUSIVE( - LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon) + + std::string getType() const override { return "Adam"; } + + nlohmann::json toJson() const override { + return {{"type", "Adam"}, + {"alpha", alpha}, + {"beta1", beta1}, + {"beta2", beta2}, + {"weight_decay", weight_decay}, + {"epsilon", epsilon}}; + } + + static std::unique_ptr fromJson(const nlohmann::json& j) { + auto adam = std::make_unique(); + adam->alpha = j["alpha"]; + adam->beta1 = j["beta1"]; + adam->beta2 = j["beta2"]; + adam->weight_decay = j["weight_decay"]; + adam->epsilon = j["epsilon"]; + return adam; + } public: // Adam @@ -63,13 +100,13 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { double epsilon = 1e-8; }; -// Serialization helpers -template -void serialize_to_json_file(T const &obj, fs::path const &filepath); +std::unique_ptr LoraOptimizerConfig::fromJson(const nlohmann::json& j) { + std::string type = j["type"]; + if (type == "SGD") return LoraSGDOptimizerConfig::fromJson(j); + if (type == "Adam") return LoraAdamOptimizerConfig::fromJson(j); + throw std::runtime_error("Unknown optimizer type"); +} -// Function to deserialize JSON from file and create object -template -std::unique_ptr deserialize_from_json_file(fs::path const &filepath); class LoraLinearConfig { public: @@ -87,22 +124,54 @@ class LoraLinearConfig { std::vector const &target_modules_ = {}); // constructor used to 
support std::unordered_map LoraLinearConfig(); + template + void setOptimizer(T&& opt) { + optimizer_config = std::make_unique(std::forward(opt)); + } friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); - - NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig, - cache_folder, - peft_model_id, - rank, - lora_alpha, - lora_dropout, - target_modules, - trainable, - init_lora_weights, - base_model_name_or_path, - precision) + void serialize_to_json_file(const std::string& filename) const { + json j = { + {"cache_folder", cache_folder}, + {"peft_model_id", peft_model_id}, + {"rank", rank}, + {"lora_alpha", lora_alpha}, + {"lora_dropout", lora_dropout}, + {"target_modules", target_modules}, + {"trainable", trainable}, + {"init_lora_weights", init_lora_weights}, + {"base_model_name_or_path", base_model_name_or_path}, + {"precision", precision}, + {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + }; + + std::ofstream file(filename); + file << j.dump(4); // Use 4 spaces for indentation + } + // Deserialization method + static LoraLinearConfig deserialize_from_json_file(const std::string& filename) { + std::ifstream file(filename); + json j; + file >> j; + LoraLinearConfig metadata( + j["cache_folder"].get(), + j["peft_model_id"].get>(), + j["rank"].get(), + j["lora_alpha"].get(), + j["lora_dropout"].get(), + j["target_modules"].get>(), + j["trainable"].get(), + j["init_lora_weights"].get(), + j["base_model_name_or_path"].get(), + j["precision"].get() + ); + if (!j["optimizer_config"].is_null()) { + metadata.optimizer_config = LoraOptimizerConfig::fromJson(j["optimizer_config"]); + } + return metadata; + } std::string cache_folder; // Huggingface model ID (for download and/or upload) @@ -116,7 +185,8 @@ class LoraLinearConfig { // whether the weights are trainable (fine-tuning scenario) or not // (inference-only). If set to true, allocate space for the gradients bool trainable = false; - LoraOptimizerConfig *optimizer_config; + // LoraOptimizerConfig *optimizer_config; + std::unique_ptr optimizer_config; // whether to initialize weights randomly (instead of attempting to load them // from file) bool init_lora_weights; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 9028656949..7c1bd01ea5 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -95,7 +95,7 @@ class PEFTMemoryManager { : max_concurrent_adapters(max_concurrent_adapters_), max_lora_size(max_lora_size_), base_ptr(nullptr) {} // allocate memory for all the PEFT adapters for a given layer on a given shard - void allocate_memory(Memory gpu_mem) { + void allocate_inference_memory(Memory gpu_mem) { // allocate chunk of memory for all the PEFT adapters Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), @@ -111,6 +111,9 @@ class PEFTMemoryManager { .wait(); base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); } + void allocate_finetuning_memory(Memory gpu_mem) { + + } // Returns the slot in memory where the peft model weights are/will be stored. // If the model is not in memory (cache miss), set the cache_miss flag to true. 
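These helpers give LoraLinearConfig, including its optional optimizer config, a JSON file round-trip; the optimizer subtype is recovered from the "type" field. A minimal usage sketch with illustrative paths and values:

    // Serialize a trainable config, then rebuild an equivalent one from disk.
    LoraLinearConfig config("/path/to/cache", "my-org/llama-3-8b-lora");  // illustrative IDs
    config.trainable = true;
    config.setOptimizer(LoraSGDOptimizerConfig(0.001, 0.9));
    config.serialize_to_json_file("peft_config.json");
    LoraLinearConfig restored = LoraLinearConfig::deserialize_from_json_file("peft_config.json");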
@@ -160,7 +163,7 @@ class PEFTMemoryManager { int max_concurrent_adapters; size_t max_lora_size; Realm::RegionInstance peftLegionInst; - void *base_ptr; + void *base_ptr; void *finetuning_ptr; std::unordered_map lru_hashtable; std::vector lru_list; // head = least recently used, tail=most recently used std::unordered_map peft2mem_slot; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 0277c008cc..f4c1ba9c35 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -519,12 +519,17 @@ OpMeta *LoraLinear::init_task(Task const *task, std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); + // allocate space for lora weights size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); m->peft_memory_manager = new PEFTMemoryManager(max_lora_size, lora->max_concurrent_adapters); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - m->peft_memory_manager->allocate_memory(gpu_mem); + m->peft_memory_manager->allocate_inference_memory(gpu_mem); - for (auto const &kv : lora->peft_configs) { + return m; +} + +void load_peft_adapters(BatchConfig const *bc){ + for (auto const &kv : bc->peft_configs) { PEFTModelID const &model_id = kv.first; LoraLinearConfig const &lora_config = kv.second; @@ -680,7 +685,6 @@ OpMeta *LoraLinear::init_task(Task const *task, m->model_state[model_id].cache_folder = lora_config.cache_folder; m->model_state[model_id].peft_model_id = lora_config.peft_model_id; } - return m; } void LoraLinear::forward(FFModel const &ff) { diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 6e0c60e057..310b6d0973 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -50,38 +50,6 @@ std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) { return os; } -// Serialization helpers -template -void serialize_to_json_file(T const &obj, fs::path const &filepath) { - json j = obj; - std::ofstream file(filepath); - file << j.dump(4); -} - -template -std::unique_ptr deserialize_from_json_file(fs::path const &filepath) { - std::ifstream file(filepath); - json j; - file >> j; - return std::make_unique(j.get()); -} - -template void - serialize_to_json_file(LoraLinearConfig const &obj, - fs::path const &filepath); -template void serialize_to_json_file( - LoraSGDOptimizerConfig const &obj, fs::path const &filepath); -template void serialize_to_json_file( - LoraAdamOptimizerConfig const &obj, fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file(fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file( - fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file( - fs::path const &filepath); - // ------------------ LoRA configs ------------------- // --------------------------------------------------- const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", ""); From e453237d71518a014bb2966418b38ef662378909 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 4 Oct 2024 23:03:32 +0000 Subject: [PATCH 11/37] backup --- include/flexflow/batch_config.h | 3 +- include/flexflow/ops/lora_linear_params.h | 48 +++++++++++-------- include/flexflow/request_manager.h | 3 ++ src/runtime/request_manager.cc | 56 +++++++++++++++++++++-- 4 files changed, 84 insertions(+), 26 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 29915bf2d9..cb2f8d3a3d 100644 --- 
a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -96,7 +96,6 @@ class BatchConfig { request_guid = 0; prompt_phase = false; batch_config_request_id = -1; - peft_model_id = PEFTModelID::NO_ID; peft_bwd = false; optimizer_tasks = {true, false, false, false}; } @@ -110,7 +109,7 @@ class BatchConfig { bool prompt_phase = false; RequestGuid request_guid; // PEFT fields - PEFTModelID peft_model_id; + std::unordered_map peft_adapters; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 2d8e5360dd..84e76c4cc7 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -132,7 +132,7 @@ class LoraLinearConfig { LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); - void serialize_to_json_file(const std::string& filename) const { + std::string serialize_to_json_string(int indent=-1) const { json j = { {"cache_folder", cache_folder}, {"peft_model_id", peft_model_id}, @@ -147,30 +147,40 @@ class LoraLinearConfig { {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} }; + return j.dump(indent); // No indentation + } + void serialize_to_json_file(const std::string& filename) const { + std::string j = serialize_to_json_string(4); std::ofstream file(filename); - file << j.dump(4); // Use 4 spaces for indentation + file << j; } // Deserialization method - static LoraLinearConfig deserialize_from_json_file(const std::string& filename) { - std::ifstream file(filename); - json j; - file >> j; - LoraLinearConfig metadata( - j["cache_folder"].get(), - j["peft_model_id"].get>(), - j["rank"].get(), - j["lora_alpha"].get(), - j["lora_dropout"].get(), - j["target_modules"].get>(), - j["trainable"].get(), - j["init_lora_weights"].get(), - j["base_model_name_or_path"].get(), - j["precision"].get() + static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) { + json j = json::parse(json_string); + LoraLinearConfig config( + j["cache_folder"].get(), + j["peft_model_id"].get(), + j["trainable"].get(), + nullptr, // optimizer_config will be set later if present + j["init_lora_weights"].get(), + j["base_model_name_or_path"].get(), + j["precision"].get(), + j["rank"].get(), + j["lora_alpha"].get(), + j["lora_dropout"].get(), + j["target_modules"].get>() ); if (!j["optimizer_config"].is_null()) { - metadata.optimizer_config = LoraOptimizerConfig::fromJson(j["optimizer_config"]); + config.setOptimizer(LoraOptimizerConfig::fromJson(j["optimizer_config"])); } - return metadata; + return config; + } + // Deserialization method + static LoraLinearConfig deserialize_from_json_file(const std::string& filename) { + std::ifstream file(filename); + std::string j; + file >> j; + return deserialize_from_json_string(j); } std::string cache_folder; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 36a56012fc..bff0e4d90c 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -149,6 +149,8 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void register_peft_model(FFModel *model, PEFTModelID peft_model_id); + LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void 
appendBitMask(BatchConfig::BitMask &bitmask, @@ -289,6 +291,7 @@ class RequestManager { int max_sequence_length; Status request_manager_status; + std::unordered_map peft_configs; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 44b181fcb3..5e9a724d3f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -255,6 +255,46 @@ size_t RequestManager::get_num_ssms() { return ssm_models.size(); } +void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config) { + // check that peft_model_id is not already in use + assert(peft_configs.find(peft_model_id) == peft_configs.end() && + "PEFT model ID already in use"); + peft_configs[peft_model_id] = peft_config; +} + +LoraLinearConfig const &RequestManager::get_peft_config( + PEFTModelID const &peft_model_id) { + assert(peft_configs.find(peft_model_id) != peft_configs.end() && + "PEFT model ID not found"); + return peft_configs[peft_model_id]; +} + +PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + std::cout << peft_config << std::endl; + assert(false); + } + // go over base_layer_to_peft_layer and check that you can find at least one match + for (int i=0; i 0 && std::string(base_layer.name).find(peft_config.target_modules[0]) != std::string::npos) { + found = true; + break; + } + } + assert(found && "Attempting to add LoRA to a LLM target module that does not exist or does not support LoRA"); + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + RequestManager *rm = RequestManager::get_request_manager(); + rm->register_peft_config(*peft_model_id, peft_config); + return peft_model_id; +} + RequestManager::RequestGuid RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); @@ -730,8 +770,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].peft_model_id = - old_bc.requestsInfo[i].peft_model_id; + // new_bc.requestsInfo[i].peft_model_id = + // old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_adapters = + old_bc.requestsInfo[i].peft_adapters; new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; @@ -800,7 +842,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_length = new_request.max_length; - new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + // new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + if (new_request.peft_model_id != PEFTModelID::NO_ID) { + new_bc.requestsInfo[i].peft_adapters[new_request.peft_model_id] = get_peft_config(new_request.peft_model_id).serialize_to_json_string(); + } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].prompt_phase = true; @@ -967,8 +1012,9 @@ BatchConfig 
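Under this flow an adapter is registered with the runtime after compilation, and its config reaches the workers inside the BatchConfig as a JSON string keyed by PEFTModelID. A hypothetical end-to-end sketch (request and field names taken from this patch):

    // Register an adapter, then attach it to an inference request.
    LoraLinearConfig peft_config("/path/to/cache", "my-org/llama-3-8b-lora");  // illustrative
    PEFTModelID *peft_model_id = model.register_peft_adapter(peft_config);
    Request req;
    req.prompt = "...";
    req.peft_model_id = *peft_model_id;
    // prepare_next_batch() later copies the serialized config into
    // requestsInfo[i].peft_adapters[*peft_model_id] for the workers to deserialize.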
RequestManager::prepare_next_batch(BatchConfig const &old_bc, num_peft_tokens; new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; - new_bc.requestsInfo[inference_batch_size].peft_model_id = - request.peft_model_id; + // new_bc.requestsInfo[inference_batch_size].peft_model_id = + // request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_adapters[request.peft_model_id] = get_peft_config(request.peft_model_id).serialize_to_json_string(); new_bc.requestsInfo[inference_batch_size].peft_bwd = true; set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, From 5c8c4480b8a5b8dd4d41b54709cec703b9ade6fd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 5 Oct 2024 04:38:28 +0000 Subject: [PATCH 12/37] . --- include/flexflow/model.h | 2 +- include/flexflow/ops/lora_linear_params.h | 2 +- include/flexflow/request_manager.h | 9 ++++- src/ops/lora_linear.cc | 41 ++++++++++++++++++----- src/runtime/request_manager.cc | 31 ++++++++++++++++- 5 files changed, 72 insertions(+), 13 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 5ac91d5b81..d1dbe72d7c 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -846,7 +846,7 @@ class FFModel { // PEFT Layers // ======================================== // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); - void add_lora_layers(std::vector target_modules, int max_rank, int max_concurrent_adapters); + void add_lora_layers(std::vector target_modules); // ======================================== // Inference APIs // ======================================== diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 84e76c4cc7..c5a327459f 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -208,7 +208,7 @@ class LoraLinearConfig { class LoraLinearParams { public: LayerID layer_guid; - OperatorType type; + // OperatorType type; // std::unordered_map peft_configs; int max_rank; int max_concurrent_adapters; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index bff0e4d90c..fcb09f15ed 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -151,6 +151,10 @@ class RequestManager { void register_output_filepath(std::string const &); void register_peft_model(FFModel *model, PEFTModelID peft_model_id); LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); + void set_max_lora_rank(int max_lora_rank); + void set_max_concurrent_adapters(int max_concurrent_adapters); + int get_max_lora_rank(); + int get_max_concurrent_adapters(); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void appendBitMask(BatchConfig::BitMask &bitmask, @@ -290,8 +294,11 @@ class RequestManager { int max_spec_tree_token_num; int max_sequence_length; Status request_manager_status; - + + // peft std::unordered_map peft_configs; + int max_lora_rank; + int max_concurrent_adapters; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index f4c1ba9c35..1ba11ed75e 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -51,10 +51,13 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -void 
FFmodel::add_lora_layers(std::vector target_modules, int max_rank, int max_concurrent_adapters) { +void FFmodel::add_lora_layers(std::vector target_modules) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_modules.size() > 0 && "LoRA target module name is empty"); - assrt(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); + RequestManager *rm = RequestManager::get_request_manager(); + int max_lora_rank = rm->get_max_lora_rank(); + int max_concurrent_adapters = rm->get_max_concurrent_adapters(); + assert(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); for (std::string target_module_name : target_modules) { @@ -1197,14 +1200,17 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && - lhs.peft_configs.size() == rhs.peft_configs.size()) { + if (lhs.layer_guid == rhs.layer_guid && lhs.max_rank == rhs.max_rank && + lhs.max_concurrent_adapters == rhs.max_concurrent_adapters && + strcmp(lhs.name, rhs.name) == 0) { +#ifdef DEADCODE for (auto const &kv : lhs.peft_configs) { auto it = rhs.peft_configs.find(kv.first); if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { return false; } } +#endif return true; } return false; @@ -1243,6 +1249,9 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(this->max_rank); + sez.serialize(this->max_concurrent_adapters); +#ifdef DEADCODE sez.serialize(this->op_type); sez.serialize(this->peft_configs.size()); for (auto const &kv : this->peft_configs) { @@ -1285,6 +1294,7 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { } } } +#endif sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1297,8 +1307,9 @@ Node LoraLinear::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; - OperatorType op_type; - size_t num_pefts; + int max_rank, max_concurrent_adapters; + // OperatorType op_type; + // size_t num_pefts; size_t name_len; char name[MAX_OPNAME] = {0}; @@ -1307,6 +1318,9 @@ Node LoraLinear::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + dez.deserialize(max_rank); + dez.deserialize(max_concurrent_adapters); +#ifdef DEADCODE dez.deserialize(op_type); dez.deserialize(num_pefts); for (int i = 0; i < num_pefts; i++) { @@ -1357,12 +1371,15 @@ Node LoraLinear::deserialize(FFModel &ff, params.peft_configs.emplace( std::make_pair(peft_model_id, *lora_linear_config)); } +#endif dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); params.layer_guid = layer_guid; - params.type = op_type; + // params.type = op_type; + params.max_rank = max_rank; + params.max_concurrent_adapters = max_concurrent_adapters; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -1377,11 +1394,13 @@ Op *LoraLinear::materialize(FFModel &ff, LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; - params.type = this->op_type; + params.max_rank = this->max_rank; + 
params.max_concurrent_adapters = this->max_concurrent_adapters; + // params.type = this->op_type; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } - params.peft_configs = this->peft_configs; + // params.peft_configs = this->peft_configs; return params; } @@ -1400,6 +1419,9 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); + hash_combine(key, params.max_rank); + hash_combine(key, params.max_concurrent_adapters); +#ifdef DEADCODE for (auto const &kv : params.peft_configs) { hash_combine(key, kv.first.id); hash_combine(key, kv.second.rank); @@ -1411,6 +1433,7 @@ size_t hash::operator()( hash_combine(key, kv.second.target_modules); hash_combine(key, kv.second.init_lora_weights); } +#endif return key; } }; // namespace std diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5e9a724d3f..79fcdfdcfe 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -270,6 +270,20 @@ LoraLinearConfig const &RequestManager::get_peft_config( return peft_configs[peft_model_id]; } +void RequestManager::set_max_lora_rank(int max_lora_rank_) { + max_lora_rank = max_lora_rank_; +} + +void RequestManager::set_max_concurrent_adapters(int max_concurrent_adapters_) { + max_concurrent_adapters = max_concurrent_adapters_; +} + +int RequestManager::get_max_lora_rank() { return max_lora_rank; } + +int RequestManager::get_max_concurrent_adapters() { + return max_concurrent_adapters; +} + PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); @@ -679,6 +693,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int inference_batch_size = BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + int num_concurrent_adapters = 0; + // Step 2: prepare the next batch for existing inference requests BatchConfig new_bc; for (int i = 0; i < inference_batch_size; i++) { @@ -774,6 +790,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // old_bc.requestsInfo[i].peft_model_id; new_bc.requestsInfo[i].peft_adapters = old_bc.requestsInfo[i].peft_adapters; + num_concurrent_adapters += new_bc.requestsInfo[i].peft_adapters.size(); new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; @@ -825,6 +842,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } new_bc.num_generation_tokens = num_generation_tokens; + assert(num_concurrent_adapters <= get_max_concurrent_adapters() && + "Number of concurrent adapters exceeded the limit"); + // Step 3: add new inference requests to the next batch if there is space for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { @@ -832,6 +852,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); assert(new_request.req_type == RequestType::REQ_INFERENCE); + + // if the request has peft adapters and we are at capacity, don't add it yet + if (new_request.peft_model_id != PEFTModelID::NO_ID && num_concurrent_adapters == get_max_concurrent_adapters()) { + break; + } + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ 
-1000,7 +1026,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); assert(num_peft_label_tokens == 0); - if (num_peft_tokens > 0) { + if (num_peft_tokens > 0 && num_concurrent_adapters < get_max_concurrent_adapters()) { assert(new_bc.request_completed[inference_batch_size]); // request info new_bc.request_completed[inference_batch_size] = false; @@ -1033,8 +1059,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens++; new_bc.num_peft_tokens++; } + num_concurrent_adapters +=1; } } + assert(num_concurrent_adapters <= get_max_concurrent_adapters() && + "Number of concurrent adapters exceeded the limit"); return new_bc; } From 21f8cb97e768d3dab074e8b8070d7322ebdb3a9c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 5 Oct 2024 20:32:27 +0000 Subject: [PATCH 13/37] . --- .../ops/kernels/lora_linear_kernels.h | 13 +- .../flexflow/utils/peft_weight_allocator.h | 120 ++++---- src/ops/kernels/lora_linear_kernels.cu | 101 +++---- src/ops/lora_linear.cc | 57 +--- src/runtime/peft_weight_allocator.cc | 263 ++++++++++++++++++ src/runtime/peft_weight_allocator.cu | 70 +++++ 6 files changed, 447 insertions(+), 177 deletions(-) create mode 100644 src/runtime/peft_weight_allocator.cc create mode 100644 src/runtime/peft_weight_allocator.cu diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 2fde38728a..55ca34ff7d 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -9,16 +9,7 @@ namespace FlexFlow { -struct LoraLinearWeight { - // weights - void *w0_ptr, *w1_ptr; - // gradients - void *w0_grad_ptr, *w1_grad_ptr; - // v values for SGD optimizer (when using momentum) - void *w0_v_values_ptr, *w1_v_values_ptr; - int in_dim, out_dim, rank, num_shards; -}; - +#ifdef DEADCODE struct LoraLinearModelState { LoraLinearWeight weights; LoraOptimizerConfig const *optimizer_config; @@ -27,6 +18,7 @@ struct LoraLinearModelState { // Huggingface model ID (for download and/or upload) std::string peft_model_id; }; +#endif class LoraLinearMeta : public OpMeta { public: @@ -35,6 +27,7 @@ class LoraLinearMeta : public OpMeta { // PEFT related fields void *low_rank_activation; void *input_activation; + std::unordeded_map model_state; // std::unordered_map model_state; // size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; PEFTMemoryManager *peft_memory_manager; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 7c1bd01ea5..5235ac9f38 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -17,7 +17,8 @@ #define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ #include "flexflow/config.h" -#include +#include "lora_linear_params.h" +// #include namespace FlexFlow { @@ -89,84 +90,73 @@ class PEFTWeightAllocator { }; #endif +struct LoraLinearWeight { + // weights + void *w0_ptr, *w1_ptr; + // gradients + void *w0_grad_ptr, *w1_grad_ptr; + // v values for SGD optimizer (when using momentum) + void *w0_v_values_ptr, *w1_v_values_ptr; + // int in_dim, out_dim, rank, num_shards; + LoraLinearWeight(void *w0=nullptr, void *w1=nullptr, void *w0_grad=nullptr, void *w1_grad=nullptr, + void *w0_v_values=nullptr, void *w1_v_values=nullptr) + : w0_ptr(w0), w1_ptr(w1), + w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), + 
w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values) {} +}; + class PEFTMemoryManager { public: - PEFTMemoryManager(size_t max_lora_size_, int max_concurrent_adapters_) - : max_concurrent_adapters(max_concurrent_adapters_), max_lora_size(max_lora_size_), base_ptr(nullptr) {} + PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) + : gpu_mem(gpu_mem_), + max_concurrent_adapters(max_concurrent_adapters_), + max_lora_size(max_lora_size_), + in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_), + lora_layername_substr(lora_layername_substr_), dt(dt_), + base_ptr(nullptr), + finetuning_ptr(nullptr), + finetuning_model_id(PEFTModelID::NO_ID) { + + assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0"); + assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0"); + allocate_inference_memory(); + // finetuning memory is allocated upon the first finetuning request, so we can skip for inference-only workloads + } // allocate memory for all the PEFT adapters for a given layer on a given shard - void allocate_inference_memory(Memory gpu_mem) { - // allocate chunk of memory for all the PEFT adapters - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(max_lora_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(peftLegionInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); - } - void allocate_finetuning_memory(Memory gpu_mem) { + void allocate_inference_memory(); + // allocate memory for the PEFT adapter for a finetuning request for a given layer and shard + void allocate_finetuning_memory(); - } + // Check if the PEFT adapter for the given model is in memory. If not, sets the cache_miss flag to true. If this is the first finetuning request, allocate memory for the finetuning adapter. + void get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss); // Returns the slot in memory where the peft model weights are/will be stored. // If the model is not in memory (cache miss), set the cache_miss flag to true. 
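+ // Usage sketch (illustrative only, assuming the declarations above): the
+ // inference pool is one contiguous allocation of max_concurrent_adapters
+ // fixed-size slots,
+ //   base_ptr -> |-- slot 0 --|-- slot 1 --| ... |-- slot N-1 --|
+ // each holding max_lora_size bytes (LoRA_A followed by LoRA_B for one
+ // adapter). A caller resolves a slot before launching the LoRA kernels:
+ //   bool cache_miss = false;
+ //   int slot = m->peft_memory_manager->get_inference_peft_slot(model_id, &cache_miss);
+ //   void *w0 = static_cast<char *>(base_ptr) + slot * max_lora_size;
+ //   if (cache_miss) { /* adapter weights are loaded into this slot,
+ //                        evicting the least-recently-used adapter if the pool is full */ }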
- void *get_peft_model_handle(PEFTModelID const &model_id, bool *cache_miss) { - assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); - assert(lru_hashtable.size() == lru_list.size() && - lru_list.size() == peft2mem_slot.size() && - "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out of sync"); - // check for cache hit - if (lru_hashtable.find(model_id) != lru_hashtable.end()) { - int lru_list_index = lru_hashtable[model_id]; - assert(lru_list[lru_list_index] == model_id && - "PEFT Memory Manager LRU hashtable/list are out of sync"); - // move the model to the end of the LRU list - lru_list.erase(lru_list.begin() + lru_list_index); - lru_list.push_back(model_id); - // update the LRU hashtable - lru_hashtable[model_id] = lru_list.size() - 1; - // get memory slot - assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); - *cache_miss = false; - } else { - // cache miss - // check if you need to evict - bool need_to_evict = lru_list.size() == max_concurrent_adapters; - int mem_slot = -1; - if (need_to_evict) { - // evict the least recently used model - PEFTModelID lru_model_id = lru_list[0]; - lru_list.erase(lru_list.begin()); - lru_hashtable.erase(lru_model_id); - mem_slot = peft2mem_slot[lru_model_id]; - peft2mem_slot.erase(lru_model_id); - } else { - mem_slot = lru_list.size(); - } - // update the LRU list and hashtable - lru_list.push_back(model_id); - lru_hashtable[model_id] = lru_list.size() - 1; - // update the memory slot - peft2mem_slot[model_id] = mem_slot; - *cache_miss = true; - } - return static_cast(base_ptr) + peft2mem_slot[model_id]*max_lora_size; - } + int get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss); + + void load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config); + + LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + + // Legion memory management apparatus + Memory gpu_mem; + Realm::RegionInstance peftLegionInst; + void *base_ptr, *finetuning_ptr; + // Size and shapes int max_concurrent_adapters; size_t max_lora_size; - Realm::RegionInstance peftLegionInst; - void *base_ptr; void *finetuning_ptr; + int in_dim, out_dim, num_shards, shard_id; + // LRU cache apparatus std::unordered_map lru_hashtable; std::vector lru_list; // head = least recently used, tail=most recently used std::unordered_map peft2mem_slot; + // Miscellanea + std::string lora_layername_substr; + DataType dt; + PEFTModelID finetuning_model_id; } }; // namespace FlexFlow diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 93e5820f9c..0bb5cb64fc 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -147,56 +147,8 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, namespace Internal { -template -void init_kernel(LoraLinearMeta *m, int seed, cudaStream_t stream) { - // Initialize generator - std::mt19937 gen(seed); - - // Get handle to weights by iterating over m->model_state to get each - // LoraLinearWeight object - for (auto &model_state : m->model_state) { - LoraLinearWeight weight = model_state.second.weights; - int w0_num_elements = weight.rank * weight.in_dim; - int w1_num_elements = weight.rank * weight.out_dim; - - // LoRA_A weight: [in_dim, rank] - float stdv_lora_a = 1.0f / sqrt(weight.in_dim); - 
std::uniform_real_distribution<float> dis_lora_a(-stdv_lora_a, stdv_lora_a); - std::vector<DT>
lora_a_random_init(w0_num_elements); - for (auto &num : lora_a_random_init) { - float num_float = dis_lora_a(gen); - if (std::is_same<DT, half>::value) { - num = __float2half(num_float); - } else { - num = num_float; - } - } - checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w0_ptr), - lora_a_random_init.data(), - w0_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); - - // LoRA_B weight: [rank, out_dim] - float stdv_lora_b = 1.0f / sqrt(weight.rank); - std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); - std::vector<DT> lora_b_random_init(w1_num_elements); - for (auto &num : lora_b_random_init) { - float num_float = dis_lora_b(gen); - if (std::is_same<DT, half>::value) { - num = __float2half(num_float); - } else { - num = num_float; - } - } - checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), - lora_b_random_init.data(), - w1_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); - } -} +#ifdef DEADCODE template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -335,6 +287,57 @@ void inference_kernel(LoraLinearMeta *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } +#endif + +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + int num_shards, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + int num_peft_requests = 0; + for (int i=0; i< bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + LoraLinearConfig deserialized_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + if (!lora_applies_to_this_layer(m, deserialized_config)) { + continue; + } + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); + bool cache_miss; + void *peft_slot; + if (!lora_config.trainable) { + peft_slot = m->peft_memory_manager->get_peft_model_handle(bc->requestsInfo[i].peft_model_id, &cache_miss); + } else { + peft_slot = m->peft_memory_manager->get_finetuning_handle(bc->requestsInfo[i].peft_model_id, &cache_miss); + } + if (cache_miss) { + // load model into memory + load_peft_model(m, peft_slot, deserialized_config, in_dim, out_dim, num_shards); + } + } +} template __global__ void sgd_update(size_t count, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 1ba11ed75e..a18f47c4ac 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -407,56 +407,6 @@ void LoraLinear::init_inference( set_opmeta_from_futuremap_inference(ff, fm, output_tensor); } -template -void load_peft_from_file(DT *ptr, - size_t num_rows, - size_t num_columns, - int num_shards, - int shard_id, - std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - printf("Could not open file: %s\n", filepath.c_str()); - } - assert(in.good() && "incorrect weight file path"); - - // HuggingFace dims (serialized in row-major order) - // lora_A: [rank, intermediate_dim] - // lora_B: [hidden_dim, rank] - // FlexFlow dims (serialized in column-major order) - // lora_A: [intermediate_dim, rank] - // lora_B: [rank, out_dim] - // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B - assert(num_rows % num_shards == 0); - size_t chunk_size = num_rows / num_shards; - size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0; - - // Allocate memory for the weight shard - std::vector
host_array(chunk_size * num_columns); - // Read the chunk - size_t total_size_read = 0; - for (int i = 0; i < num_columns; ++i) { - in.seekg((i * num_rows + offset) * sizeof(DT)); - in.read(reinterpret_cast(host_array.data() + i * chunk_size), - chunk_size * sizeof(DT)); - total_size_read += in.gcount(); - } - // Check weight shard size - size_t expected_data_size = chunk_size * num_columns * sizeof(DT); - if (total_size_read != expected_data_size) { - printf("load weight data error: expected %lu bytes, got: %lu bytes, data " - "size: %lu\n", - expected_data_size, - total_size_read, - sizeof(DT)); - assert(false); - } - assert(host_array.size() == chunk_size * num_columns); - // Copy weight to device memory - copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); - in.close(); -} - /* regions[0](O): output regions[1](I): kernel @@ -524,13 +474,13 @@ OpMeta *LoraLinear::init_task(Task const *task, // allocate space for lora weights size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); - m->peft_memory_manager = new PEFTMemoryManager(max_lora_size, lora->max_concurrent_adapters); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - m->peft_memory_manager->allocate_inference_memory(gpu_mem); - + m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, max_lora_size, lora->max_concurrent_adapters, in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); + m->peft_memory_manager->allocate_inference_memory(); return m; } +#ifdef DEADCODE void load_peft_adapters(BatchConfig const *bc){ for (auto const &kv : bc->peft_configs) { PEFTModelID const &model_id = kv.first; @@ -689,6 +639,7 @@ void load_peft_adapters(BatchConfig const *bc){ m->model_state[model_id].peft_model_id = lora_config.peft_model_id; } } +#endif void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc new file mode 100644 index 0000000000..ab0e1ccd21 --- /dev/null +++ b/src/runtime/peft_weight_allocator.cc @@ -0,0 +1,263 @@ +#include "peft_weight_allocator.h" + +namespace FlexFlow { + +void PEFTMemoryManager::allocate_inference_memory() { + // allocate chunk of memory for all the PEFT adapters + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); +} + +void PEFTMemoryManager::allocate_finetuning_memory() { + size_t ft_size = max_lora_size*3; // weights, gradients, momentum values + // allocate chunk of memory for PEFT adapter + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(ft_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + finetuning_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); +} + +void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss) { + if (finetuning_ptr == nullptr) { + allocate_finetuning_memory(); + } + assert(finetuning_ptr != nullptr && "PEFT Memory Manager finetuning_ptr is null"); + *cache_miss = (model_id.id 
!= finetuning_model_id.id); +} + +int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss) { + assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); + assert(lru_hashtable.size() == lru_list.size() && + lru_list.size() == peft2mem_slot.size() && + "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out of sync"); + // check for cache hit + if (lru_hashtable.find(model_id) != lru_hashtable.end()) { + int lru_list_index = lru_hashtable[model_id]; + assert(lru_list[lru_list_index] == model_id && + "PEFT Memory Manager LRU hashtable/list are out of sync"); + // move the model to the end of the LRU list + lru_list.erase(lru_list.begin() + lru_list_index); + lru_list.push_back(model_id); + // update the LRU hashtable + lru_hashtable[model_id] = lru_list.size() - 1; + // get memory slot + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); + *cache_miss = false; + } else { + // cache miss + // check if you need to evict + bool need_to_evict = lru_list.size() == max_concurrent_adapters; + int mem_slot = -1; + if (need_to_evict) { + // evict the least recently used model + PEFTModelID lru_model_id = lru_list[0]; + lru_list.erase(lru_list.begin()); + lru_hashtable.erase(lru_model_id); + mem_slot = peft2mem_slot[lru_model_id]; + peft2mem_slot.erase(lru_model_id); + } else { + mem_slot = lru_list.size(); + } + // update the LRU list and hashtable + lru_list.push_back(model_id); + lru_hashtable[model_id] = lru_list.size() - 1; + // update the memory slot + peft2mem_slot[model_id] = mem_slot; + *cache_miss = true; + } + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); + int slot = peft2mem_slot[model_id]; + assert(slot >= 0 && slot < max_concurrent_adapters && "PEFT Memory Manager peft2mem_slot is out of bounds"); + return slot; +} + +template +void load_peft_from_file(DT *ptr, + size_t num_rows, + size_t num_columns, + int num_shards, + int shard_id, + std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + + // HuggingFace dims (serialized in row-major order) + // lora_A: [rank, intermediate_dim] + // lora_B: [hidden_dim, rank] + // FlexFlow dims (serialized in column-major order) + // lora_A: [intermediate_dim, rank] + // lora_B: [rank, out_dim] + // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B + assert(num_rows % num_shards == 0); + size_t chunk_size = num_rows / num_shards; + size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0; + + // Allocate memory for the weight shard + std::vector
host_array(chunk_size * num_columns); + // Read the chunk + size_t total_size_read = 0; + for (int i = 0; i < num_columns; ++i) { + in.seekg((i * num_rows + offset) * sizeof(DT)); + in.read(reinterpret_cast(host_array.data() + i * chunk_size), + chunk_size * sizeof(DT)); + total_size_read += in.gcount(); + } + // Check weight shard size + size_t expected_data_size = chunk_size * num_columns * sizeof(DT); + if (total_size_read != expected_data_size) { + printf("load weight data error: expected %lu bytes, got: %lu bytes, data " + "size: %lu\n", + expected_data_size, + total_size_read, + sizeof(DT)); + assert(false); + } + assert(host_array.size() == chunk_size * num_columns); + // Copy weight to device memory + copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); + in.close(); +} + +void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config) { + // Load weights + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr "PEFT Memory Manager weight ptr null"); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. + int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = lora_config.rank; + int lora_B_num_rows = lora_config.rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + if (lora_config.init_lora_weights) { + // initialize weights randomly + int seed = 0; + init_peft_weight_wrapper(weight, in_dim, out_dim, lora_config.rank, dt, seed); + } else { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? 
"full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else { + assert(false && "Data type not supported"); + } + } +} + +LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + int mem_slot = get_inference_peft_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; + result.w1_ptr = result.w0_ptr + w0_num_elements * data_size; + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; +} + +LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss = get_finetuning_slot(model_id); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = finetuning_ptr; + result.w1_ptr = result.w0_ptr + w0_num_elements*data_size; + result.w0_grad_ptr = result.w1_ptr + w1_num_elements*data_size; + result.w1_grad_ptr = result.w0_grad_ptr + w0_num_elements*data_size; + result.w0_v_values_ptr = result.w1_grad_ptr + w1_num_elements*data_size; + result.w1_v_values_ptr = result.w0_v_values_ptr + w0_num_elements*data_size; + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu new file mode 
100644 index 0000000000..cc8d095069 --- /dev/null +++ b/src/runtime/peft_weight_allocator.cu @@ -0,0 +1,70 @@ + + +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/utils/peft_weight_allocator.h" +#include "flexflow/utils/cuda_helper.h" +#include <random> +#include <vector> +namespace FlexFlow { + +template <typename DT> +void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Randomly initialize the LoRA_A and LoRA_B weights for a single adapter + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(in_dim); + std::uniform_real_distribution<float> dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector<DT>
lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(rank); + std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector<DT> lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); +} + +void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (dt == DT_FLOAT) { + Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + } else if (dt == DT_HALF) { + Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +} // namespace FlexFlow \ No newline at end of file From c5e813bea5e15934a4fbb77e4f0561d87dc3dd8a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 5 Oct 2024 21:13:18 +0000 Subject: [PATCH 14/37] . --- .../ops/kernels/lora_linear_kernels.h | 6 +- .../flexflow/utils/peft_weight_allocator.h | 20 +++-- src/ops/kernels/lora_linear_kernels.cu | 80 ++++++++++++++++--- src/runtime/peft_weight_allocator.cc | 11 +++ 4 files changed, 95 insertions(+), 22 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 55ca34ff7d..eef3b392b3 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -25,9 +25,9 @@ class LoraLinearMeta : public OpMeta { LoraLinearMeta(FFHandler handle, LoraLinear const *li); ~LoraLinearMeta(void); // PEFT related fields - void *low_rank_activation; - void *input_activation; - std::unordeded_map model_state; + // void *low_rank_activation; + // void *input_activation; + // std::unordeded_map model_state; // std::unordered_map model_state; // size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; PEFTMemoryManager *peft_memory_manager; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 5235ac9f38..19b987a728 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -95,23 +95,27 @@ struct LoraLinearWeight { void *w0_ptr, *w1_ptr; // gradients void *w0_grad_ptr, *w1_grad_ptr; + // activations + void *input_activation; + void *low_rank_activation; // v values for SGD optimizer (when using momentum) void *w0_v_values_ptr, *w1_v_values_ptr; - // int in_dim, out_dim, rank, num_shards; LoraLinearWeight(void *w0=nullptr, void *w1=nullptr, void *w0_grad=nullptr, void *w1_grad=nullptr, - void *w0_v_values=nullptr, void *w1_v_values=nullptr) + void *w0_v_values=nullptr, void *w1_v_values=nullptr, void *low_rank_activation_=nullptr, void *input_activation_=nullptr) : w0_ptr(w0), w1_ptr(w1), w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), - w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values) {} + w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values), + low_rank_activation(low_rank_activation_), input_activation(input_activation_) {} }; class PEFTMemoryManager { public: - PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) + PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), max_lora_size(max_lora_size_), in_dim(in_dim_), out_dim(out_dim_), 
num_shards(num_shards_), shard_id(shard_id_), + max_peft_tokens(max_peft_tokens_), lora_layername_substr(lora_layername_substr_), dt(dt_), base_ptr(nullptr), finetuning_ptr(nullptr), @@ -128,17 +132,16 @@ class PEFTMemoryManager { // allocate memory for the PEFT adapter for a finetuning request for a given layer and shard void allocate_finetuning_memory(); + LoraLinearWeight get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + +private: // Check if the PEFT adapter for the given model is in memory. If not, sets the cache_miss flag to true. If this is the first finetuning request, allocate memory for the finetuning adapter. void get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss); - // Returns the slot in memory where the peft model weights are/will be stored. // If the model is not in memory (cache miss), set the cache_miss flag to true. int get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss); - void load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config); - LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); - LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); // Legion memory management apparatus @@ -149,6 +152,7 @@ class PEFTMemoryManager { int max_concurrent_adapters; size_t max_lora_size; int in_dim, out_dim, num_shards, shard_id; + int max_peft_tokens; // LRU cache apparatus std::unordered_map lru_hashtable; std::vector lru_list; // head = least recently used, tail=most recently used diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 0bb5cb64fc..eab8b30227 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -311,6 +311,12 @@ void inference_kernel(LoraLinearMeta *m, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); + cudaDataType_t lr_actv_type = output_type; + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t compute_type = output_type; int num_peft_requests = 0; for (int i=0; i< bc->max_requests_per_batch(); i++) { @@ -320,22 +326,74 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig deserialized_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); - if (!lora_applies_to_this_layer(m, deserialized_config)) { + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); - bool cache_miss; - void *peft_slot; - if (!lora_config.trainable) { - peft_slot = m->peft_memory_manager->get_peft_model_handle(bc->requestsInfo[i].peft_model_id, &cache_miss); + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int max_peft_tokens = bc->requestsInfo[i].max_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); + void 
*intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) ? weight.low_rank_activation : m->handle.workSpace; + if (bc->requestsInfo[i].peft_bwd) { + checkCUDA(cudaMemcpyAsync(weight.input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); } else { - peft_slot = m->peft_memory_manager->get_finetuning_handle(bc->requestsInfo[i].peft_model_id, &cache_miss); - } - if (cache_miss) { - // load model into memory - load_peft_model(m, peft_slot, deserialized_config, in_dim, out_dim, num_shards); + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * lora_config.rank); } + DT alpha = 1.0f, beta = 0.0f; + // buffer = weight_first * input + // [rank, num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + lora_config.rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + lora_config.rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + lora_config.rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + lora_config.rank, + intermediate_result_ptr, + lr_actv_type, + lora_config.rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index ab0e1ccd21..83fa66aa15 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -21,6 +21,7 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size = max_lora_size*3; // weights, gradients, momentum values + ft_size += max_peft_tokens*(in_dim+rank); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), @@ -254,10 +255,20 @@ LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model result.w1_grad_ptr = result.w0_grad_ptr + w0_num_elements*data_size; result.w0_v_values_ptr = result.w1_grad_ptr + w1_num_elements*data_size; result.w1_v_values_ptr = result.w0_v_values_ptr + w0_num_elements*data_size; + result.input_activation = result.w1_v_values_ptr + w1_num_elements*data_size; // max_peft_tokens*in_dim + result.low_rank_activation = result.input_activation + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank if (cache_miss) { load_peft_model(result, lora_config); } return result; } +LoraLinearWeight PEFTMemoryManager::get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + if (lora_config.trainable) { + return get_finetuning_peft(model_id, lora_config); + } else { + return get_inference_peft(model_id, lora_config); + } +} + }; // namespace FlexFlow \ No newline at end of file From aa57f9807401adf05b03713918bf2be3a4cb4396 Mon Sep 17 00:00:00 2001 From: 
Gabriele Oliaro Date: Sat, 5 Oct 2024 21:30:36 +0000 Subject: [PATCH 15/37] . --- .../flexflow/utils/peft_weight_allocator.h | 1 + src/ops/kernels/lora_linear_kernels.cu | 82 ++++++++----------- src/runtime/peft_weight_allocator.cc | 4 + 3 files changed, 38 insertions(+), 49 deletions(-) diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 19b987a728..3c9efc0812 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -133,6 +133,7 @@ class PEFTMemoryManager { void allocate_finetuning_memory(); LoraLinearWeight get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + void check_ft_model_id(PEFTModelID const &model_id); private: // Check if the PEFT adapter for the given model is in memory. If not, sets the cache_miss flag to true. If this is the first finetuning request, allocate memory for the finetuning adapter. diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index eab8b30227..d5baf49cdc 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -395,6 +395,7 @@ void inference_kernel(LoraLinearMeta *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } + assert(num_peft_requests <= 1); } template @@ -437,39 +438,24 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; cudaDataType_t compute_type = output_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = output_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // Skip completed, non-PEFT and PEFT forward-only requests + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) { continue; } - // Skip PEFT forward-only requests - if (!bc->requestsInfo[i].peft_bwd) { + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); + m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); - LoraLinearWeight weight = - m->model_state[bc->requestsInfo[i].peft_model_id].weights; - int rank = weight.rank; - float lora_alpha = - m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; - DT scaling_constant = (DT)(lora_alpha / rank); + // int max_peft_tokens = bc->requestsInfo[i].max_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + 
LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); // Compute LORA_B weight's gradient if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { @@ -480,20 +466,20 @@ void peft_bwd_kernel(LoraLinearMeta *m, checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, - rank, + lora_config.rank, out_dim, num_peft_tokens, &scaling_constant, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, output_grad_ptr, output_type, out_dim, &beta, weight.w1_grad_ptr, weight_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } @@ -505,20 +491,20 @@ void peft_bwd_kernel(LoraLinearMeta *m, checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_N, - rank, + lora_config.rank, num_peft_tokens, out_dim, &scaling_constant, weight.w1_ptr, weight_type, - rank, + lora_config.rank, output_grad_ptr, output_type, out_dim, &beta, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } @@ -533,15 +519,15 @@ void peft_bwd_kernel(LoraLinearMeta *m, CUBLAS_OP_N, CUBLAS_OP_T, in_dim, - rank, + lora_config.rank, num_peft_tokens, &alpha, - m->input_activation, + weight.input_activation, input_type, in_dim, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, &beta, weight.w0_grad_ptr, weight_type, @@ -559,14 +545,14 @@ void peft_bwd_kernel(LoraLinearMeta *m, CUBLAS_OP_N, in_dim, num_peft_tokens, - rank, + lora_config.rank, &alpha, weight.w0_ptr, weight_type, in_dim, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, &beta, input_grad_ptr, input_type, @@ -576,15 +562,13 @@ void peft_bwd_kernel(LoraLinearMeta *m, } if (bc->requestsInfo[i].optimizer_tasks.update_weights) { - LoraOptimizerConfig const *optimizer_config = - m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + LoraOptimizerConfig const *optimizer_config = lora_config.optimizer_config; assert(optimizer_config != nullptr); - assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; // Get optimizer config - if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + if (optimizer_config->getType() == "SGD") { LoraSGDOptimizerConfig const *sgd_config = (LoraSGDOptimizerConfig const *)optimizer_config; // LoRA_A weight is split in tensor parallelism, so no need to apply @@ -625,7 +609,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, static_cast
(weight.w1_grad_ptr), static_cast
(weight.w1_v_values_ptr), static_cast
(weight.w1_ptr)); - } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + } else if (optimizer_config->getType() == "Adam") { assert(false && "Adam optimizer type not implemented yet"); } else { assert(false && "Unsupported optimizer type"); diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 83fa66aa15..cc40d666ed 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -271,4 +271,8 @@ LoraLinearWeight PEFTMemoryManager::get_peft(PEFTModelID const &model_id, LoraLi } } +void PEFTMemoryManager::check_ft_model_id(PEFTModelID const &model_id) { + assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); +} + }; // namespace FlexFlow \ No newline at end of file From 53c408c3111e43bd0bbe084c4310df5a5ed1c1b1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 5 Oct 2024 22:33:49 +0000 Subject: [PATCH 16/37] frontend --- include/flexflow/flexflow_c.h | 8 ++++++-- include/flexflow/request_manager.h | 1 - inference/models/falcon.cc | 7 +++++++ inference/models/llama.cc | 10 +++++++--- inference/models/mpt.cc | 8 ++++++++ inference/models/opt.cc | 7 +++++++ inference/models/starcoder.cc | 7 +++++++ python/flexflow/core/flexflow_cffi.py | 12 ++++++++++-- python/flexflow/serve/models/falcon.py | 4 ++++ python/flexflow/serve/models/llama.py | 4 ++++ python/flexflow/serve/models/mpt.py | 4 ++++ python/flexflow/serve/models/opt.py | 4 ++++ python/flexflow/serve/models/starcoder.py | 4 ++++ python/flexflow/serve/serve.py | 13 +++++++------ src/c/flexflow_c.cc | 23 ++++++++++++++++++++--- 15 files changed, 99 insertions(+), 17 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 5aa2fdd551..19b2bc7c83 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -91,6 +91,8 @@ int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_); int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_); +bool flexflow_config_get_enable_peft(flexflow_config_t handle_); + void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_, int value); @@ -598,8 +600,10 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); -flexflow_peft_model_id_t flexflow_model_add_lora_layer( - flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); +void flexflow_model_add_lora_layers(flexflow_model_t handle_, int num_target_modules, char const **target_modules_); + + +flexflow_peft_model_id_t flexflow_model_register_peft_adapter(flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index fcb09f15ed..542deb336d 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -149,7 +149,6 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); - void register_peft_model(FFModel *model, PEFTModelID peft_model_id); LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 195d6ba7e3..945c55f296 100644 --- a/inference/models/falcon.cc +++ 
b/inference/models/falcon.cc @@ -242,6 +242,13 @@ void FALCON::create_falcon_model(FFModel &ff, output = ff.argmax(lm_head, /*beam_Search*/ false); } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"dense_h_to_4h", "dense_4h_to_h"}; + ff.add_lora_layers(); + } + FileDataLoader *fileloader = new FileDataLoader("", weight_file_path, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index cf26194597..6a70620942 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -226,9 +226,6 @@ void LLAMA::create_llama_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("down_proj"), std::string("layers." + - // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -273,6 +270,13 @@ void LLAMA::create_llama_model(FFModel &ff, } } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"gate_proj", "up_proj", "down_proj"}; + ff.add_lora_layers(); + } + FileDataLoader *fileloader = new FileDataLoader( "", weight_file_path, diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index e4a7e0056d..6946ed18c3 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -250,6 +250,14 @@ void MPT::create_mpt_model(FFModel &ff, } else { output = ff.argmax(lm_head, /*beam_Search*/ false); } + + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"up_proj", "down_proj"}; + ff.add_lora_layers(); + } + FileDataLoader *fileloader = new FileDataLoader("", weight_file_path, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index b3f2ef4e17..b78dafbe95 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -262,6 +262,13 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ false); } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"fc1", "fc2"}; + ff.add_lora_layers(); + } + FileDataLoader *fileloader = new FileDataLoader( "", weight_file_path, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index cd8bf3a9a7..3da1e82a79 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -224,6 +224,13 @@ void STARCODER::create_starcoder_model( } } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"c_fc", "c_proj"}; + ff.add_lora_layers(); + } + InferenceManager *im = InferenceManager::get_inference_manager(); FileDataLoader *fileloader = new FileDataLoader( "", diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index ec07ee9a5f..5a16fbc34f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -810,6 +810,10 @@ def pipeline_parallelism_degree(self, value): @property def python_data_loader_type(self): return ffc().flexflow_config_get_python_data_loader_type(self.handle) + + @property + def enable_peft(self): + return ffc().flexflow_config_get_enable_peft(self.handle) @property def cpu_offload(self): @@ -4284,8 
+4288,12 @@ def argmax(self, input, beam_search, name=None): self.add_layer(OpType.ARGMAX, name) return Tensor(handle, owner_op_type=OpType.ARGMAX) - def add_lora_layer(self, peft_config): - return ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) + def add_lora_layers(self, target_modules: List[str]): + c_target_modules = [get_c_name(module) for module in target_modules] + return ffc().flexflow_model_add_lora_layers(self.handle, len(target_modules), c_target_modules) + + def register_peft_adapter(self, peft_config): + return ffc().flexflow_model_register_peft_adapter(self.handle, peft_config.handle) def reset_metrics(self): """Reset performance metrics. diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 0e8fbcbd7d..b38ffb2963 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -241,6 +241,10 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(lm_head, 1, False) softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["dense_h_to_4h", "dense_4h_to_h"]) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 96f0258572..0cb2847556 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -248,6 +248,10 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(dense, 1, False) softmax = ffmodel.softmax(dense, -1) output = ffmodel.argmax(softmax, False) + + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["gate_proj", "up_proj", "down_proj"]) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index b350ae106d..4bc3026989 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -252,6 +252,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["up_proj", "down_proj"]) + self.ffmodel = ffmodel # TODO: finish this diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 02668abf59..047e2df013 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -282,6 +282,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["fc1", "fc2"]) + self.ffmodel = ffmodel def convert_hf_weight_name(name): diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 2d4471201f..58c2bf621a 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -220,6 +220,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["c_fc", "c_proj"]) + self.ffmodel = ffmodel def convert_hf_model(model, dst_folder): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 794f1babb3..cfa723d3c6 100644 --- a/python/flexflow/serve/serve.py 
+++ b/python/flexflow/serve/serve.py @@ -443,12 +443,6 @@ def compile( # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() - # Add PEFT layer if registered - for ff_peft_config, peft_dict in self.pefts.items(): - ff_peft_config.ff_compile() - ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) - peft_dict["ff_peft_model_id"] = ff_peft_model_id - # Create file data loader, load weights into tensors model_configs = self.config_class(self.hf_config) @@ -487,6 +481,13 @@ def compile( for ssm in self.ssms: self.rm.register_ssm_model(ssm.model.ffmodel) + # Add PEFT layer if registered + for ff_peft_config, peft_dict in self.pefts.items(): + ff_peft_config.ff_compile() + ff_peft_model_id = self.model.ffmodel.register_peft_adapter(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + + # start background server if (mode == InferenceMode.TREE_VERIFY_MODE) or ( mode == InferenceMode.INC_DECODING_MODE diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e6b246597f..8810cfb30c 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -173,6 +173,11 @@ void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_, handle->pipeline_parallelism_degree = value; } +bool flexflow_config_get_enable_peft(flexflow_config_t handle_) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->enable_peft; +} + int flexflow_config_get_python_data_loader_type(flexflow_config_t handle_) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); return handle->python_data_loader_type; @@ -1549,14 +1554,26 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } -flexflow_peft_model_id_t flexflow_model_add_lora_layer( +void flexflow_model_add_lora_layers(flexflow_model_t handle_, int num_target_modules, char const **target_modules_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + std::vector target_modules; + for (int i = 0; i < num_target_modules; i++) { + target_modules.push_back(target_modules_[i]); + } + DEBUG_PRINT("[Add Lora Layers] model handle: %p, num_target_modules %d", + handle, + num_target_modules); + handle->add_lora_layers(target_modules); +} + +flexflow_peft_model_id_t flexflow_model_register_peft_adapter( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); - PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + PEFTModelID *peft_model_id = handle->register_peft_adapter(*peft_config); - DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + DEBUG_PRINT("[Register PEFT Adapter] model handle: %p, peft_config handle %p, " "peft_model_id: %p", handle, peft_config, From 1691100906ddf25191fb0e1444fa75d0675cd44d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 6 Oct 2024 05:10:58 +0000 Subject: [PATCH 17/37] bug fix --- include/flexflow/batch_config.h | 4 +- include/flexflow/fftype.h | 1 + include/flexflow/model.h | 1 + .../ops/kernels/lora_linear_kernels.h | 4 + include/flexflow/ops/lora_linear.h | 2 - include/flexflow/ops/lora_linear_params.h | 25 ++++-- include/flexflow/request_manager.h | 2 + .../flexflow/utils/peft_weight_allocator.h | 22 +++-- src/ops/kernels/lora_linear_kernels.cu | 41 +++++---- src/ops/lora_linear.cc | 88 +++++++++++-------- src/ops/lora_linear_params.cc | 9 +- src/runtime/fftype.cc | 2 + 
src/runtime/peft_weight_allocator.cc | 43 ++++++--- src/runtime/peft_weight_allocator.cu | 8 +- src/runtime/request_manager.cc | 5 +- 15 files changed, 156 insertions(+), 101 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index cb2f8d3a3d..44d829a7f7 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -94,6 +94,7 @@ class BatchConfig { num_tokens_in_batch = 0; max_length = 0; request_guid = 0; + peft_model_id = PEFTModelID::NO_ID; prompt_phase = false; batch_config_request_id = -1; peft_bwd = false; @@ -109,7 +110,8 @@ class BatchConfig { bool prompt_phase = false; RequestGuid request_guid; // PEFT fields - std::unordered_map peft_adapters; + PEFTModelID peft_model_id; + std::string peft_model_config; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 3e482b8d67..ebc811c262 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -27,6 +27,7 @@ class PEFTModelID { PEFTModelID(size_t id); bool is_valid_id() const; friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs); friend std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index d1dbe72d7c..e3beafe20c 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -847,6 +847,7 @@ class FFModel { // ======================================== // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); void add_lora_layers(std::vector target_modules); + PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); // ======================================== // Inference APIs // ======================================== diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index eef3b392b3..00f16af146 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -6,6 +6,7 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/lora_linear.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { @@ -35,6 +36,9 @@ class LoraLinearMeta : public OpMeta { namespace Kernels { namespace LoraLinear { + +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config); + void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 8d37be0c64..1c6070afe4 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -20,12 +20,10 @@ class LoraLinear : public Op { LoraLinear( FFModel &model, LayerID const &layer_guid, - OperatorType type, ParallelTensor const input, ParallelTensor const output, int max_rank, int max_concurrent_adapters, - // std::unordered_map const &_peft_configs, char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index c5a327459f..525a9209d3 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -124,16 +124,28 @@ class LoraLinearConfig { std::vector const &target_modules_ = {}); // constructor used to support 
std::unordered_map LoraLinearConfig(); + + // Method to set optimizer template - void setOptimizer(T&& opt) { - optimizer_config = std::make_unique(std::forward(opt)); + void setOptimizer(T&& opt) { + if constexpr (std::is_base_of_v>) { + optimizer_config = std::make_unique>(std::forward(opt)); + } else if constexpr (std::is_same_v, std::remove_reference_t>) { + optimizer_config = std::move(opt); + } else { + static_assert(always_false, "Unsupported optimizer type"); } + } + // Helper template for static_assert + template + static inline constexpr bool always_false = false; + friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); std::string serialize_to_json_string(int indent=-1) const { - json j = { + nlohmann::json j = { {"cache_folder", cache_folder}, {"peft_model_id", peft_model_id}, {"rank", rank}, @@ -144,7 +156,8 @@ class LoraLinearConfig { {"init_lora_weights", init_lora_weights}, {"base_model_name_or_path", base_model_name_or_path}, {"precision", precision}, - {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + // {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} + {"optimizer_config", optimizer_config ? nlohmann::json(optimizer_config->toJson()) : nlohmann::json()} }; return j.dump(indent); // No indentation @@ -156,7 +169,7 @@ class LoraLinearConfig { } // Deserialization method static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) { - json j = json::parse(json_string); + nlohmann::json j = nlohmann::json::parse(json_string); LoraLinearConfig config( j["cache_folder"].get(), j["peft_model_id"].get(), @@ -208,8 +221,6 @@ class LoraLinearConfig { class LoraLinearParams { public: LayerID layer_guid; - // OperatorType type; - // std::unordered_map peft_configs; int max_rank; int max_concurrent_adapters; char name[MAX_OPNAME]; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 542deb336d..628714dcc0 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -149,6 +149,8 @@ class RequestManager { int eos_token_id, std::string const &path); void register_output_filepath(std::string const &); + void register_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config); LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 3c9efc0812..9670da8a4f 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -17,12 +17,13 @@ #define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ #include "flexflow/config.h" -#include "lora_linear_params.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/lora_linear_params.h" // #include namespace FlexFlow { -#ifdef DEACODE +#ifdef DEADCODE class PEFTWeightAllocator { public: PEFTWeightAllocator(void *_base_ptr, size_t _total_size) @@ -108,19 +109,21 @@ struct LoraLinearWeight { low_rank_activation(low_rank_activation_), input_activation(input_activation_) {} }; +void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed); + class PEFTMemoryManager { public: - PEFTMemoryManager(Memory gpu_mem_, size_t max_lora_size_, int 
max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) + PEFTMemoryManager(Legion::Memory gpu_mem_, int max_rank_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), - max_lora_size(max_lora_size_), + max_rank(max_rank_), in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_), max_peft_tokens(max_peft_tokens_), lora_layername_substr(lora_layername_substr_), dt(dt_), base_ptr(nullptr), finetuning_ptr(nullptr), finetuning_model_id(PEFTModelID::NO_ID) { - + max_lora_size = data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0"); assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0"); allocate_inference_memory(); @@ -146,12 +149,13 @@ class PEFTMemoryManager { LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); // Legion memory management apparatus - Memory gpu_mem; + Legion::Memory gpu_mem; Realm::RegionInstance peftLegionInst; void *base_ptr, *finetuning_ptr; // Size and shapes int max_concurrent_adapters; - size_t max_lora_size; + int max_rank; + int max_lora_size; int in_dim, out_dim, num_shards, shard_id; int max_peft_tokens; // LRU cache apparatus @@ -162,8 +166,8 @@ class PEFTMemoryManager { std::string lora_layername_substr; DataType dt; PEFTModelID finetuning_model_id; -} +}; -}; // namespace FlexFlow +} // namespace FlexFlow #endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index d5baf49cdc..134af3ca6e 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -24,8 +24,10 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) : OpMeta(handler, li) { +#ifdef DEADCODE allocated_peft_buffer_size1 = 0; allocated_peft_buffer_size2 = 0; +#endif } LoraLinearMeta::~LoraLinearMeta(void) {} @@ -145,6 +147,16 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, } } +bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + namespace Internal { @@ -289,17 +301,6 @@ void inference_kernel(LoraLinearMeta *m, } #endif -bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { - for (std::string s : config.target_modules) { - std::string n(m->op_name); - if (n.find(s) != std::string::npos) { - return true; - } - } - return false; -} - - template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -326,7 +327,7 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -444,8 +445,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, if 
(bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) { continue; } - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_adapters[bc->requestsInfo[i].peft_model_id]); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -453,7 +453,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; // int max_peft_tokens = bc->requestsInfo[i].max_length; - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); @@ -562,15 +562,14 @@ void peft_bwd_kernel(LoraLinearMeta *m, } if (bc->requestsInfo[i].optimizer_tasks.update_weights) { - LoraOptimizerConfig const *optimizer_config = lora_config.optimizer_config; - assert(optimizer_config != nullptr); + assert(lora_config.optimizer_config != nullptr); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; // Get optimizer config - if (optimizer_config->getType() == "SGD") { - LoraSGDOptimizerConfig const *sgd_config = - (LoraSGDOptimizerConfig const *)optimizer_config; + + if (lora_config.optimizer_config->getType() == "SGD") { + LoraSGDOptimizerConfig const *sgd_config = static_cast(lora_config.optimizer_config.get()); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<<(weight.w1_grad_ptr), static_cast
(weight.w1_v_values_ptr), static_cast<DT *>
(weight.w1_ptr)); - } else if (optimizer_config->getType() == "Adam") { + } else if (lora_config.optimizer_config->getType() == "Adam") { assert(false && "Adam optimizer type not implemented yet"); } else { assert(false && "Unsupported optimizer type"); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index a18f47c4ac..f7ac4ff06e 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -6,6 +6,7 @@ #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" +#include "flexflow/request_manager.h" #include #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -51,13 +52,13 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -void FFmodel::add_lora_layers(std::vector target_modules) { +void FFModel::add_lora_layers(std::vector target_modules) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_modules.size() > 0 && "LoRA target module name is empty"); RequestManager *rm = RequestManager::get_request_manager(); int max_lora_rank = rm->get_max_lora_rank(); int max_concurrent_adapters = rm->get_max_concurrent_adapters(); - assert(max_rank > 1 && max_rank <= 32 && "Invalid max LoRA rank"); + assert(max_lora_rank > 1 && max_lora_rank <= 32 && "Invalid max LoRA rank"); assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); for (std::string target_module_name : target_modules) { @@ -120,7 +121,7 @@ void FFmodel::add_lora_layers(std::vector target_modules) { true /*create_grad*/); } // pass max_rank and max_concurrent_adapters to OP_LORA layer - peft_layer->add_int_property("max_rank", max_rank); + peft_layer->add_int_property("max_rank", max_lora_rank); peft_layer->add_int_property("max_concurrent_adapters", max_concurrent_adapters); it = layers.insert(it + 1, peft_layer); ++it; @@ -263,7 +264,7 @@ Op *LoraLinear::create_operator_from_layer( long long value; layer->get_int_property("max_rank", value); int max_rank = value; - layer->get_int_property("max_concurrent_adapters", max_concurrent_adapters); + layer->get_int_property("max_concurrent_adapters", value); int max_concurrent_adapters = value; #ifdef DEADCODE std::unordered_map _peft_configs; @@ -276,7 +277,6 @@ Op *LoraLinear::create_operator_from_layer( #endif return new LoraLinear(model, layer->layer_guid, - layer->op_type, inputs[0], inputs[1], max_rank, @@ -290,7 +290,6 @@ LoraLinear::LoraLinear(FFModel &model, ParallelTensor const output) : LoraLinear(model, other.layer_guid, - other.op_type, input, output, other.max_rank, @@ -303,7 +302,6 @@ LoraLinear::LoraLinear(FFModel &model, char const *name) : LoraLinear(model, params.layer_guid, - params.type, inputs.first, inputs.second, params.max_rank, @@ -313,7 +311,6 @@ LoraLinear::LoraLinear(FFModel &model, LoraLinear::LoraLinear( FFModel &model, LayerID const &_layer_guid, - OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, int _max_rank, @@ -321,7 +318,7 @@ LoraLinear::LoraLinear( // std::unordered_map const &_peft_configs, char const *name) : Op(model, - _op_type, + OP_LORA, _output->data_type, name, 2 /*inputs*/, @@ -473,9 +470,8 @@ OpMeta *LoraLinear::init_task(Task const *task, lora_layername.substr(0, found + searchString.length()); // allocate space for lora weights - size_t max_lora_size = data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim); Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - 
m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, max_lora_size, lora->max_concurrent_adapters, in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); + m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, lora->max_rank, lora->max_concurrent_adapters, BatchConfig::max_sequence_length(), in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); m->peft_memory_manager->allocate_inference_memory(); return m; } @@ -709,8 +705,8 @@ void LoraLinear::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); @@ -761,12 +757,15 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - int rank, num_tokens; - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; - rank = weight.rank; - num_tokens = input.domain.get_volume() / weight.in_dim; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id) / layername; std::string filenameA = @@ -775,21 +774,38 @@ void LoraLinear::inference_task(Task const *task, dst_filepath_weights.string() + ".weight_B.original"; if (m->input_type[0] == DT_FLOAT) { save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else if (m->input_type[0] == DT_HALF) { save_tensor((half *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else { assert(false && "Data type not supported"); } + + if (bc->requestsInfo[i].peft_bwd) { + int num_tokens = input.domain.get_volume() / in_dim; + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } } filename = dst_filepath.string() + ".output_0"; @@ -803,21 +819,7 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - if (bc->num_active_peft_tokens() > 0) { - // input activation (intermediate) - filename = 
dst_filepath.string() + ".low_rank_activation"; - if (output.data_type == DT_FLOAT) { - save_tensor((float *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else if (output.data_type == DT_HALF) { - save_tensor((half *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else { - assert(false); - } - } + m->decoding_step++; } } @@ -905,6 +907,16 @@ void lora_inference_debugging(LoraLinearMeta *m, // weights, weights gradients fs::path dst_filepath_weights = get_dst_folder("weights", m->bwd_step, shard_id) / layername; + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + assert(m->model_state.size() >= 1 && "Model state empty!"); for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { PEFTModelID peft_model_id = it->first; diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 310b6d0973..c7b9fcc711 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -170,11 +170,10 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "trainable: " << llc.trainable << ", "; if (llc.optimizer_config != nullptr) { os << "optimizer_config: "; - if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) { - os << *static_cast(llc.optimizer_config); - } else if (typeid(*llc.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - os << *static_cast(llc.optimizer_config); + if (llc.optimizer_config.get()->getType() == "SGD") { + os << *static_cast(llc.optimizer_config.get()); + } else if (llc.optimizer_config.get()->getType() == "Adam") { + os << *static_cast(llc.optimizer_config.get()); } else { os << "Unknown optimizer config type"; } diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 8213726e8a..0af5f45350 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,6 +46,8 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } +bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { return !(lhs == rhs); } + std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { if (peft_model_id == PEFTModelID::NO_ID) { os << "NO_ID"; diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index cc40d666ed..287eb7e20a 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -1,6 +1,24 @@ -#include "peft_weight_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { +// declare legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { // allocate chunk of memory for all the PEFT adapters @@ -21,7 +39,7 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size 
= max_lora_size*3; // weights, gradients, momentum values - ft_size += max_peft_tokens*(in_dim+rank); // input, low-rank activations + ft_size += max_peft_tokens * (in_dim + max_rank); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), @@ -144,7 +162,7 @@ void load_peft_from_file(DT *ptr, void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config) { // Load weights - assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr "PEFT Memory Manager weight ptr null"); + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && "PEFT Memory Manager weight ptr null"); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; // values below represent total weight sizes before sharding. Lora B is not @@ -235,7 +253,7 @@ LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_ int data_size = data_type_size(dt); LoraLinearWeight result; result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; - result.w1_ptr = result.w0_ptr + w0_num_elements * data_size; + result.w1_ptr = static_cast(result.w0_ptr) + w0_num_elements * data_size; if (cache_miss) { load_peft_model(result, lora_config); } @@ -244,19 +262,20 @@ LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_ LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); - bool cache_miss = get_finetuning_slot(model_id); + bool cache_miss; + get_finetuning_slot(model_id, &cache_miss); int w0_num_elements = lora_config.rank * in_dim; int w1_num_elements = lora_config.rank * out_dim; int data_size = data_type_size(dt); LoraLinearWeight result; result.w0_ptr = finetuning_ptr; - result.w1_ptr = result.w0_ptr + w0_num_elements*data_size; - result.w0_grad_ptr = result.w1_ptr + w1_num_elements*data_size; - result.w1_grad_ptr = result.w0_grad_ptr + w0_num_elements*data_size; - result.w0_v_values_ptr = result.w1_grad_ptr + w1_num_elements*data_size; - result.w1_v_values_ptr = result.w0_v_values_ptr + w0_num_elements*data_size; - result.input_activation = result.w1_v_values_ptr + w1_num_elements*data_size; // max_peft_tokens*in_dim - result.low_rank_activation = result.input_activation + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank + result.w1_ptr = static_cast(result.w0_ptr)+ w0_num_elements*data_size; + result.w0_grad_ptr = static_cast(result.w1_ptr) + w1_num_elements*data_size; + result.w1_grad_ptr = static_cast(result.w0_grad_ptr) + w0_num_elements*data_size; + result.w0_v_values_ptr = static_cast(result.w1_grad_ptr) + w1_num_elements*data_size; + result.w1_v_values_ptr = static_cast(result.w0_v_values_ptr) + w0_num_elements*data_size; + result.input_activation = static_cast(result.w1_v_values_ptr) + w1_num_elements*data_size; // max_peft_tokens*in_dim + result.low_rank_activation = static_cast(result.input_activation) + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank if (cache_miss) { load_peft_model(result, lora_config); } diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu index cc8d095069..bc9ab443cb 100644 --- a/src/runtime/peft_weight_allocator.cu +++ b/src/runtime/peft_weight_allocator.cu @@ -8,7 +8,7 @@ namespace FlexFlow { template -void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, 
cudaStream_t stream) { +void lora_init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, cudaStream_t stream) { // Initialize generator std::mt19937 gen(seed); @@ -47,7 +47,7 @@ void init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int ra num = num_float; } } - checkCUDA(cudaMemcpyAsync(static_cast
(w1_ptr), + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), lora_b_random_init.data(), w1_num_elements * sizeof(DT), cudaMemcpyHostToDevice, @@ -59,9 +59,9 @@ void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int ou checkCUDA(get_legion_stream(&stream)); if (dt == DT_FLOAT) { - Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); } else if (dt == DT_HALF) { - Internal::init_kernel(weight, in_di, out_dim, rank, seed, stream); + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); } else { assert(false && "Unsupported data type"); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 79fcdfdcfe..2377a4f938 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -260,7 +260,8 @@ void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - peft_configs[peft_model_id] = peft_config; + // peft_configs[peft_model_id] = std::move(peft_config); + peft_configs.emplace(peft_model_id, std::move(peft_config)); } LoraLinearConfig const &RequestManager::get_peft_config( @@ -284,7 +285,7 @@ int RequestManager::get_max_concurrent_adapters() { return max_concurrent_adapters; } -PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const peft_config) { +PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) { From 7ff96d782ac71fc05c943f2bebdd4be616fbe91d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 6 Oct 2024 06:16:05 +0000 Subject: [PATCH 18/37] fixes --- include/flexflow/flexflow_c.h | 8 +- include/flexflow/model.h | 12 +- .../ops/kernels/lora_linear_kernels.h | 3 +- include/flexflow/ops/lora_linear.h | 15 +- include/flexflow/ops/lora_linear_params.h | 130 ++---- include/flexflow/request_manager.h | 4 +- .../flexflow/utils/peft_weight_allocator.h | 94 ++-- inference/models/falcon.cc | 5 +- inference/models/llama.cc | 5 +- inference/models/mpt.cc | 4 +- inference/models/opt.cc | 2 +- inference/models/starcoder.cc | 2 +- src/c/flexflow_c.cc | 15 +- src/ops/kernels/lora_linear_kernels.cu | 50 ++- src/ops/lora_linear.cc | 143 +++--- src/ops/lora_linear_params.cc | 115 ++++- src/runtime/fftype.cc | 4 +- src/runtime/inference_manager.cc | 1 - src/runtime/peft_weight_allocator.cc | 418 +++++++++--------- src/runtime/peft_weight_allocator.cu | 92 ++-- src/runtime/request_manager.cc | 72 +-- 21 files changed, 673 insertions(+), 521 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 19b2bc7c83..7a68c6566f 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -600,10 +600,12 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); -void flexflow_model_add_lora_layers(flexflow_model_t handle_, int num_target_modules, char const **target_modules_); +void flexflow_model_add_lora_layers(flexflow_model_t handle_, + int num_target_modules, + char const **target_modules_); - -flexflow_peft_model_id_t flexflow_model_register_peft_adapter(flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); +flexflow_peft_model_id_t flexflow_model_register_peft_adapter( + flexflow_model_t handle_, const flexflow_lora_linear_config_t 
peft_config_); void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index e3beafe20c..82f0a9add1 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -845,9 +845,9 @@ class FFModel { // ======================================== // PEFT Layers // ======================================== -// PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); - void add_lora_layers(std::vector target_modules); - PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); + // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + void add_lora_layers(std::vector target_modules); + PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); // ======================================== // Inference APIs // ======================================== @@ -1182,9 +1182,9 @@ class FFModel { std::vector parameters; // PEFT related std::unordered_map base_layer_to_peft_layer; -// std::unordered_map> peft_layer_to_peft_id; -// std::unordered_map peft_configs; - // std::vector peft_operators; + // std::unordered_map> + // peft_layer_to_peft_id; std::unordered_map + // peft_configs; std::vector peft_operators; FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 00f16af146..b3e047fc0e 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -37,7 +37,8 @@ class LoraLinearMeta : public OpMeta { namespace Kernels { namespace LoraLinear { -bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config); +bool lora_applies_to_this_layer(LoraLinearMeta *m, + LoraLinearConfig const &config); void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 1c6070afe4..cc625cafc2 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -17,14 +17,13 @@ class LoraLinear : public Op { using Params = LoraLinearParams; using Input = std::pair; - LoraLinear( - FFModel &model, - LayerID const &layer_guid, - ParallelTensor const input, - ParallelTensor const output, - int max_rank, - int max_concurrent_adapters, - char const *name = nullptr); + LoraLinear(FFModel &model, + LayerID const &layer_guid, + ParallelTensor const input, + ParallelTensor const output, + int max_rank, + int max_concurrent_adapters, + char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 525a9209d3..1dfe5f17bd 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -19,7 +19,7 @@ class LoraOptimizerConfig { LoraOptimizerConfig(); virtual std::string getType() const = 0; virtual nlohmann::json toJson() const = 0; - static std::unique_ptr fromJson(const nlohmann::json& j); + static std::unique_ptr fromJson(nlohmann::json const &j); virtual ~LoraOptimizerConfig() = default; }; @@ -32,26 +32,16 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig { bool weight_decay_ = 0.0f); friend std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc); - - std::string getType() const 
override { return "SGD"; } - - nlohmann::json toJson() const override { - return {{"type", "SGD"}, - {"lr", lr}, - {"momentum", momentum}, - {"nesterov", nesterov}, - {"weight_decay", weight_decay}}; - } - static std::unique_ptr fromJson(const nlohmann::json& j) { - auto sgd = std::make_unique(); - sgd->lr = j["lr"]; - sgd->momentum = j["momentum"]; - sgd->nesterov = j["nesterov"]; - sgd->weight_decay = j["weight_decay"]; - return sgd; + std::string getType() const override { + return "SGD"; } + nlohmann::json toJson() const override; + + static std::unique_ptr + fromJson(nlohmann::json const &j); + public: double lr = 0.001f; double momentum = 0.0f; @@ -69,28 +59,16 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { double epsilon_ = 1e-8); friend std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc); - - std::string getType() const override { return "Adam"; } - - nlohmann::json toJson() const override { - return {{"type", "Adam"}, - {"alpha", alpha}, - {"beta1", beta1}, - {"beta2", beta2}, - {"weight_decay", weight_decay}, - {"epsilon", epsilon}}; - } - static std::unique_ptr fromJson(const nlohmann::json& j) { - auto adam = std::make_unique(); - adam->alpha = j["alpha"]; - adam->beta1 = j["beta1"]; - adam->beta2 = j["beta2"]; - adam->weight_decay = j["weight_decay"]; - adam->epsilon = j["epsilon"]; - return adam; + std::string getType() const override { + return "Adam"; } + nlohmann::json toJson() const override; + + static std::unique_ptr + fromJson(nlohmann::json const &j); + public: // Adam double alpha = 0.001f; @@ -100,14 +78,6 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { double epsilon = 1e-8; }; -std::unique_ptr LoraOptimizerConfig::fromJson(const nlohmann::json& j) { - std::string type = j["type"]; - if (type == "SGD") return LoraSGDOptimizerConfig::fromJson(j); - if (type == "Adam") return LoraAdamOptimizerConfig::fromJson(j); - throw std::runtime_error("Unknown optimizer type"); -} - - class LoraLinearConfig { public: static const LoraLinearConfig EmptyConfig; @@ -126,11 +96,14 @@ class LoraLinearConfig { LoraLinearConfig(); // Method to set optimizer - template - void setOptimizer(T&& opt) { - if constexpr (std::is_base_of_v>) { - optimizer_config = std::make_unique>(std::forward(opt)); - } else if constexpr (std::is_same_v, std::remove_reference_t>) { + template + void setOptimizer(T &&opt) { + if constexpr (std::is_base_of_v>) { + optimizer_config = + std::make_unique>(std::forward(opt)); + } else if constexpr (std::is_same_v, + std::remove_reference_t>) { optimizer_config = std::move(opt); } else { static_assert(always_false, "Unsupported optimizer type"); @@ -139,62 +112,19 @@ class LoraLinearConfig { // Helper template for static_assert template static inline constexpr bool always_false = false; - + friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); - std::string serialize_to_json_string(int indent=-1) const { - nlohmann::json j = { - {"cache_folder", cache_folder}, - {"peft_model_id", peft_model_id}, - {"rank", rank}, - {"lora_alpha", lora_alpha}, - {"lora_dropout", lora_dropout}, - {"target_modules", target_modules}, - {"trainable", trainable}, - {"init_lora_weights", init_lora_weights}, - {"base_model_name_or_path", base_model_name_or_path}, - {"precision", precision}, - // {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr} - {"optimizer_config", optimizer_config ? 
nlohmann::json(optimizer_config->toJson()) : nlohmann::json()} - }; - - return j.dump(indent); // No indentation - } - void serialize_to_json_file(const std::string& filename) const { - std::string j = serialize_to_json_string(4); - std::ofstream file(filename); - file << j; - } + std::string serialize_to_json_string(int indent = -1) const; + void serialize_to_json_file(std::string const &filename) const; // Deserialization method - static LoraLinearConfig deserialize_from_json_string(const std::string& json_string) { - nlohmann::json j = nlohmann::json::parse(json_string); - LoraLinearConfig config( - j["cache_folder"].get(), - j["peft_model_id"].get(), - j["trainable"].get(), - nullptr, // optimizer_config will be set later if present - j["init_lora_weights"].get(), - j["base_model_name_or_path"].get(), - j["precision"].get(), - j["rank"].get(), - j["lora_alpha"].get(), - j["lora_dropout"].get(), - j["target_modules"].get>() - ); - if (!j["optimizer_config"].is_null()) { - config.setOptimizer(LoraOptimizerConfig::fromJson(j["optimizer_config"])); - } - return config; - } + static LoraLinearConfig + deserialize_from_json_string(std::string const &json_string); // Deserialization method - static LoraLinearConfig deserialize_from_json_file(const std::string& filename) { - std::ifstream file(filename); - std::string j; - file >> j; - return deserialize_from_json_string(j); - } + static LoraLinearConfig + deserialize_from_json_file(std::string const &filename); std::string cache_folder; // Huggingface model ID (for download and/or upload) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 628714dcc0..ce75d2e0d3 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -151,7 +151,7 @@ class RequestManager { void register_output_filepath(std::string const &); void register_peft_config(PEFTModelID const &peft_model_id, LoraLinearConfig const &peft_config); - LoraLinearConfig get_peft_config(PEFTModelID peft_model_id); + LoraLinearConfig const &get_peft_config(PEFTModelID const &peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); int get_max_lora_rank(); @@ -295,7 +295,7 @@ class RequestManager { int max_spec_tree_token_num; int max_sequence_length; Status request_manager_status; - + // peft std::unordered_map peft_configs; int max_lora_rank; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index 9670da8a4f..bd8ddb1dce 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -101,52 +101,83 @@ struct LoraLinearWeight { void *low_rank_activation; // v values for SGD optimizer (when using momentum) void *w0_v_values_ptr, *w1_v_values_ptr; - LoraLinearWeight(void *w0=nullptr, void *w1=nullptr, void *w0_grad=nullptr, void *w1_grad=nullptr, - void *w0_v_values=nullptr, void *w1_v_values=nullptr, void *low_rank_activation_=nullptr, void *input_activation_=nullptr) - : w0_ptr(w0), w1_ptr(w1), - w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), - w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values), - low_rank_activation(low_rank_activation_), input_activation(input_activation_) {} + LoraLinearWeight(void *w0 = nullptr, + void *w1 = nullptr, + void *w0_grad = nullptr, + void *w1_grad = nullptr, + void *w0_v_values = nullptr, + void *w1_v_values = nullptr, + void *low_rank_activation_ = nullptr, + void *input_activation_ = nullptr) + : 
w0_ptr(w0), w1_ptr(w1), w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), + w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values), + low_rank_activation(low_rank_activation_), + input_activation(input_activation_) {} }; -void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed); +void init_peft_weight_wrapper(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + DataType dt, + int seed); class PEFTMemoryManager { public: - PEFTMemoryManager(Legion::Memory gpu_mem_, int max_rank_, int max_concurrent_adapters_, int max_peft_tokens_, int in_dim_, int out_dim_, int num_shards_, int shard_id_, std::string const &lora_layername_substr_, DataType dt_) - : gpu_mem(gpu_mem_), - max_concurrent_adapters(max_concurrent_adapters_), - max_rank(max_rank_), - in_dim(in_dim_), out_dim(out_dim_), num_shards(num_shards_), shard_id(shard_id_), - max_peft_tokens(max_peft_tokens_), - lora_layername_substr(lora_layername_substr_), dt(dt_), - base_ptr(nullptr), - finetuning_ptr(nullptr), - finetuning_model_id(PEFTModelID::NO_ID) { - max_lora_size = data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); - assert(max_concurrent_adapters > 0 && "PEFT Memory Manager max_concurrent_adapters must be > 0"); - assert(max_lora_size > 0 && "PEFT Memory Manager max_lora_size must be > 0"); + PEFTMemoryManager(Legion::Memory gpu_mem_, + int max_rank_, + int max_concurrent_adapters_, + int max_peft_tokens_, + int in_dim_, + int out_dim_, + int num_shards_, + int shard_id_, + std::string const &lora_layername_substr_, + DataType dt_) + : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), + max_rank(max_rank_), in_dim(in_dim_), out_dim(out_dim_), + num_shards(num_shards_), shard_id(shard_id_), + max_peft_tokens(max_peft_tokens_), + lora_layername_substr(lora_layername_substr_), dt(dt_), + base_ptr(nullptr), finetuning_ptr(nullptr), + finetuning_model_id(PEFTModelID::NO_ID) { + max_lora_size = + data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); + assert(max_concurrent_adapters > 0 && + "PEFT Memory Manager max_concurrent_adapters must be > 0"); + assert(max_lora_size > 0 && + "PEFT Memory Manager max_lora_size must be > 0"); allocate_inference_memory(); - // finetuning memory is allocated upon the first finetuning request, so we can skip for inference-only workloads + // finetuning memory is allocated upon the first finetuning request, so we + // can skip for inference-only workloads } - // allocate memory for all the PEFT adapters for a given layer on a given shard + // allocate memory for all the PEFT adapters for a given layer on a given + // shard void allocate_inference_memory(); - // allocate memory for the PEFT adapter for a finetuning request for a given layer and shard + // allocate memory for the PEFT adapter for a finetuning request for a given + // layer and shard void allocate_finetuning_memory(); - LoraLinearWeight get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + LoraLinearWeight get_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); void check_ft_model_id(PEFTModelID const &model_id); private: - // Check if the PEFT adapter for the given model is in memory. If not, sets the cache_miss flag to true. If this is the first finetuning request, allocate memory for the finetuning adapter. + // Check if the PEFT adapter for the given model is in memory. If not, sets + // the cache_miss flag to true. 
If this is the first finetuning request, + // allocate memory for the finetuning adapter. void get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss); - // Returns the slot in memory where the peft model weights are/will be stored. - // If the model is not in memory (cache miss), set the cache_miss flag to true. + // Returns the slot in memory where the peft model weights are/will be stored. + // If the model is not in memory (cache miss), set the cache_miss flag to + // true. int get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss); - void load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config); - LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); - LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config); + void load_peft_model(LoraLinearWeight &weight, + LoraLinearConfig const &lora_config); + LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); + LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); // Legion memory management apparatus Legion::Memory gpu_mem; @@ -160,7 +191,8 @@ class PEFTMemoryManager { int max_peft_tokens; // LRU cache apparatus std::unordered_map lru_hashtable; - std::vector lru_list; // head = least recently used, tail=most recently used + std::vector + lru_list; // head = least recently used, tail=most recently used std::unordered_map peft2mem_slot; // Miscellanea std::string lora_layername_substr; diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 945c55f296..318ee128ad 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -245,8 +245,9 @@ void FALCON::create_falcon_model(FFModel &ff, // If PEFT is enabled, add LoRA layers if (ff.config.enable_peft) { // todo: add attention projections - std::vector target_modules = {"dense_h_to_4h", "dense_4h_to_h"}; - ff.add_lora_layers(); + std::vector target_modules = {"dense_h_to_4h", + "dense_4h_to_h"}; + ff.add_lora_layers(target_modules); } FileDataLoader *fileloader = diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 6a70620942..bc4c80b155 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -273,8 +273,9 @@ void LLAMA::create_llama_model(FFModel &ff, // If PEFT is enabled, add LoRA layers if (ff.config.enable_peft) { // todo: add attention projections - std::vector target_modules = {"gate_proj", "up_proj", "down_proj"}; - ff.add_lora_layers(); + std::vector target_modules = { + "gate_proj", "up_proj", "down_proj"}; + ff.add_lora_layers(target_modules); } FileDataLoader *fileloader = new FileDataLoader( diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 6946ed18c3..b16729f02e 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -250,12 +250,12 @@ void MPT::create_mpt_model(FFModel &ff, } else { output = ff.argmax(lm_head, /*beam_Search*/ false); } - + // If PEFT is enabled, add LoRA layers if (ff.config.enable_peft) { // todo: add attention projections std::vector target_modules = {"up_proj", "down_proj"}; - ff.add_lora_layers(); + ff.add_lora_layers(target_modules); } FileDataLoader *fileloader = diff --git a/inference/models/opt.cc b/inference/models/opt.cc index b78dafbe95..a892cb9891 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -266,7 +266,7 @@ void OPT::create_opt_model(FFModel &ff, if (ff.config.enable_peft) { // 
todo: add attention projections std::vector target_modules = {"fc1", "fc2"}; - ff.add_lora_layers(); + ff.add_lora_layers(target_modules); } FileDataLoader *fileloader = new FileDataLoader( diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 3da1e82a79..18d51cbae0 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -228,7 +228,7 @@ void STARCODER::create_starcoder_model( if (ff.config.enable_peft) { // todo: add attention projections std::vector target_modules = {"c_fc", "c_proj"}; - ff.add_lora_layers(); + ff.add_lora_layers(target_modules); } InferenceManager *im = InferenceManager::get_inference_manager(); diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 8810cfb30c..b9b4300828 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1554,7 +1554,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } -void flexflow_model_add_lora_layers(flexflow_model_t handle_, int num_target_modules, char const **target_modules_) { +void flexflow_model_add_lora_layers(flexflow_model_t handle_, + int num_target_modules, + char const **target_modules_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); std::vector target_modules; for (int i = 0; i < num_target_modules; i++) { @@ -1573,11 +1575,12 @@ flexflow_peft_model_id_t flexflow_model_register_peft_adapter( LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); PEFTModelID *peft_model_id = handle->register_peft_adapter(*peft_config); - DEBUG_PRINT("[Register PEFT Adapter] model handle: %p, peft_config handle %p, " - "peft_model_id: %p", - handle, - peft_config, - peft_model_id); + DEBUG_PRINT( + "[Register PEFT Adapter] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); return FFCObjectWrapper::wrap(peft_model_id); } diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 134af3ca6e..5e24b6a873 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -147,7 +147,8 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, } } -bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config) { +bool lora_applies_to_this_layer(LoraLinearMeta *m, + LoraLinearConfig const &config) { for (std::string s : config.target_modules) { std::string n(m->op_name); if (n.find(s) != std::string::npos) { @@ -159,7 +160,6 @@ bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &confi namespace Internal { - #ifdef DEADCODE template void inference_kernel(LoraLinearMeta *m, @@ -320,23 +320,30 @@ void inference_kernel(LoraLinearMeta *m, cudaDataType_t compute_type = output_type; int num_peft_requests = 0; - for (int i=0; i< bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && 
"Trainable flag mismatch"); + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; // int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); - void *intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) ? weight.low_rank_activation : m->handle.workSpace; + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); + void *intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) + ? weight.low_rank_activation + : m->handle.workSpace; if (bc->requestsInfo[i].peft_bwd) { checkCUDA(cudaMemcpyAsync(weight.input_activation, input_ptr + first_token_offset * in_dim, @@ -346,8 +353,8 @@ void inference_kernel(LoraLinearMeta *m, stream)); } else { // use workspace to save intermediate result - assert(m->handle.workSpaceSize >= - data_type_size(m->input_type[1]) * num_peft_tokens * lora_config.rank); + assert(m->handle.workSpaceSize >= data_type_size(m->input_type[1]) * + num_peft_tokens * lora_config.rank); } DT alpha = 1.0f, beta = 0.0f; // buffer = weight_first * input @@ -439,22 +446,29 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; cudaDataType_t compute_type = output_type; - + for (int i = 0; i < bc->max_requests_per_batch(); i++) { // Skip completed, non-PEFT and PEFT forward-only requests - if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || !bc->requestsInfo[i].peft_bwd) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); - m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + "Trainable flag mismatch"); + m->peft_memory_manager->check_ft_model_id( + bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; // int max_peft_tokens = bc->requestsInfo[i].max_length; // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); // Compute LORA_B weight's gradient @@ -569,7 +583,9 @@ void peft_bwd_kernel(LoraLinearMeta *m, // Get optimizer config if (lora_config.optimizer_config->getType() == "SGD") { - LoraSGDOptimizerConfig const *sgd_config = static_cast(lora_config.optimizer_config.get()); + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + lora_config.optimizer_config.get()); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<< #include #if defined(FF_USE_CUDA) || 
defined(FF_USE_HIP_CUDA) @@ -53,16 +53,19 @@ bool check_lora_layer_match(Layer *potential_target, } void FFModel::add_lora_layers(std::vector target_modules) { - assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); assert(target_modules.size() > 0 && "LoRA target module name is empty"); RequestManager *rm = RequestManager::get_request_manager(); int max_lora_rank = rm->get_max_lora_rank(); int max_concurrent_adapters = rm->get_max_concurrent_adapters(); assert(max_lora_rank > 1 && max_lora_rank <= 32 && "Invalid max LoRA rank"); - assert(max_concurrent_adapters > 0 && "Invalid number of LoRA concurrent adapters"); + assert(max_concurrent_adapters > 0 && + "Invalid number of LoRA concurrent adapters"); for (std::string target_module_name : target_modules) { - assert(target_module_name.length() > 0 && "LoRA target module name is empty"); + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); // find target layer for (auto it = layers.begin(); it != layers.end(); ++it) { Layer *target_module = *it; @@ -70,15 +73,16 @@ void FFModel::add_lora_layers(std::vector target_modules) { if (!match) { continue; } - assert(base_layer_to_peft_layer.find(target_module) == base_layer_to_peft_layer.end() && "LoRA layer already added, attempting to add again"); + assert(base_layer_to_peft_layer.find(target_module) == + base_layer_to_peft_layer.end() && + "LoRA layer already added, attempting to add again"); // Get input and output tensors from target module Tensor const input = target_module->inputs[0]; Tensor const output = target_module->outputs[0]; assert(input->data_type == output->data_type); // Compute OP_LORA layer name, based on target module name - std::string name_ = target_module->name - ? std::string(target_module->name) - : std::string(""); + std::string name_ = target_module->name ? 
std::string(target_module->name) + : std::string(""); size_t last_underscore = name_.length() - 1; for (int i = name_.length() - 1; i > 0; i--) { if (!(std::isdigit(target_module->name[i]) || @@ -101,7 +105,8 @@ void FFModel::add_lora_layers(std::vector target_modules) { 1 /*outputs*/, input, output); - // fix LoRA layer's transformer layer ID and model ID (to be the same as target module) + // fix LoRA layer's transformer layer ID and model ID (to be the same as + // target module) peft_layer->layer_guid.transformer_layer_id = target_module->layer_guid.transformer_layer_id; peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; @@ -122,7 +127,8 @@ void FFModel::add_lora_layers(std::vector target_modules) { } // pass max_rank and max_concurrent_adapters to OP_LORA layer peft_layer->add_int_property("max_rank", max_lora_rank); - peft_layer->add_int_property("max_concurrent_adapters", max_concurrent_adapters); + peft_layer->add_int_property("max_concurrent_adapters", + max_concurrent_adapters); it = layers.insert(it + 1, peft_layer); ++it; base_layer_to_peft_layer[target_module] = peft_layer; @@ -293,7 +299,7 @@ LoraLinear::LoraLinear(FFModel &model, input, output, other.max_rank, - other.max_concurrent_adapters, + other.max_concurrent_adapters, other.name) {} LoraLinear::LoraLinear(FFModel &model, @@ -468,16 +474,26 @@ OpMeta *LoraLinear::init_task(Task const *task, } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); - + // allocate space for lora weights Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); - m->peft_memory_manager = new PEFTMemoryManager(gpu_mem, lora->max_rank, lora->max_concurrent_adapters, BatchConfig::max_sequence_length(), in_dim, out_dim, num_shards, shard_id, lora_layername_substr, dt); + m->peft_memory_manager = + new PEFTMemoryManager(gpu_mem, + lora->max_rank, + lora->max_concurrent_adapters, + BatchConfig::max_sequence_length(), + in_dim, + out_dim, + num_shards, + shard_id, + lora_layername_substr, + dt); m->peft_memory_manager->allocate_inference_memory(); return m; } #ifdef DEADCODE -void load_peft_adapters(BatchConfig const *bc){ +void load_peft_adapters(BatchConfig const *bc) { for (auto const &kv : bc->peft_configs) { PEFTModelID const &model_id = kv.first; LoraLinearConfig const &lora_config = kv.second; @@ -758,14 +774,18 @@ void LoraLinear::inference_task(Task const *task, } for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - LoraLinearWeight weight = m->peft_memory_manager->get_peft(bc->requestsInfo[i].peft_model_id, lora_config); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id) / layername; std::string filenameA = @@ -819,7 +839,6 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - m->decoding_step++; } } @@ -874,6 +893,8 @@ void lora_inference_debugging(LoraLinearMeta *m, GenericTensorAccessorW 
input_grad, GenericTensorAccessorR output_grad, int shard_id) { + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // get layer name std::string lora_layername = std::string(m->op_name); std::string searchString = "lora"; @@ -907,20 +928,21 @@ void lora_inference_debugging(LoraLinearMeta *m, // weights, weights gradients fs::path dst_filepath_weights = get_dst_folder("weights", m->bwd_step, shard_id) / layername; - + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(bc->requestsInfo[i].peft_model_config); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - - assert(m->model_state.size() >= 1 && "Model state empty!"); - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); std::string filename_weight_A = dst_filepath_weights.string() + ".weight_A.finetuned"; std::string filename_weight_B = @@ -932,36 +954,36 @@ void lora_inference_debugging(LoraLinearMeta *m, if (m->input_type[0] == DT_FLOAT) { // weight A save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_weight_A.c_str()); // weight grad A save_tensor((float *)weight.w0_grad_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_grad_A.c_str()); // weight B save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_weight_B.c_str()); // weight grad B save_tensor((float *)weight.w1_grad_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_grad_B.c_str()); } else if (m->input_type[0] == DT_HALF) { // weight A save_tensor((half *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_weight_A.c_str()); // weight grad A save_tensor((half *)weight.w0_grad_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_grad_A.c_str()); // weight B save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_weight_B.c_str()); // weight grad B save_tensor((half *)weight.w1_grad_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_grad_B.c_str()); } else { assert(false && "Data type not supported"); @@ -1040,62 +1062,49 @@ void save_peft_weights_if_needed(LoraLinearMeta *m, } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - continue; - } - // Skip PEFT forward-only requests - if (!bc->requestsInfo[i].peft_bwd) { + LoraLinearConfig lora_config = + 
LoraLinearConfig::deserialize_from_json_string( + bc->requestsInfo[i].peft_model_config); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } if (bc->requestsInfo[i].optimizer_tasks.save_updated_weights) { - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); std::string weight_export_folder = join_path({ - m->model_state[bc->requestsInfo[i].peft_model_id].cache_folder, + lora_config.cache_folder, "finetuned_models", - m->model_state[bc->requestsInfo[i].peft_model_id].peft_model_id, + lora_config.peft_model_id, "weights", "shard_" + std::to_string(shard_id), }); fs::create_directories(weight_export_folder); - int rank = m->model_state[bc->requestsInfo[i].peft_model_id].weights.rank; + int rank = lora_config.rank; int w0_num_elements = rank * in_dim; int w1_num_elements = rank * out_dim; std::string w0_filepath = join_path( {weight_export_folder, lora_layername_substr + "_A.weight"}); std::string w1_filepath = join_path( {weight_export_folder, lora_layername_substr + "_B.weight"}); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); if (m->input_type[0] == DT_FLOAT) { - save_peft_to_file( - (float *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w0_ptr, - w0_num_elements, - w0_filepath); + save_peft_to_file((float *)weight.w0_ptr, w0_num_elements, w0_filepath); if (shard_id == 0) { save_peft_to_file( - (float *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w1_ptr, - w1_num_elements, - w1_filepath); + (float *)weight.w1_ptr, w1_num_elements, w1_filepath); } } else if (m->input_type[0] == DT_HALF) { - save_peft_to_file( - (half *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w0_ptr, - w0_num_elements, - w0_filepath); + save_peft_to_file((half *)weight.w0_ptr, w0_num_elements, w0_filepath); if (shard_id == 0) { save_peft_to_file( - (half *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w1_ptr, - w1_num_elements, - w1_filepath); + (half *)weight.w1_ptr, w1_num_elements, w1_filepath); } } else { assert(false && "Data type not supported"); @@ -1214,7 +1223,7 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->max_rank); sez.serialize(this->max_concurrent_adapters); -#ifdef DEADCODE +#ifdef DEADCODE sez.serialize(this->op_type); sez.serialize(this->peft_configs.size()); for (auto const &kv : this->peft_configs) { @@ -1334,7 +1343,7 @@ Node LoraLinear::deserialize(FFModel &ff, params.peft_configs.emplace( std::make_pair(peft_model_id, *lora_linear_config)); } -#endif +#endif dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); @@ -1384,7 +1393,7 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.model_id); hash_combine(key, params.max_rank); hash_combine(key, params.max_concurrent_adapters); -#ifdef DEADCODE +#ifdef DEADCODE for (auto const &kv : params.peft_configs) { hash_combine(key, kv.first.id); hash_combine(key, kv.second.rank); diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index c7b9fcc711..61c9c15336 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -12,6 +12,18 @@ namespace FlexFlow { // empty optimizer LoraOptimizerConfig::LoraOptimizerConfig() {} +std::unique_ptr + LoraOptimizerConfig::fromJson(nlohmann::json const &j) { + std::string type = j["type"]; + if (type == "SGD") { + return 
LoraSGDOptimizerConfig::fromJson(j); + } + if (type == "Adam") { + return LoraAdamOptimizerConfig::fromJson(j); + } + throw std::runtime_error("Unknown optimizer type"); +} + // SGD optimizer LoraSGDOptimizerConfig::LoraSGDOptimizerConfig() : lr(0.001f), momentum(0.0f), nesterov(false), weight_decay(0.0f) {} @@ -30,6 +42,24 @@ std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc) { return os; } +nlohmann::json LoraSGDOptimizerConfig::toJson() const { + return {{"type", "SGD"}, + {"lr", lr}, + {"momentum", momentum}, + {"nesterov", nesterov}, + {"weight_decay", weight_decay}}; +} + +std::unique_ptr + LoraSGDOptimizerConfig::fromJson(nlohmann::json const &j) { + auto sgd = std::make_unique(); + sgd->lr = j["lr"]; + sgd->momentum = j["momentum"]; + sgd->nesterov = j["nesterov"]; + sgd->weight_decay = j["weight_decay"]; + return sgd; +} + // Adam optimizer LoraAdamOptimizerConfig::LoraAdamOptimizerConfig() : alpha(0.001f), beta1(0.9f), beta2(0.999f), weight_decay(0.0f), @@ -50,6 +80,26 @@ std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) { return os; } +nlohmann::json LoraAdamOptimizerConfig::toJson() const { + return {{"type", "Adam"}, + {"alpha", alpha}, + {"beta1", beta1}, + {"beta2", beta2}, + {"weight_decay", weight_decay}, + {"epsilon", epsilon}}; +} + +std::unique_ptr + LoraAdamOptimizerConfig::fromJson(nlohmann::json const &j) { + auto adam = std::make_unique(); + adam->alpha = j["alpha"]; + adam->beta1 = j["beta1"]; + adam->beta2 = j["beta2"]; + adam->weight_decay = j["weight_decay"]; + adam->epsilon = j["epsilon"]; + return adam; +} + // ------------------ LoRA configs ------------------- // --------------------------------------------------- const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", ""); @@ -171,9 +221,11 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { if (llc.optimizer_config != nullptr) { os << "optimizer_config: "; if (llc.optimizer_config.get()->getType() == "SGD") { - os << *static_cast(llc.optimizer_config.get()); + os << *static_cast( + llc.optimizer_config.get()); } else if (llc.optimizer_config.get()->getType() == "Adam") { - os << *static_cast(llc.optimizer_config.get()); + os << *static_cast( + llc.optimizer_config.get()); } else { os << "Unknown optimizer config type"; } @@ -185,4 +237,63 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { return os; } +std::string LoraLinearConfig::serialize_to_json_string(int indent) const { + nlohmann::json j = {{"cache_folder", cache_folder}, + {"peft_model_id", peft_model_id}, + {"rank", rank}, + {"lora_alpha", lora_alpha}, + {"lora_dropout", lora_dropout}, + {"target_modules", target_modules}, + {"trainable", trainable}, + {"init_lora_weights", init_lora_weights}, + {"base_model_name_or_path", base_model_name_or_path}, + {"precision", precision}, + // {"optimizer_config", optimizer_config ? + // optimizer_config->toJson() : nullptr} + {"optimizer_config", + optimizer_config + ? 
nlohmann::json(optimizer_config->toJson()) + : nlohmann::json()}}; + + return j.dump(indent); // No indentation +} + +void LoraLinearConfig::serialize_to_json_file( + std::string const &filename) const { + std::string j = serialize_to_json_string(4); + std::ofstream file(filename); + file << j; +} + +// Deserialization method +LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( + std::string const &json_string) { + nlohmann::json j = nlohmann::json::parse(json_string); + LoraLinearConfig config( + j["cache_folder"].get(), + j["peft_model_id"].get(), + j["trainable"].get(), + nullptr, // optimizer_config will be set later if present + j["init_lora_weights"].get(), + j["base_model_name_or_path"].get(), + j["precision"].get(), + j["rank"].get(), + j["lora_alpha"].get(), + j["lora_dropout"].get(), + j["target_modules"].get>()); + if (!j["optimizer_config"].is_null()) { + config.setOptimizer(LoraOptimizerConfig::fromJson(j["optimizer_config"])); + } + return config; +} + +// Deserialization method +LoraLinearConfig + LoraLinearConfig::deserialize_from_json_file(std::string const &filename) { + std::ifstream file(filename); + std::string j; + file >> j; + return deserialize_from_json_string(j); +} + }; // namespace FlexFlow diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 0af5f45350..31937cef66 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,7 +46,9 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } -bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { return !(lhs == rhs); } +bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return !(lhs == rhs); +} std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { if (peft_model_id == PEFTModelID::NO_ID) { diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 20b2a5b963..1b65dfd869 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -837,5 +837,4 @@ std::string join_path(std::vector const &paths) { return joined; } - }; // namespace FlexFlow diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 287eb7e20a..81b412e049 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -21,93 +21,99 @@ using Legion::TaskArgument; using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { - // allocate chunk of memory for all the PEFT adapters - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(max_lora_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(peftLegionInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); + // allocate chunk of memory for all the PEFT adapters + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); } void PEFTMemoryManager::allocate_finetuning_memory() { - size_t ft_size = max_lora_size*3; // weights, gradients, momentum values - ft_size += max_peft_tokens * (in_dim + max_rank); // input, low-rank 
activations - // allocate chunk of memory for PEFT adapter - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(ft_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance::create_instance(peftLegionInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - finetuning_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); + size_t ft_size = max_lora_size * 3; // weights, gradients, momentum values + ft_size += + max_peft_tokens * (in_dim + max_rank); // input, low-rank activations + // allocate chunk of memory for PEFT adapter + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(ft_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + finetuning_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); } -void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss) { - if (finetuning_ptr == nullptr) { - allocate_finetuning_memory(); - } - assert(finetuning_ptr != nullptr && "PEFT Memory Manager finetuning_ptr is null"); - *cache_miss = (model_id.id != finetuning_model_id.id); +void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, + bool *cache_miss) { + if (finetuning_ptr == nullptr) { + allocate_finetuning_memory(); + } + assert(finetuning_ptr != nullptr && + "PEFT Memory Manager finetuning_ptr is null"); + *cache_miss = (model_id.id != finetuning_model_id.id); } -int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss) { - assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); - assert(lru_hashtable.size() == lru_list.size() && - lru_list.size() == peft2mem_slot.size() && - "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out of sync"); - // check for cache hit - if (lru_hashtable.find(model_id) != lru_hashtable.end()) { - int lru_list_index = lru_hashtable[model_id]; - assert(lru_list[lru_list_index] == model_id && - "PEFT Memory Manager LRU hashtable/list are out of sync"); - // move the model to the end of the LRU list - lru_list.erase(lru_list.begin() + lru_list_index); - lru_list.push_back(model_id); - // update the LRU hashtable - lru_hashtable[model_id] = lru_list.size() - 1; - // get memory slot - assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); - *cache_miss = false; +int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, + bool *cache_miss) { + assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); + assert(lru_hashtable.size() == lru_list.size() && + lru_list.size() == peft2mem_slot.size() && + "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out " + "of sync"); + // check for cache hit + if (lru_hashtable.find(model_id) != lru_hashtable.end()) { + int lru_list_index = lru_hashtable[model_id]; + assert(lru_list[lru_list_index] == model_id && + "PEFT Memory Manager LRU hashtable/list are out of sync"); + // move the model to the end of the LRU list + lru_list.erase(lru_list.begin() + lru_list_index); + lru_list.push_back(model_id); + // update the LRU hashtable + lru_hashtable[model_id] = lru_list.size() - 1; + // get memory slot + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && + "PEFT Memory Manager peft2mem_slot is out 
of sync"); + *cache_miss = false; + } else { + // cache miss + // check if you need to evict + bool need_to_evict = lru_list.size() == max_concurrent_adapters; + int mem_slot = -1; + if (need_to_evict) { + // evict the least recently used model + PEFTModelID lru_model_id = lru_list[0]; + lru_list.erase(lru_list.begin()); + lru_hashtable.erase(lru_model_id); + mem_slot = peft2mem_slot[lru_model_id]; + peft2mem_slot.erase(lru_model_id); } else { - // cache miss - // check if you need to evict - bool need_to_evict = lru_list.size() == max_concurrent_adapters; - int mem_slot = -1; - if (need_to_evict) { - // evict the least recently used model - PEFTModelID lru_model_id = lru_list[0]; - lru_list.erase(lru_list.begin()); - lru_hashtable.erase(lru_model_id); - mem_slot = peft2mem_slot[lru_model_id]; - peft2mem_slot.erase(lru_model_id); - } else { - mem_slot = lru_list.size(); - } - // update the LRU list and hashtable - lru_list.push_back(model_id); - lru_hashtable[model_id] = lru_list.size() - 1; - // update the memory slot - peft2mem_slot[model_id] = mem_slot; - *cache_miss = true; + mem_slot = lru_list.size(); } - assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && "PEFT Memory Manager peft2mem_slot is out of sync"); - int slot = peft2mem_slot[model_id]; - assert(slot >= 0 && slot < max_concurrent_adapters && "PEFT Memory Manager peft2mem_slot is out of bounds"); - return slot; + // update the LRU list and hashtable + lru_list.push_back(model_id); + lru_hashtable[model_id] = lru_list.size() - 1; + // update the memory slot + peft2mem_slot[model_id] = mem_slot; + *cache_miss = true; + } + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && + "PEFT Memory Manager peft2mem_slot is out of sync"); + int slot = peft2mem_slot[model_id]; + assert(slot >= 0 && slot < max_concurrent_adapters && + "PEFT Memory Manager peft2mem_slot is out of bounds"); + return slot; } template @@ -160,138 +166,152 @@ void load_peft_from_file(DT *ptr, in.close(); } -void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, LoraLinearConfig const &lora_config) { - // Load weights - assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && "PEFT Memory Manager weight ptr null"); - int w0_num_elements = lora_config.rank * in_dim; - int w1_num_elements = lora_config.rank * out_dim; - // values below represent total weight sizes before sharding. Lora B is not - // sharded. - int lora_A_num_rows = in_dim * num_shards; - int lora_A_num_cols = lora_config.rank; - int lora_B_num_rows = lora_config.rank; - int lora_B_num_cols = out_dim; - int lora_A_num_shards = num_shards; - int lora_B_num_shards = 1; - if (lora_config.init_lora_weights) { - // initialize weights randomly - int seed = 0; - init_peft_weight_wrapper(weight, in_dim, out_dim, lora_config.rank, dt, seed); +void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, + LoraLinearConfig const &lora_config) { + // Load weights + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && + "PEFT Memory Manager weight ptr null"); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. 
+ int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = lora_config.rank; + int lora_B_num_rows = lora_config.rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + if (lora_config.init_lora_weights) { + // initialize weights randomly + int seed = 0; + init_peft_weight_wrapper( + weight, in_dim, out_dim, lora_config.rank, dt, seed); + } else { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? "full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); } else { - // load weights from file - std::string weights_folder_filepath = join_path({ - lora_config.cache_folder, - "weights", - lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_A.weight"}); - std::string w1_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_B.weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else { - assert(false && "Data type not supported"); - } + assert(false && "Data type not supported"); } + } } -LoraLinearWeight PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { - assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); - bool cache_miss; - int mem_slot = get_inference_peft_slot(model_id, &cache_miss); - int w0_num_elements = lora_config.rank * in_dim; - int data_size = data_type_size(dt); - LoraLinearWeight result; - result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; - result.w1_ptr = static_cast(result.w0_ptr) + w0_num_elements * data_size; - if (cache_miss) { - load_peft_model(result, lora_config); - } - return result; +LoraLinearWeight + PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + int mem_slot = get_inference_peft_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; + result.w1_ptr = + static_cast(result.w0_ptr) + w0_num_elements * data_size; + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; } -LoraLinearWeight PEFTMemoryManager::get_finetuning_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { - assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); - bool cache_miss; - get_finetuning_slot(model_id, &cache_miss); - int w0_num_elements = lora_config.rank * in_dim; - int w1_num_elements = lora_config.rank * out_dim; - 
int data_size = data_type_size(dt); - LoraLinearWeight result; - result.w0_ptr = finetuning_ptr; - result.w1_ptr = static_cast(result.w0_ptr)+ w0_num_elements*data_size; - result.w0_grad_ptr = static_cast(result.w1_ptr) + w1_num_elements*data_size; - result.w1_grad_ptr = static_cast(result.w0_grad_ptr) + w0_num_elements*data_size; - result.w0_v_values_ptr = static_cast(result.w1_grad_ptr) + w1_num_elements*data_size; - result.w1_v_values_ptr = static_cast(result.w0_v_values_ptr) + w0_num_elements*data_size; - result.input_activation = static_cast(result.w1_v_values_ptr) + w1_num_elements*data_size; // max_peft_tokens*in_dim - result.low_rank_activation = static_cast(result.input_activation) + max_peft_tokens*in_dim*data_size; // max_peft_tokens*rank - if (cache_miss) { - load_peft_model(result, lora_config); - } - return result; +LoraLinearWeight PEFTMemoryManager::get_finetuning_peft( + PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + get_finetuning_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = finetuning_ptr; + result.w1_ptr = + static_cast(result.w0_ptr) + w0_num_elements * data_size; + result.w0_grad_ptr = + static_cast(result.w1_ptr) + w1_num_elements * data_size; + result.w1_grad_ptr = + static_cast(result.w0_grad_ptr) + w0_num_elements * data_size; + result.w0_v_values_ptr = + static_cast(result.w1_grad_ptr) + w1_num_elements * data_size; + result.w1_v_values_ptr = + static_cast(result.w0_v_values_ptr) + w0_num_elements * data_size; + result.input_activation = + static_cast(result.w1_v_values_ptr) + + w1_num_elements * data_size; // max_peft_tokens*in_dim + result.low_rank_activation = + static_cast(result.input_activation) + + max_peft_tokens * in_dim * data_size; // max_peft_tokens*rank + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; } -LoraLinearWeight PEFTMemoryManager::get_peft(PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { - if (lora_config.trainable) { - return get_finetuning_peft(model_id, lora_config); - } else { - return get_inference_peft(model_id, lora_config); - } +LoraLinearWeight + PEFTMemoryManager::get_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config) { + if (lora_config.trainable) { + return get_finetuning_peft(model_id, lora_config); + } else { + return get_inference_peft(model_id, lora_config); + } } void PEFTMemoryManager::check_ft_model_id(PEFTModelID const &model_id) { - assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); + assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); } }; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu index bc9ab443cb..3c4ea91db3 100644 --- a/src/runtime/peft_weight_allocator.cu +++ b/src/runtime/peft_weight_allocator.cu @@ -1,60 +1,70 @@ #include "flexflow/ops/kernels/decompress_kernels.h" -#include "flexflow/utils/peft_weight_allocator.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/peft_weight_allocator.h" #include #include namespace FlexFlow { template -void lora_init_kernel(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, int seed, cudaStream_t stream) { - // Initialize generator - std::mt19937 
gen(seed); +void lora_init_kernel(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + int seed, + cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); - // Get handle to weights by iterating over m->model_state to get each - // LoraLinearWeight object - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; - // LoRA_A weight: [in_dim, rank] - float stdv_lora_a = 1.0f / sqrt(in_dim); - std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); - std::vector
lora_a_random_init(w0_num_elements); - for (auto &num : lora_a_random_init) { - float num_float = dis_lora_a(gen); - if (std::is_same::value) { - num = __float2half(num_float); - } else { - num = num_float; - } + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same::value) { + num = __float2half(num_float); + } else { + num = num_float; } - checkCUDA(cudaMemcpyAsync(static_cast
(weight.w0_ptr), - lora_a_random_init.data(), - w0_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); + } + checkCUDA(cudaMemcpyAsync(static_cast
(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); - // LoRA_B weight: [rank, out_dim] - float stdv_lora_b = 1.0f / sqrt(rank); - std::uniform_real_distribution dis_lora_b(-stdv_lora_b, stdv_lora_b); - std::vector lora_b_random_init(w1_num_elements); - for (auto &num : lora_b_random_init) { - float num_float = dis_lora_b(gen); - if (std::is_same::value) { - num = __float2half(num_float); - } else { - num = num_float; - } + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(rank); + std::uniform_real_distribution dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same::value) { + num = __float2half(num_float); + } else { + num = num_float; } - checkCUDA(cudaMemcpyAsync(static_cast
(weight.w1_ptr), - lora_b_random_init.data(), - w1_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); + } + checkCUDA(cudaMemcpyAsync(static_cast
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); } -void init_peft_weight_wrapper(LoraLinearWeight const &weight, int in_dim, int out_dim, int rank, DataType dt, int seed) { +void init_peft_weight_wrapper(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + DataType dt, + int seed) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2377a4f938..9986fc2274 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -256,16 +256,16 @@ size_t RequestManager::get_num_ssms() { } void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, - LoraLinearConfig const &peft_config) { + LoraLinearConfig const &peft_config) { // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - // peft_configs[peft_model_id] = std::move(peft_config); - peft_configs.emplace(peft_model_id, std::move(peft_config)); + peft_configs[peft_model_id] = LoraLinearConfig::deserialize_from_json_string( + peft_config.serialize_to_json_string()); } -LoraLinearConfig const &RequestManager::get_peft_config( - PEFTModelID const &peft_model_id) { +LoraLinearConfig const & + RequestManager::get_peft_config(PEFTModelID const &peft_model_id) { assert(peft_configs.find(peft_model_id) != peft_configs.end() && "PEFT model ID not found"); return peft_configs[peft_model_id]; @@ -279,13 +279,16 @@ void RequestManager::set_max_concurrent_adapters(int max_concurrent_adapters_) { max_concurrent_adapters = max_concurrent_adapters_; } -int RequestManager::get_max_lora_rank() { return max_lora_rank; } +int RequestManager::get_max_lora_rank() { + return max_lora_rank; +} int RequestManager::get_max_concurrent_adapters() { return max_concurrent_adapters; } -PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { +PEFTModelID * + FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); if (peft_config.target_modules.size() == 0) { @@ -293,16 +296,21 @@ PEFTModelID *FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) std::cout << peft_config << std::endl; assert(false); } - // go over base_layer_to_peft_layer and check that you can find at least one match - for (int i=0; i 0 && std::string(base_layer.name).find(peft_config.target_modules[0]) != std::string::npos) { + for (auto const &pair : base_layer_to_peft_layer) { + Layer *base_layer = pair.first; + if (base_layer->name != nullptr && strlen(base_layer->name) > 0 && + std::string(base_layer->name).find(peft_config.target_modules[0]) != + std::string::npos) { found = true; break; } } - assert(found && "Attempting to add LoRA to a LLM target module that does not exist or does not support LoRA"); + assert(found && "Attempting to add LoRA to a LLM target module that does " + "not exist or does not support LoRA"); } PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); RequestManager *rm = RequestManager::get_request_manager(); @@ -787,11 +795,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - // new_bc.requestsInfo[i].peft_model_id = - // 
old_bc.requestsInfo[i].peft_model_id; - new_bc.requestsInfo[i].peft_adapters = - old_bc.requestsInfo[i].peft_adapters; - num_concurrent_adapters += new_bc.requestsInfo[i].peft_adapters.size(); + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_model_config = + old_bc.requestsInfo[i].peft_model_config; + if (old_bc.requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) { + num_concurrent_adapters += 1; + } new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; @@ -853,12 +863,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); assert(new_request.req_type == RequestType::REQ_INFERENCE); - - // if the request has peft adapters and we are at capacity, don't add it yet - if (new_request.peft_model_id != PEFTModelID::NO_ID && num_concurrent_adapters == get_max_concurrent_adapters()) { + + // if the request has peft adapters and we are at capacity, don't add it + // yet + if (new_request.peft_model_id != PEFTModelID::NO_ID && + num_concurrent_adapters == get_max_concurrent_adapters()) { break; } - + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ -869,9 +881,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_length = new_request.max_length; - // new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; if (new_request.peft_model_id != PEFTModelID::NO_ID) { - new_bc.requestsInfo[i].peft_adapters[new_request.peft_model_id] = get_peft_config(new_request.peft_model_id).serialize_to_json_string(); + new_bc.requestsInfo[i].peft_model_config = + get_peft_config(new_request.peft_model_id) + .serialize_to_json_string(); } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -1027,7 +1041,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); assert(num_peft_label_tokens == 0); - if (num_peft_tokens > 0 && num_concurrent_adapters < get_max_concurrent_adapters()) { + if (num_peft_tokens > 0 && + num_concurrent_adapters < get_max_concurrent_adapters()) { assert(new_bc.request_completed[inference_batch_size]); // request info new_bc.request_completed[inference_batch_size] = false; @@ -1039,9 +1054,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, num_peft_tokens; new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; - // new_bc.requestsInfo[inference_batch_size].peft_model_id = - // request.peft_model_id; - new_bc.requestsInfo[inference_batch_size].peft_adapters[request.peft_model_id] = get_peft_config(request.peft_model_id).serialize_to_json_string(); + new_bc.requestsInfo[inference_batch_size].peft_model_id = + request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_model_config = + get_peft_config(request.peft_model_id).serialize_to_json_string(); new_bc.requestsInfo[inference_batch_size].peft_bwd = true; set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, @@ -1060,7 +1076,7 
@@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens++; new_bc.num_peft_tokens++; } - num_concurrent_adapters +=1; + num_concurrent_adapters += 1; } } assert(num_concurrent_adapters <= get_max_concurrent_adapters() && From 92c2c374e0d287d105489d9c009c1acc214e8f78 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 04:21:07 +0000 Subject: [PATCH 19/37] fix --- .../ops/kernels/lora_linear_kernels.h | 9 +++-- include/flexflow/ops/lora_linear_params.h | 40 ++++--------------- include/flexflow/request_manager.h | 4 +- inference/peft/peft.cc | 13 +++--- inference/peft/peft_bwd_benchmark.cc | 8 ++-- inference/peft/peft_fwd_benchmark.cc | 8 ++-- inference/peft/req_rate_benchmark.cc | 6 +-- src/ops/kernels/lora_linear_kernels.cu | 5 ++- src/ops/lora_linear_params.cc | 29 ++++++-------- src/runtime/request_manager.cc | 6 +-- 10 files changed, 52 insertions(+), 76 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index dfff2ec5c5..7138f62e90 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -10,6 +10,9 @@ namespace FlexFlow { +using Legion::Context; +using Legion::Runtime; + #ifdef DEADCODE struct LoraLinearModelState { LoraLinearWeight weights; @@ -40,7 +43,7 @@ namespace LoraLinear { bool lora_applies_to_this_layer(LoraLinearMeta *m, LoraLinearConfig const &config); -void init_kernel_wrapper(LoraLinearMeta *m, int seed); +// void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, @@ -53,8 +56,8 @@ void peft_bwd_kernel_wrapper(Context ctx, GenericTensorAccessorR const &output_grad); namespace Internal { -template -void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); +// template +// void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 1dfe5f17bd..46b88c9690 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -19,8 +19,8 @@ class LoraOptimizerConfig { LoraOptimizerConfig(); virtual std::string getType() const = 0; virtual nlohmann::json toJson() const = 0; - static std::unique_ptr fromJson(nlohmann::json const &j); - virtual ~LoraOptimizerConfig() = default; + static LoraOptimizerConfig *fromJson(nlohmann::json const &j); + virtual ~LoraOptimizerConfig() {} }; class LoraSGDOptimizerConfig : public LoraOptimizerConfig { @@ -32,15 +32,11 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig { bool weight_decay_ = 0.0f); friend std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc); - std::string getType() const override { return "SGD"; } - nlohmann::json toJson() const override; - - static std::unique_ptr - fromJson(nlohmann::json const &j); + static LoraSGDOptimizerConfig *fromJson(nlohmann::json const &j); public: double lr = 0.001f; @@ -63,11 +59,8 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { std::string getType() const override { return "Adam"; } - nlohmann::json toJson() const override; - - static std::unique_ptr - fromJson(nlohmann::json const &j); + static LoraAdamOptimizerConfig *fromJson(nlohmann::json const &j); public: // Adam @@ -94,29 +87,11 @@ class LoraLinearConfig { 
std::vector const &target_modules_ = {}); // constructor used to support std::unordered_map LoraLinearConfig(); - - // Method to set optimizer - template - void setOptimizer(T &&opt) { - if constexpr (std::is_base_of_v>) { - optimizer_config = - std::make_unique>(std::forward(opt)); - } else if constexpr (std::is_same_v, - std::remove_reference_t>) { - optimizer_config = std::move(opt); - } else { - static_assert(always_false, "Unsupported optimizer type"); - } - } - // Helper template for static_assert - template - static inline constexpr bool always_false = false; - friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); + std::string serialize_to_json_string(int indent = -1) const; void serialize_to_json_file(std::string const &filename) const; // Deserialization method @@ -138,8 +113,7 @@ class LoraLinearConfig { // whether the weights are trainable (fine-tuning scenario) or not // (inference-only). If set to true, allocate space for the gradients bool trainable = false; - // LoraOptimizerConfig *optimizer_config; - std::unique_ptr optimizer_config; + LoraOptimizerConfig *optimizer_config; // whether to initialize weights randomly (instead of attempting to load them // from file) bool init_lora_weights; @@ -170,4 +144,4 @@ struct hash { }; } // namespace std -#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H \ No newline at end of file diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 39f213752e..3b4e8c4c8d 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -150,8 +150,8 @@ class RequestManager { std::vector eos_token_ids, std::string const &path); void register_output_filepath(std::string const &); - void register_peft_config(PEFTModelID const &peft_model_id, - LoraLinearConfig const &peft_config); + void set_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config); LoraLinearConfig const &get_peft_config(PEFTModelID const &peft_model_id); void set_max_lora_rank(int max_lora_rank); void set_max_concurrent_adapters(int max_concurrent_adapters); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 0ab0b62ee8..af9e5743c7 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -320,18 +320,19 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Add PEFT layer + // Start background server + rm->start_background_server(&model); + + // Add PEFT adapter(s) PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); if (enable_peft_finetuning) { - peft_model_id_finetuning = model.add_lora_layer(peft_config_finetuning); + peft_model_id_finetuning = + model.register_peft_adapter(peft_config_finetuning); } } - // Start background server - rm->start_background_server(&model); - // Run workload { std::vector requests; diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 85e97ec4e8..9da4fa1994 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -304,15 +304,15 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Start background server + rm->start_background_server(&model); + // Add PEFT layer 
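// Illustrative sketch (editorial aside, not part of this patch): with the
// reworked API, adapters are registered after rm->start_background_server()
// has been called, and a trainable config carries a raw optimizer pointer.
// The constructor argument order below mirrors deserialize_from_json_string()
// earlier in this series; the cache path, adapter name, precision string, and
// hyperparameter values are placeholders, not values taken from the patch.
LoraSGDOptimizerConfig *sgd = new LoraSGDOptimizerConfig();
sgd->lr = 0.001f; // placeholder learning rate
LoraLinearConfig peft_config_ft(
    /*cache_folder=*/"/tmp/ff_cache",                    // placeholder path
    /*peft_model_id=*/"my-org/my-lora-adapter",          // placeholder name
    /*trainable=*/true,
    /*optimizer_config=*/nullptr,                        // attached just below
    /*init_lora_weights=*/true,
    /*base_model_name_or_path=*/"my-org/my-base-model",  // placeholder
    /*precision=*/"fp16",                                // placeholder
    /*rank=*/16,
    /*lora_alpha=*/16,
    /*lora_dropout=*/0.0f,
    /*target_modules=*/{"down_proj"});
peft_config_ft.optimizer_config = sgd;
PEFTModelID *ft_adapter_id = model.register_peft_adapter(peft_config_ft);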
PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - // Start background server - rm->start_background_server(&model); - // Warmup stage { std::vector requests; diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 87322a42dd..3274f2e535 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -304,15 +304,15 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Start background server + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - // Start background server - rm->start_background_server(&model); - // Run workload { std::vector requests; diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index ffa77478e1..8a94f6e68b 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -366,14 +366,14 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - rm->start_background_server(&model); - // Warmup stage { std::vector requests; diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 28a2d6b23e..eb2a472ee3 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -35,6 +35,7 @@ LoraLinearMeta::~LoraLinearMeta(void) {} namespace Kernels { namespace LoraLinear { +#ifdef DEADCODE void init_kernel_wrapper(LoraLinearMeta *m, int seed) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -47,6 +48,7 @@ void init_kernel_wrapper(LoraLinearMeta *m, int seed) { assert(false && "Unsupported data type"); } } +#endif void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, @@ -314,7 +316,6 @@ void inference_kernel(LoraLinearMeta *m, DT *output_ptr, int in_dim, int out_dim, - int num_shards, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -593,7 +594,7 @@ void peft_bwd_kernel(Context ctx, if (lora_config.optimizer_config->getType() == "SGD") { LoraSGDOptimizerConfig const *sgd_config = static_cast( - lora_config.optimizer_config.get()); + lora_config.optimizer_config); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<< - LoraOptimizerConfig::fromJson(nlohmann::json const &j) { +LoraOptimizerConfig *LoraOptimizerConfig::fromJson(nlohmann::json const &j) { std::string type = j["type"]; if (type == "SGD") { return LoraSGDOptimizerConfig::fromJson(j); @@ -50,9 +49,9 @@ nlohmann::json LoraSGDOptimizerConfig::toJson() const { {"weight_decay", weight_decay}}; } -std::unique_ptr +LoraSGDOptimizerConfig * LoraSGDOptimizerConfig::fromJson(nlohmann::json const &j) { - auto sgd = std::make_unique(); + LoraSGDOptimizerConfig *sgd = new LoraSGDOptimizerConfig(); sgd->lr = j["lr"]; sgd->momentum = j["momentum"]; sgd->nesterov = j["nesterov"]; @@ -89,9 +88,9 @@ nlohmann::json 
LoraAdamOptimizerConfig::toJson() const { {"epsilon", epsilon}}; } -std::unique_ptr<LoraAdamOptimizerConfig> +LoraAdamOptimizerConfig * LoraAdamOptimizerConfig::fromJson(nlohmann::json const &j) { - auto adam = std::make_unique<LoraAdamOptimizerConfig>(); + LoraAdamOptimizerConfig *adam = new LoraAdamOptimizerConfig(); adam->alpha = j["alpha"]; adam->beta1 = j["beta1"]; adam->beta2 = j["beta2"]; @@ -220,12 +219,11 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "trainable: " << llc.trainable << ", "; if (llc.optimizer_config != nullptr) { os << "optimizer_config: "; - if (llc.optimizer_config.get()->getType() == "SGD") { - os << *static_cast<LoraSGDOptimizerConfig const *>( - llc.optimizer_config.get()); - } else if (llc.optimizer_config.get()->getType() == "Adam") { - os << *static_cast<LoraAdamOptimizerConfig const *>( - llc.optimizer_config.get()); + if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + os << *static_cast<LoraSGDOptimizerConfig const *>(llc.optimizer_config); + } else if (typeid(*llc.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + os << *static_cast<LoraAdamOptimizerConfig const *>(llc.optimizer_config); } else { os << "Unknown optimizer config type"; } @@ -248,8 +246,6 @@ std::string LoraLinearConfig::serialize_to_json_string(int indent) const { {"init_lora_weights", init_lora_weights}, {"base_model_name_or_path", base_model_name_or_path}, {"precision", precision}, - // {"optimizer_config", optimizer_config ? - // optimizer_config->toJson() : nullptr} {"optimizer_config", optimizer_config ? nlohmann::json(optimizer_config->toJson()) @@ -282,7 +278,8 @@ LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( j["lora_dropout"].get<float>(), j["target_modules"].get<std::vector<std::string>>()); if (!j["optimizer_config"].is_null()) { - config.setOptimizer(LoraOptimizerConfig::fromJson(j["optimizer_config"])); + config.optimizer_config = + LoraOptimizerConfig::fromJson(j["optimizer_config"]); } return config; } @@ -296,4 +293,4 @@ LoraLinearConfig return deserialize_from_json_string(j); } -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a65be9984c..db8f6b0042 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -263,8 +263,8 @@ size_t RequestManager::get_num_ssms() { return ssm_models.size(); } -void RequestManager::register_peft_config(PEFTModelID const &peft_model_id, - LoraLinearConfig const &peft_config) { +void RequestManager::set_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config) { // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); @@ -322,7 +322,7 @@ PEFTModelID * } PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); RequestManager *rm = RequestManager::get_request_manager(); - rm->register_peft_config(*peft_model_id, peft_config); + rm->set_peft_config(*peft_model_id, peft_config); return peft_model_id; } From fbdf74e0b4e67c905f7cb4acb4067c255d2608b0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 16:15:49 +0000 Subject: [PATCH 20/37] updates --- include/flexflow/config.h | 4 - include/flexflow/request_manager.h | 4 +- inference/python/chat.py | 1 - inference/python/ff_peft.py | 1 - inference/python/incr_decoding.py | 1 - inference/python/peft_demo/demo.ipynb | 2 - inference/python/peft_demo/demo.py | 1 - inference/python/spec_infer.py | 1 - inference/python/streamlit/fastapi_incr.py | 1 - python/flexflow/core/__init__.py | 1 - python/flexflow/serve/__init__.py | 9 -
src/ops/lora_linear.cc | 159 ------------------ src/runtime/model.cc | 7 - src/runtime/model.cu | 30 +--- .../python_test_configs/generate_configs.py | 1 - 15 files changed, 4 insertions(+), 219 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index dd9d657117..37afa0df27 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -104,8 +104,6 @@ struct FFHandler { // PEFT related fields MemoryAllocator *peft_activation_allocator; size_t peft_activation_reserve_space_size; - PEFTWeightAllocator *peft_weight_allocator; - size_t peft_weight_reserve_space_size; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; @@ -118,7 +116,6 @@ struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; size_t peft_activation_reserve_space_size; - size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -179,7 +176,6 @@ class FFConfig { // PEFT related fields bool enable_peft; size_t peft_activation_reserve_space_size; - size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 3b4e8c4c8d..e4a8f57900 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -300,8 +300,8 @@ class RequestManager { // peft std::unordered_map peft_configs; - int max_lora_rank; - int max_concurrent_adapters; + int max_lora_rank=0; + int max_concurrent_adapters=0; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/inference/python/chat.py b/inference/python/chat.py index 13ece116a6..70b8ee0067 100644 --- a/inference/python/chat.py +++ b/inference/python/chat.py @@ -36,7 +36,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 13da7aee20..35338f5227 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": True, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": True, "fusion": False, diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 232ef1699c..4bb6892a6b 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb index dfb5193a1d..d29ad5ad2f 100644 --- a/inference/python/peft_demo/demo.ipynb +++ b/inference/python/peft_demo/demo.ipynb @@ -91,7 +91,6 @@ " \"use_8bit_quantization\": False,\n", " \"enable_peft\": True,\n", " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", - " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", " \"profiling\": False,\n", " \"inference_debugging\": False,\n", " \"fusion\": False,\n", @@ -1773,7 +1772,6 @@ " 
\"use_8bit_quantization\": False,\n", " \"enable_peft\": True,\n", " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", - " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", " \"profiling\": False,\n", " \"inference_debugging\": False,\n", " \"fusion\": False,\n", diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py index 9e01b4645b..34b15b9a76 100644 --- a/inference/python/peft_demo/demo.py +++ b/inference/python/peft_demo/demo.py @@ -47,7 +47,6 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data "use_8bit_quantization": False, "enable_peft": True, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": False, diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 7ae752cffc..8cf96c1eba 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py index 622f50008e..a1095e13dc 100644 --- a/inference/python/streamlit/fastapi_incr.py +++ b/inference/python/streamlit/fastapi_incr.py @@ -91,7 +91,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index b8ed15eaea..52fe331bf3 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -91,7 +91,6 @@ "use_8bit_quantization": "--8bit-quantization", "enable_peft": "-enable-peft", "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", - "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index fd29080a6a..55044d1838 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -55,7 +55,6 @@ def init( use_8bit_quantization: Optional[bool] = None, enable_peft: Optional[bool] = None, peft_activation_reserve_space_size: Optional[int] = None, - peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, @@ -86,7 +85,6 @@ def init( - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - enable_peft: whether to enable the use of PEFT, defaults to False - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB - - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -125,8 +123,6 @@ def init( :type enable_peft: Optional[bool], optional :param peft_activation_reserve_space_size: the space (in MB) to 
reserve on GPU for PEFT activations, default to 1 GB :type peft_activation_reserve_space_size: Optional[int], optional - :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False @@ -158,7 +154,6 @@ def init( use_8bit_quantization is not None, enable_peft is not None, peft_activation_reserve_space_size is not None, - peft_weight_reserve_space_size is not None, profiling is not None, benchmarking is not None, inference_debugging is not None, @@ -187,7 +182,6 @@ def init( "use_8bit_quantization": use_8bit_quantization, "enable_peft": enable_peft, "peft_activation_reserve_space_size": peft_activation_reserve_space_size, - "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "benchmarking": benchmarking, "inference_debugging": inference_debugging, @@ -210,7 +204,6 @@ def init( "pipeline_parallelism_degree", "offload_reserve_space_size", "peft_activation_reserve_space_size", - "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -238,8 +231,6 @@ def init( configs_dict["enable_peft"] = False if configs_dict.get("peft_activation_reserve_space_size", None) is None: configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 - if configs_dict.get("peft_weight_reserve_space_size", None) is None: - configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("benchmarking", None) is None: diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 23e493b6bd..c61e3f94ac 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -492,166 +492,7 @@ OpMeta *LoraLinear::init_task(Task const *task, return m; } -#ifdef DEADCODE -void load_peft_adapters(BatchConfig const *bc) { - for (auto const &kv : bc->peft_configs) { - PEFTModelID const &model_id = kv.first; - LoraLinearConfig const &lora_config = kv.second; - - int rank = lora_config.rank; - - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; - // values below represent total weight sizes before sharding. Lora B is not - // sharded. - int lora_A_num_rows = in_dim * num_shards; - int lora_A_num_cols = rank; - int lora_B_num_rows = rank; - int lora_B_num_cols = out_dim; - int lora_A_num_shards = num_shards; - int lora_B_num_shards = 1; - - LoraLinearWeight weight; - weight.in_dim = in_dim; - weight.out_dim = out_dim; - weight.rank = rank; - weight.num_shards = num_shards; - PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - - if (!lora_config.init_lora_weights) { - // load weights from file - std::string weights_folder_filepath = join_path({ - lora_config.cache_folder, - "weights", - lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_A.weight"}); - std::string w1_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_B.weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else { - assert(false && "Data type not supported"); - } - } else { - // initialize weights - int seed = 0; - init_kernel_wrapper(m, seed); - } - // allocate space for gradients if the LoRA layer is trainable - if (lora_config.trainable) { - // Ensure we have an optimizer - assert(lora_config.optimizer_config != nullptr && "Optimizer not set"); - assert(typeid(*lora_config.optimizer_config) != - typeid(LoraOptimizerConfig) && - "Optimizer config is not a subclass of LoraOptimizerConfig"); - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - // Input is partitioned (no replication) - // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } else { - // Input is replicated - // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } - // allocate space for v_values if needed by optimizer - if (typeid(*lora_config.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - lora_config.optimizer_config); - if (sgd_config->momentum > 0.0f) { - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - weight.w0_v_values_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_v_values_ptr = 
allocator->allocate_sync_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } else { - weight.w0_v_values_ptr = allocator->allocate_sync_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_v_values_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } - } - } else if (typeid(*lora_config.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - assert(false && "Adam optim not yet implemented"); - } else { - assert(false && "Optimizer not supported"); - } - } - assert(m->model_state.find(model_id) == m->model_state.end()); - m->model_state[model_id].weights = weight; - m->model_state[model_id].optimizer_config = lora_config.optimizer_config; - m->model_state[model_id].lora_alpha = lora_config.lora_alpha; - m->model_state[model_id].cache_folder = lora_config.cache_folder; - m->model_state[model_id].peft_model_id = lora_config.peft_model_id; - } -} -#endif void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 417cd2c056..de798890ef 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1550,8 +1550,6 @@ FFRuntime::FFRuntime(FFConfig &config) { config.cpu_offload ? config.offload_reserve_space_size : 0; info.peft_activation_reserve_space_size = config.enable_peft ? config.peft_activation_reserve_space_size : 0; - info.peft_weight_reserve_space_size = - config.enable_peft ? config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -4400,7 +4398,6 @@ FFConfig::FFConfig() { enable_peft = DefaultConfig::enablePeft; peft_activation_reserve_space_size = DefaultConfig::peftActivationReserveSpaceSize; - peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; @@ -4535,10 +4532,6 @@ void FFConfig::parse_args(char **argv, int argc) { peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; continue; } - if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { - peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; - continue; - } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 136ce99edd..6a166835d6 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -168,7 +168,7 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } -#ifdef DEADCODE +// #ifdef DEADCODE if (info->peft_activation_reserve_space_size > 0) { // allocate memory for peft activation reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -182,33 +182,7 @@ FFHandler } else { handle.peft_activation_allocator = nullptr; } - - if (info->peft_weight_reserve_space_size > 0) { - // allocate memory for peft weight reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance workspaceInst; - 
Realm::RegionInstance::create_instance(workspaceInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); - handle.peft_weight_allocator = - new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); - } else { - handle.peft_weight_allocator = nullptr; - } -#endif +// #endif // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 2720304d4f..4f7929e2db 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -19,7 +19,6 @@ "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, From 10fb496f780e24abb248b867ada989e0d1b8f5d5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 20:53:42 +0000 Subject: [PATCH 21/37] fix --- include/flexflow/batch_config.h | 6 +++++- include/flexflow/request_manager.h | 3 ++- inference/peft/peft.cc | 1 + src/ops/kernels/lora_linear_kernels.cu | 10 ++++------ src/ops/lora_linear.cc | 15 ++++++-------- src/ops/lora_linear_params.cc | 27 +++++++++++++++++++------- src/runtime/request_manager.cc | 17 ++++++++-------- tests/peft_test.sh | 4 ++-- 8 files changed, 49 insertions(+), 34 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 44d829a7f7..2fb9413ae9 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,6 +20,8 @@ #include "legion.h" #include #include +#include + // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 @@ -79,6 +81,7 @@ class BatchConfig { static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; + static int const MAX_PEFT_CONFIG_SIZE = 1024; // Set by update @@ -99,6 +102,7 @@ class BatchConfig { batch_config_request_id = -1; peft_bwd = false; optimizer_tasks = {true, false, false, false}; + std::memset(peft_model_config_str, 0, MAX_PEFT_CONFIG_SIZE); } int first_token_depth_in_request; int first_token_offset_in_batch; @@ -111,7 +115,7 @@ class BatchConfig { RequestGuid request_guid; // PEFT fields PEFTModelID peft_model_id; - std::string peft_model_config; + char peft_model_config_str[MAX_PEFT_CONFIG_SIZE]; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index e4a8f57900..d5e67d0c66 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -189,6 +189,7 @@ class RequestManager { bool is_eos_token(int token_id); bool check_inf_req_completion(BatchConfig const &old_bc, int i); void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); + void add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -300,7 +301,7 @@ class RequestManager { // peft std::unordered_map peft_configs; - int max_lora_rank=0; + int max_lora_rank=32; int max_concurrent_adapters=0; // peft benchmarking bool enable_peft_finetuning = false; diff --git 
a/inference/peft/peft.cc b/inference/peft/peft.cc index af9e5743c7..96dd3a0562 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -275,6 +275,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_requests_per_batch( max_requests_per_batch + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_concurrent_adapters(max_requests_per_batch + (int)enable_peft_finetuning); rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index eb2a472ee3..e50805b6ca 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -335,9 +335,8 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -463,9 +462,8 @@ void peft_bwd_kernel(Context ctx, !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index c61e3f94ac..3735aefc01 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -619,9 +619,8 @@ void LoraLinear::inference_task(Task const *task, bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -777,9 +776,8 @@ void lora_inference_debugging(LoraLinearMeta *m, !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -911,9 +909,8 @@ void save_peft_weights_if_needed(LoraLinearMeta *m, !bc->requestsInfo[i].peft_bwd) { continue; } - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string( - bc->requestsInfo[i].peft_model_config); + std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 21648089da..4eb59bc53f 
100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -235,12 +235,23 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { return os; } +double ToThreeDecimalPlaces(float f) { + double d = static_cast(f); + int i; + if (d >= 0) { + i = static_cast(d * 1000 + 0.5); + } else { + i = static_cast(d * 1000 - 0.5); + } + return (i / 1000.0); +} + std::string LoraLinearConfig::serialize_to_json_string(int indent) const { nlohmann::json j = {{"cache_folder", cache_folder}, {"peft_model_id", peft_model_id}, {"rank", rank}, - {"lora_alpha", lora_alpha}, - {"lora_dropout", lora_dropout}, + {"lora_alpha", ToThreeDecimalPlaces(lora_alpha)}, + {"lora_dropout", ToThreeDecimalPlaces(lora_dropout)}, {"target_modules", target_modules}, {"trainable", trainable}, {"init_lora_weights", init_lora_weights}, @@ -264,12 +275,18 @@ void LoraLinearConfig::serialize_to_json_file( // Deserialization method LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( std::string const &json_string) { + // std::cout << "Attempting to deserialize from JSON string: " << json_string + // << std::endl; nlohmann::json j = nlohmann::json::parse(json_string); + LoraOptimizerConfig *optimizer_config_ = nullptr; + if (!j["optimizer_config"].is_null()) { + optimizer_config_ = LoraOptimizerConfig::fromJson(j["optimizer_config"]); + } LoraLinearConfig config( j["cache_folder"].get(), j["peft_model_id"].get(), j["trainable"].get(), - nullptr, // optimizer_config will be set later if present + optimizer_config_, // optimizer_config will be set later if present j["init_lora_weights"].get(), j["base_model_name_or_path"].get(), j["precision"].get(), @@ -277,10 +294,6 @@ LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( j["lora_alpha"].get(), j["lora_dropout"].get(), j["target_modules"].get>()); - if (!j["optimizer_config"].is_null()) { - config.optimizer_config = - LoraOptimizerConfig::fromJson(j["optimizer_config"]); - } return config; } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index db8f6b0042..0bfbb7f8f4 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -691,6 +691,12 @@ void RequestManager::check_batch(BatchConfig const &old_bc, } } +void RequestManager::add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config) { + std::memset(bc.requestsInfo[req_idx].peft_model_config_str, 0, BatchConfig::MAX_PEFT_CONFIG_SIZE); + std::string peft_config_str = peft_config.serialize_to_json_string(); + std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, peft_config_str.c_str()); +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); @@ -825,8 +831,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; - new_bc.requestsInfo[i].peft_model_config = - old_bc.requestsInfo[i].peft_model_config; + std::strcpy(new_bc.requestsInfo[i].peft_model_config_str, old_bc.requestsInfo[i].peft_model_config_str); if (old_bc.requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) { num_concurrent_adapters += 1; } @@ -911,9 +916,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; if 
(new_request.peft_model_id != PEFTModelID::NO_ID) { - new_bc.requestsInfo[i].peft_model_config = - get_peft_config(new_request.peft_model_id) - .serialize_to_json_string(); + add_peft_config_to_request_info(new_bc, i, get_peft_config(new_request.peft_model_id)); } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -1084,9 +1087,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; - new_bc.requestsInfo[inference_batch_size].peft_model_config = - get_peft_config(request.peft_model_id).serialize_to_json_string(); - new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + add_peft_config_to_request_info(new_bc, inference_batch_size, get_peft_config(request.peft_model_id)); set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, request.max_training_steps, diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 5600d57edf..173fb37fd9 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -38,9 +38,9 @@ python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --sav # Python test echo "Python test" -python ./inference/python/ff_peft.py +# python ./inference/python/ff_peft.py # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 2 +# python ./tests/peft/peft_alignment_test.py -tp 2 # C++ test echo "C++ test" From 79dc3a2b4666020de0d052f2bbc354900cc4e8cd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 21:02:46 +0000 Subject: [PATCH 22/37] fix --- include/flexflow/batch_config.h | 1 - include/flexflow/request_manager.h | 8 +++++--- inference/peft/peft.cc | 3 ++- src/ops/kernels/lora_linear_kernels.cu | 12 ++++++++---- src/ops/lora_linear.cc | 20 ++++++++++++-------- src/runtime/model.cu | 4 ++-- src/runtime/request_manager.cc | 20 ++++++++++++++------ 7 files changed, 43 insertions(+), 25 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 2fb9413ae9..bbcfdb32fc 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -22,7 +22,6 @@ #include #include - // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 // #define BATCH_SIZE 16 diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index d5e67d0c66..c15c0ff8b4 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -189,7 +189,9 @@ class RequestManager { bool is_eos_token(int token_id); bool check_inf_req_completion(BatchConfig const &old_bc, int i); void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); - void add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config); + void add_peft_config_to_request_info(BatchConfig &bc, + int req_idx, + LoraLinearConfig const &peft_config); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -301,8 +303,8 @@ class RequestManager { // peft std::unordered_map peft_configs; - int max_lora_rank=32; - int max_concurrent_adapters=0; + int max_lora_rank = 32; + int max_concurrent_adapters = 0; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 96dd3a0562..da2993187c 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc 
@@ -275,7 +275,8 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_requests_per_batch( max_requests_per_batch + (int)enable_peft_finetuning); // add one slot for finetuning if needed - rm->set_max_concurrent_adapters(max_requests_per_batch + (int)enable_peft_finetuning); + rm->set_max_concurrent_adapters(max_requests_per_batch + + (int)enable_peft_finetuning); rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index e50805b6ca..09d79809a7 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -335,8 +335,10 @@ void inference_kernel(LoraLinearMeta *m, if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -462,8 +464,10 @@ void peft_bwd_kernel(Context ctx, !bc->requestsInfo[i].peft_bwd) { continue; } - std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3735aefc01..f17f69a7c9 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -492,8 +492,6 @@ OpMeta *LoraLinear::init_task(Task const *task, return m; } - - void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); } @@ -619,8 +617,10 @@ void LoraLinear::inference_task(Task const *task, bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } - std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -776,8 +776,10 @@ void lora_inference_debugging(LoraLinearMeta *m, !bc->requestsInfo[i].peft_bwd) { continue; } - std::string peft_model_config_str = std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } @@ -909,8 +911,10 @@ void save_peft_weights_if_needed(LoraLinearMeta *m, !bc->requestsInfo[i].peft_bwd) { continue; } - std::string peft_model_config_str = 
std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); if (!lora_applies_to_this_layer(m, lora_config)) { continue; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 6a166835d6..3a250539c7 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -168,7 +168,7 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } -// #ifdef DEADCODE + // #ifdef DEADCODE if (info->peft_activation_reserve_space_size > 0) { // allocate memory for peft activation reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -183,7 +183,7 @@ FFHandler handle.peft_activation_allocator = nullptr; } // #endif - // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); +// checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; #endif diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0bfbb7f8f4..a25677b22e 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -691,10 +691,14 @@ void RequestManager::check_batch(BatchConfig const &old_bc, } } -void RequestManager::add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config) { - std::memset(bc.requestsInfo[req_idx].peft_model_config_str, 0, BatchConfig::MAX_PEFT_CONFIG_SIZE); +void RequestManager::add_peft_config_to_request_info( + BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config) { + std::memset(bc.requestsInfo[req_idx].peft_model_config_str, + 0, + BatchConfig::MAX_PEFT_CONFIG_SIZE); std::string peft_config_str = peft_config.serialize_to_json_string(); - std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, peft_config_str.c_str()); + std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, + peft_config_str.c_str()); } BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, @@ -831,7 +835,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; - std::strcpy(new_bc.requestsInfo[i].peft_model_config_str, old_bc.requestsInfo[i].peft_model_config_str); + std::strcpy(new_bc.requestsInfo[i].peft_model_config_str, + old_bc.requestsInfo[i].peft_model_config_str); if (old_bc.requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) { num_concurrent_adapters += 1; } @@ -916,7 +921,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; if (new_request.peft_model_id != PEFTModelID::NO_ID) { - add_peft_config_to_request_info(new_bc, i, get_peft_config(new_request.peft_model_id)); + add_peft_config_to_request_info( + new_bc, i, get_peft_config(new_request.peft_model_id)); } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -1085,9 +1091,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, num_peft_tokens; new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + 
new_bc.requestsInfo[inference_batch_size].peft_bwd = true; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; - add_peft_config_to_request_info(new_bc, inference_batch_size, get_peft_config(request.peft_model_id)); + add_peft_config_to_request_info( + new_bc, inference_batch_size, get_peft_config(request.peft_model_id)); set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, request.max_training_steps, From 42198061fb2970a7e40e8141fa23cc0d228dbe98 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 5 Nov 2024 21:27:53 +0000 Subject: [PATCH 23/37] fix --- src/runtime/peft_weight_allocator.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 81b412e049..2dd9a4711b 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -63,6 +63,7 @@ void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, assert(finetuning_ptr != nullptr && "PEFT Memory Manager finetuning_ptr is null"); *cache_miss = (model_id.id != finetuning_model_id.id); + finetuning_model_id = model_id; } int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, From f542fbb2690778ed8969807ed71abaadea7eada5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 6 Nov 2024 15:39:10 +0000 Subject: [PATCH 24/37] small fix --- tests/inference/inference_alignment_test.py | 2 +- tests/peft/alignment/align_test_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/inference/inference_alignment_test.py b/tests/inference/inference_alignment_test.py index 8dab7ff43b..1fe2bfbaae 100644 --- a/tests/inference/inference_alignment_test.py +++ b/tests/inference/inference_alignment_test.py @@ -361,7 +361,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE)[:,:,-1].squeeze() hf_tensor = hf_tensor.squeeze() - print(hf_tensor.shape, ff_tensor.shape) + # print(hf_tensor.shape, ff_tensor.shape) compare(hf_tensor, ff_tensor, label="LM head input") output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index f5ed8ae65b..a8a9be2f3b 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -430,7 +430,7 @@ def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2): print(f"HF: {hf_tensor}\nFF:{ff_tensor}") print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] - print(mismatches) + # print(mismatches) len_hf_tensor = hf_tensor.flatten().shape[0] assert len(mismatches) <= 0.05 * len_hf_tensor print("Ok!") From 139b643646e3c3ddcf69682a9e9e98b37dec2f0e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 7 Nov 2024 22:03:38 +0000 Subject: [PATCH 25/37] fix --- .../ops/kernels/lora_linear_kernels.h | 2 + .../flexflow/utils/peft_weight_allocator.h | 68 ----- src/ops/fused.cu | 2 + src/ops/kernels/lora_linear_kernels.cu | 192 +++---------- src/ops/lora_linear.cc | 255 +----------------- src/ops/lora_linear_params.cc | 24 +- src/runtime/peft_weight_allocator.cc | 4 +- 
src/runtime/request_manager.cc | 8 +- tests/peft/hf_finetune.py | 2 +- tests/peft/peft_alignment_test.py | 57 +++- tests/peft_test.sh | 8 +- 11 files changed, 120 insertions(+), 502 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 7138f62e90..b17868fb96 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -52,6 +52,7 @@ void peft_bwd_kernel_wrapper(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); @@ -71,6 +72,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index bd8ddb1dce..21ac9bf426 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -23,74 +23,6 @@ namespace FlexFlow { -#ifdef DEADCODE -class PEFTWeightAllocator { -public: - PEFTWeightAllocator(void *_base_ptr, size_t _total_size) - : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), - local_offset(_total_size) {} - - inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, - size_t datalen) { - const std::lock_guard lock(peft_weight_allocator_mutex); - void *ptr = static_cast(base_ptr) + sync_offset; - off_t model_sync_weights_offset = sync_offset; - size_t model_sync_weights_size = datalen; - if (sync_weights.find(peft_model_id) != sync_weights.end()) { - // Assert that sync weights for each PEFT model is consecutive - std::pair offset_and_size = sync_weights[peft_model_id]; - assert(sync_offset == offset_and_size.first + offset_and_size.second); - model_sync_weights_offset = offset_and_size.first; - model_sync_weights_size = offset_and_size.second + datalen; - } - sync_offset += datalen; - assert(sync_offset < local_offset); - sync_weights[peft_model_id] = - std::make_pair(model_sync_weights_offset, model_sync_weights_size); - return ptr; - } - - std::pair - get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { - const std::lock_guard lock(peft_weight_allocator_mutex); - assert(sync_weights.find(peft_model_id) != sync_weights.end()); - std::pair offset_and_size = sync_weights[peft_model_id]; - return std::make_pair(static_cast(base_ptr) + offset_and_size.first, - offset_and_size.second); - } - - inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, - size_t datalen) { - const std::lock_guard lock(peft_weight_allocator_mutex); - local_offset -= datalen; - assert(sync_offset < local_offset); - void *ptr = static_cast(base_ptr) + local_offset; - return ptr; - } - - template - inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, - size_t count) { - return static_cast
( - allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count)); - } - - template - inline DT *allocate_local_weights(PEFTModelID const &peft_model_id, - size_t count) { - return static_cast
( - allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); - } - -public: - void *base_ptr; - size_t total_size; - off_t sync_offset, local_offset; - std::unordered_map> sync_weights; - std::mutex peft_weight_allocator_mutex; -}; -#endif - struct LoraLinearWeight { // weights void *w0_ptr, *w1_ptr; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 62845c0f8e..c615a104d2 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -889,11 +889,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + int shard_id = task->index_point.point_data[0]; Kernels::LoraLinear::peft_bwd_kernel_wrapper( ctx, runtime, m, bc, + shard_id, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 09d79809a7..dabe40c501 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -24,31 +24,34 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) : OpMeta(handler, li) { -#ifdef DEADCODE - allocated_peft_buffer_size1 = 0; - allocated_peft_buffer_size2 = 0; -#endif } LoraLinearMeta::~LoraLinearMeta(void) {} +std::string get_peft_dbg_folder(LoraLinearMeta const *m, + int shard_id, + bool is_fwd) { + std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); + fs::path dst_filepath; + if (is_fwd) { + dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + } else { + dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + } + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + namespace Kernels { namespace LoraLinear { -#ifdef DEADCODE -void init_kernel_wrapper(LoraLinearMeta *m, int seed) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - if (m->input_type[0] == DT_FLOAT) { - Internal::init_kernel(m, seed, stream); - } else if (m->input_type[0] == DT_HALF) { - Internal::init_kernel(m, seed, stream); - } else { - assert(false && "Unsupported data type"); - } -} -#endif void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, @@ -104,6 +107,7 @@ void peft_bwd_kernel_wrapper(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { cudaStream_t stream; @@ -121,6 +125,7 @@ void peft_bwd_kernel_wrapper(Context ctx, runtime, m, bc, + shard_id, input_grad.get_float_ptr(), output_grad.get_float_ptr(), in_dim, @@ -131,6 +136,7 @@ void peft_bwd_kernel_wrapper(Context ctx, runtime, m, bc, + shard_id, input_grad.get_half_ptr(), output_grad.get_half_ptr(), in_dim, @@ -168,146 +174,6 @@ bool lora_applies_to_this_layer(LoraLinearMeta *m, namespace Internal { -#ifdef DEADCODE -template -void inference_kernel(LoraLinearMeta *m, - BatchConfig const *bc, - DT const *input_ptr, - DT *output_ptr, - int in_dim, - int out_dim, - ffStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; - cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); - cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); - cudaDataType_t lr_actv_type = output_type; - assert(input_type == output_type); - cudaDataType_t weight_type = output_type; - cudaDataType_t compute_type = output_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = output_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->input_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - int num_peft_requests = 0; - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - continue; - } - if (bc->requestsInfo[i].peft_bwd) { - num_peft_requests++; - } - } - // Assert that we have at most one request that requires peft_bwd - assert(num_peft_requests <= 1); - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - continue; - } - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_length; - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); - LoraLinearWeight weight = - m->model_state[bc->requestsInfo[i].peft_model_id].weights; - int rank = weight.rank; - void *intermediate_result_ptr = nullptr; - if (bc->requestsInfo[i].peft_bwd) { - size_t activation_size_needed1 = - data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; - size_t activation_size_needed2 = - 
data_type_size(m->input_type[1]) * max_peft_tokens * rank; - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - if (activation_size_needed1 > m->allocated_peft_buffer_size1) { - m->input_activation = - allocator->allocate_instance_untyped(activation_size_needed1); - m->allocated_peft_buffer_size1 = activation_size_needed1; - } - if (activation_size_needed2 > m->allocated_peft_buffer_size2) { - m->low_rank_activation = - allocator->allocate_instance_untyped(activation_size_needed2); - m->allocated_peft_buffer_size2 = activation_size_needed2; - } - // copy input activation - checkCUDA(cudaMemcpyAsync(m->input_activation, - input_ptr + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * - num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - intermediate_result_ptr = m->low_rank_activation; - } else { - // use workspace to save intermediate result - assert(m->handle.workSpaceSize >= - data_type_size(m->input_type[1]) * num_peft_tokens * rank); - intermediate_result_ptr = m->handle.workSpace; - } - // buffer = weight_first * input - // [rank, num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - rank, - num_peft_tokens, - in_dim, - &alpha, - weight.w0_ptr, - weight_type, - in_dim, - input_ptr + first_token_offset * in_dim, - input_type, - in_dim, - &beta, - intermediate_result_ptr, - lr_actv_type, - rank, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // output = weight_second * buffer - // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] - // Note that we use alpha in both places since we do - // an in-place update for LoraLinear - float lora_alpha = - m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; - DT scaling_constant = (DT)(lora_alpha / rank); - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_dim, - num_peft_tokens, - rank, - &scaling_constant, - weight.w1_ptr, - weight_type, - rank, - intermediate_result_ptr, - lr_actv_type, - rank, - &alpha, - output_ptr + first_token_offset * out_dim, - output_type, - out_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } -} -#endif template void inference_kernel(LoraLinearMeta *m, @@ -342,6 +208,8 @@ void inference_kernel(LoraLinearMeta *m, if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + std::cout << "Lora layer activated!" << std::endl; + std::cout << "Lora Config: " << peft_model_config_str << std::endl; assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; @@ -443,6 +311,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, @@ -471,6 +340,8 @@ void peft_bwd_kernel(Context ctx, if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + std::cout << "Lora layer activated!" << std::endl; + std::cout << "Lora Config: " << peft_model_config_str << std::endl; assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); m->peft_memory_manager->check_ft_model_id( @@ -488,6 +359,13 @@ void peft_bwd_kernel(Context ctx, DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) ? 
0.0f : 1.0f; + std::cout << "Lora B gradient computation, beta = " << (float) beta << std::endl; + if (m->inference_debugging) { + // save result to file for checking + std::string filename = get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; + std::cout << "Save low_rank_activation (" << lora_config.rank << ", " << num_peft_tokens << ") to " << filename << std::endl; + save_tensor(static_cast(weight.low_rank_activation), lora_config.rank*num_peft_tokens, filename.c_str()); + } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index f17f69a7c9..5f67709358 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -136,133 +136,6 @@ void FFModel::add_lora_layers(std::vector target_modules) { } } -#ifdef DEADCODE -PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { - assert(config.enable_peft && - "Cannot add a LoRA layer if PEFT mode is not enabled"); - if (peft_config.target_modules.size() == 0) { - printf("PEFT config does not contain any target module\n"); - std::cout << peft_config << std::endl; - assert(false); - } - PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); - peft_configs[*peft_model_id] = peft_config; - - for (std::string target_module_name : peft_config.target_modules) { - assert(target_module_name.length() > 0 && - "LoRA target module name is empty"); - // find target layer - for (auto it = layers.begin(); it != layers.end(); ++it) { - Layer *target_module = *it; - bool match = check_lora_layer_match(target_module, target_module_name); - if (!match) { - continue; - } - - if (base_layer_to_peft_layer.find(target_module) != - base_layer_to_peft_layer.end()) { - // lora linear layer already added, no need to add again - Layer *peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); - } else { - Tensor const input = target_module->inputs[0]; - Tensor const output = target_module->outputs[0]; - assert(input->data_type == output->data_type); - std::string name_ = target_module->name - ? 
std::string(target_module->name) - : std::string(""); - size_t last_underscore = name_.length() - 1; - for (int i = name_.length() - 1; i > 0; i--) { - if (!(std::isdigit(target_module->name[i]) || - target_module->name[i] == '_')) { - break; - } else if (target_module->name[i] == '_') { - last_underscore = i; - } - } - name_.erase(last_underscore); - - name_ += ".lora"; - std::cout << "Adding layer " << name_ << std::endl; - Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name_.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - // fix LoRA layer's transformer layer ID and model ID - peft_layer->layer_guid.transformer_layer_id = - target_module->layer_guid.transformer_layer_id; - peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; - } - peft_layer->outputs[0] = - create_tensor_legion_ordering(numdims, - dims, - output->data_type, - peft_layer, - 0, - true /*create_grad*/); - } - it = layers.insert(it + 1, peft_layer); - ++it; - base_layer_to_peft_layer[target_module] = peft_layer; - peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); - } - } - } - - // save finetuned lora model configs to file - if (peft_config.trainable) { - std::string finetuned_model_folder = join_path({ - peft_config.cache_folder, - "finetuned_models", - peft_config.peft_model_id, - }); - fs::remove_all(finetuned_model_folder); - std::string finetuned_model_config_folder = join_path({ - finetuned_model_folder, - "config", - }); - fs::create_directories(finetuned_model_config_folder); - std::string lora_linear_config_filepath = join_path({ - finetuned_model_config_folder, - "ff_config.json", - }); - serialize_to_json_file(peft_config, lora_linear_config_filepath); - std::string optimizer_config_filepath = join_path({ - finetuned_model_config_folder, - "ff_optimizer_config.json", - }); - if (typeid(*peft_config.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - peft_config.optimizer_config); - serialize_to_json_file(*sgd_config, optimizer_config_filepath); - } else if (typeid(*peft_config.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - LoraAdamOptimizerConfig const *adam_config = - static_cast( - peft_config.optimizer_config); - serialize_to_json_file(*adam_config, optimizer_config_filepath); - } else { - assert(false && "Optimizer not supported"); - } - } - - return peft_model_id; -} -#endif - Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, @@ -272,15 +145,6 @@ Op *LoraLinear::create_operator_from_layer( int max_rank = value; layer->get_int_property("max_concurrent_adapters", value); int max_concurrent_adapters = value; -#ifdef DEADCODE - std::unordered_map _peft_configs; - std::vector const &peft_ids = - model.peft_layer_to_peft_id[(Layer *)layer]; - for (int i = 0; i < peft_ids.size(); i++) { - _peft_configs.emplace( - std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); - } -#endif return new LoraLinear(model, layer->layer_guid, inputs[0], @@ -982,7 +846,7 @@ void LoraLinear::peft_bwd_task(Task const *task, int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(ctx, runtime, m, bc, 
input_grad, output_grad); + peft_bwd_kernel_wrapper(ctx, runtime, m, bc, shard_id, input_grad, output_grad); save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); @@ -1018,14 +882,6 @@ bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { if (lhs.layer_guid == rhs.layer_guid && lhs.max_rank == rhs.max_rank && lhs.max_concurrent_adapters == rhs.max_concurrent_adapters && strcmp(lhs.name, rhs.name) == 0) { -#ifdef DEADCODE - for (auto const &kv : lhs.peft_configs) { - auto it = rhs.peft_configs.find(kv.first); - if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { - return false; - } - } -#endif return true; } return false; @@ -1066,50 +922,6 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->max_rank); sez.serialize(this->max_concurrent_adapters); -#ifdef DEADCODE - sez.serialize(this->op_type); - sez.serialize(this->peft_configs.size()); - for (auto const &kv : this->peft_configs) { - // Serialize PEFTModelID - sez.serialize(kv.first.id); - - // Serialize LoraLinearConfig and OptimizerConfig to tmp folder - // 1. Create tmp dir and serialize it - fs::path unique_temp_dir = create_unique_temp_directory(); - serialize_string(sez, unique_temp_dir.string()); - // 2. Dump LoraLinearConfig to json file in tmp dir - std::string lora_config_filename = std::string("lora_linear_config_") + - std::to_string(kv.first.id) + - std::string(".json"); - fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; - serialize_to_json_file(kv.second, lora_config_json_filepath); - // 3. Dump optimizer to json file in tmp dir, and serialize optimizer type - std::string optimizer_filename = std::string("optimizer_config_") + - std::to_string(kv.first.id) + - std::string(".json"); - fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; - assert((kv.second.trainable) == (kv.second.optimizer_config != nullptr)); - if (kv.second.trainable) { - if (typeid(*kv.second.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - sez.serialize(OPTIMIZER_TYPE_SGD); - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - kv.second.optimizer_config); - serialize_to_json_file(*sgd_config, optim_config_filepath); - } else if (typeid(*kv.second.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - sez.serialize(OPTIMIZER_TYPE_ADAM); - LoraAdamOptimizerConfig const *adam_config = - static_cast( - kv.second.optimizer_config); - serialize_to_json_file(*adam_config, optim_config_filepath); - } else { - assert(false && "Optimizer type not yet supported"); - } - } - } -#endif sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1135,58 +947,6 @@ Node LoraLinear::deserialize(FFModel &ff, dez.deserialize(deserialized_model_id); dez.deserialize(max_rank); dez.deserialize(max_concurrent_adapters); -#ifdef DEADCODE - dez.deserialize(op_type); - dez.deserialize(num_pefts); - for (int i = 0; i < num_pefts; i++) { - // Deserialize PEFTModelID - size_t pid; - dez.deserialize(pid); - PEFTModelID peft_model_id(pid); - // Deserialize tmp folder containing LoraLinearConfig and optimizer config - fs::path unique_temp_dir = fs::path(deserialize_string(dez)); - // 1. 
Deserialize LoraLinearConfig - std::string lora_config_filename = std::string("lora_linear_config_") + - std::to_string(pid) + - std::string(".json"); - fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; - std::unique_ptr lora_linear_config = - deserialize_from_json_file(lora_config_json_filepath); - // 2. Deserialize optimizer if needed - if (lora_linear_config->trainable) { - std::string optimizer_filename = std::string("optimizer_config_") + - std::to_string(pid) + - std::string(".json"); - fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; - OptimizerType type_; - dez.deserialize(type_); - if (type_ == OPTIMIZER_TYPE_SGD) { - std::unique_ptr sgd_optimizer_config = - deserialize_from_json_file( - optim_config_filepath); - lora_linear_config->optimizer_config = - dynamic_cast(sgd_optimizer_config.release()); - } else if (type_ == OPTIMIZER_TYPE_ADAM) { - std::unique_ptr adam_optimizer_config = - deserialize_from_json_file( - optim_config_filepath); - lora_linear_config->optimizer_config = - dynamic_cast( - adam_optimizer_config.release()); - } else { - printf("Optimizer type: %d\n", type_); - assert(false && "Optimizer type not yet supported"); - } - } - try { - fs::remove_all(unique_temp_dir); - } catch (fs::filesystem_error const &e) { - std::cerr << "Error removing tmp directory: " << e.what() << std::endl; - } - params.peft_configs.emplace( - std::make_pair(peft_model_id, *lora_linear_config)); - } -#endif dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); @@ -1236,19 +996,6 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.model_id); hash_combine(key, params.max_rank); hash_combine(key, params.max_concurrent_adapters); -#ifdef DEADCODE - for (auto const &kv : params.peft_configs) { - hash_combine(key, kv.first.id); - hash_combine(key, kv.second.rank); - hash_combine(key, kv.second.trainable); - hash_combine(key, kv.second.cache_folder); - hash_combine(key, kv.second.peft_model_id); - hash_combine(key, kv.second.lora_alpha); - hash_combine(key, kv.second.lora_dropout); - hash_combine(key, kv.second.target_modules); - hash_combine(key, kv.second.init_lora_weights); - } -#endif return key; } }; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 4eb59bc53f..4bc75d17e4 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -282,18 +282,18 @@ LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( if (!j["optimizer_config"].is_null()) { optimizer_config_ = LoraOptimizerConfig::fromJson(j["optimizer_config"]); } - LoraLinearConfig config( - j["cache_folder"].get(), - j["peft_model_id"].get(), - j["trainable"].get(), - optimizer_config_, // optimizer_config will be set later if present - j["init_lora_weights"].get(), - j["base_model_name_or_path"].get(), - j["precision"].get(), - j["rank"].get(), - j["lora_alpha"].get(), - j["lora_dropout"].get(), - j["target_modules"].get>()); + LoraLinearConfig config = LoraLinearConfig::EmptyConfig; + config.cache_folder = j["cache_folder"].get(); + config.peft_model_id = j["peft_model_id"].get(); + config.rank = j["rank"].get(); + config.lora_alpha = j["lora_alpha"].get(); + config.lora_dropout = j["lora_dropout"].get(); + config.target_modules = j["target_modules"].get>(); + config.trainable = j["trainable"].get(); + config.init_lora_weights = j["init_lora_weights"].get(); + config.base_model_name_or_path = 
j["base_model_name_or_path"].get(); + config.precision = j["precision"].get(); + config.optimizer_config = optimizer_config_; return config; } diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 2dd9a4711b..bd33076309 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -23,7 +23,7 @@ using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { // allocate chunk of memory for all the PEFT adapters Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(max_lora_size - 1)); + Realm::Point<1, coord_t>(max_lora_size*max_concurrent_adapters - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); Realm::RegionInstance::create_instance(peftLegionInst, @@ -39,7 +39,7 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size = max_lora_size * 3; // weights, gradients, momentum values ft_size += - max_peft_tokens * (in_dim + max_rank); // input, low-rank activations + max_peft_tokens * (in_dim + max_rank) * data_type_size(dt); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(ft_size - 1)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a25677b22e..7d1e338d8f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -268,8 +268,9 @@ void RequestManager::set_peft_config(PEFTModelID const &peft_model_id, // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - peft_configs[peft_model_id] = LoraLinearConfig::deserialize_from_json_string( - peft_config.serialize_to_json_string()); + // LoraLinearConfig new_config = LoraLinearConfig::deserialize_from_json_string( + // peft_config.serialize_to_json_string()); + peft_configs[peft_model_id] = peft_config; } LoraLinearConfig const & @@ -304,6 +305,7 @@ PEFTModelID * std::cout << peft_config << std::endl; assert(false); } + std::cout << "Registering PEFT adapter" << peft_config.serialize_to_json_string() << std::endl; // go over base_layer_to_peft_layer and check that you can find at least one // match for (int i = 0; i < peft_config.target_modules.size(); i++) { @@ -699,6 +701,8 @@ void RequestManager::add_peft_config_to_request_info( std::string peft_config_str = peft_config.serialize_to_json_string(); std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, peft_config_str.c_str()); + // std::cout << "Added PEFT config to request info: " + // << bc.requestsInfo[req_idx].peft_model_config_str << std::endl; } BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index a2fc5548ab..8a53ef8c9c 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -77,7 +77,7 @@ def main(): if args.save_peft_tensors: make_debug_dirs() register_peft_hooks(model) - save_model_weights(model, target_modules=["lora", "lm_head", "down_proj"]) + save_model_weights(model, target_modules=["lora", "lm_head", "down_proj", "up_proj"]) # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index cc677cd51a..bc9d8d9d24 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ 
-17,7 +17,7 @@ def check_bwd_pass(self): def check_step(self, step_idx, learning_rate=0.001): raise NotImplementedError() -class LllamaAlignmentTest(AlignmentTest): +class LlamaAlignmentTest(AlignmentTest): def __init__(self, model_name, tp_degree=1): self.model_name = model_name self.peft_config = PeftConfig.from_pretrained(model_name) @@ -538,11 +538,47 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print(f"w3 {i} grad output") + print("flexflow tensor shape:", ff_tensor.squeeze().shape) + print(ff_tensor.squeeze()) + print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + print(hf_tensor.squeeze().T) compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") + # print(f"W3 {i} output matches!") + # print(f"FF shape: {ff_tensor.shape}") + # print(f"HF shape: {hf_tensor.shape}") + + # hf_w3_output = hf_tensor.clone() + + # W3 (up_proj) input input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # w3_input_torch = torch.matmul(hf_tensor, torch.transpose(ff_tensor, 0, 1)) + # ff_up_proj_weight_path="/usr/.cache/flexflow/debug/flexflow/weights/step_0/shard_0/layers.11.layers.11.mlp.up_proj.weight_0" + # hf_up_proj_weight_path="/usr/.cache/flexflow/debug/huggingface/weights/step_0/layers.11.mlp.up_proj.weight" + # hf_up_proj_weight = torch.load(hf_up_proj_weight_path, map_location='cpu') + # print(hf_up_proj_weight.shape) + # ff_up_proj_weight = load_ff_tensor(ff_up_proj_weight_path, hf_up_proj_weight.shape[::-1]) + # print(ff_up_proj_weight.shape) + # ff_up_proj_weight = torch.from_numpy(ff_up_proj_weight).to(hf_up_proj_weight.dtype) + # assert torch.allclose(hf_up_proj_weight.T, ff_up_proj_weight, atol=1e-5) + + # print("HF W3 output shape:", hf_w3_output.shape) + # print("HF W3 weight shape:", hf_up_proj_weight.shape) + # print("HF W3 input shape:", hf_tensor.shape) + + # simulated_w3_input = torch.matmul(hf_w3_output.squeeze(), hf_up_proj_weight) + # print("simulated W3 input shape:", simulated_w3_input.T.shape) + # print(simulated_w3_input.T) + print(f"w3 {i} grad input") + print("flexflow tensor shape:", ff_tensor.squeeze().shape) + print(ff_tensor.squeeze()) + print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + print(hf_tensor.squeeze().T) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") # Attn O-proj @@ -695,7 +731,24 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.REPLICATE) + + lora_low_rank_activation_fwd_path = f"/usr/.cache/flexflow/debug/flexflow/fwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + lora_low_rank_activation_bwd_path = 
f"/usr/.cache/flexflow/debug/flexflow/bwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + lora_low_rank_activation_fwd = load_ff_tensor(lora_low_rank_activation_fwd_path, [16, 128])[:,:self.num_tokens] + lora_low_rank_activation_fwd = torch.from_numpy(lora_low_rank_activation_fwd) + lora_low_rank_activation_bwd = load_ff_tensor(lora_low_rank_activation_bwd_path, [16, 24]) + lora_low_rank_activation_bwd = torch.from_numpy(lora_low_rank_activation_bwd) + torch.testing.assert_close(lora_low_rank_activation_fwd, lora_low_rank_activation_bwd, rtol=1.3e-6, atol=1e-5) + + print(f"LoRA_B {i} gradient") + print("FlexFlow shape: ", ff_gradient.shape) + print(ff_gradient) + print("HuggingFace shape: ", hf_gradient.shape) + print(hf_gradient.squeeze().T) compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") + + + # ff_out_gradient_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora.output_gradient_0" # ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") # ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") @@ -737,7 +790,7 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): args = parser.parse_args() if __name__ == "__main__": - llama_alignment = LllamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) + llama_alignment = LlamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) # llama_alignment.check_weights_alignment() for i in range(args.num_steps): llama_alignment.check_fwd_pass(i) diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 173fb37fd9..b7adce8028 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -34,7 +34,7 @@ export LEGION_BACKTRACE=1 python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m # Run PEFT in Huggingface to get ground truth tensors -python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision +python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision -lr 1.0 # Python test echo "Python test" @@ -45,8 +45,8 @@ echo "Python test" # C++ test echo "C++ test" ./build/inference/peft/peft \ - -ll:gpu 2 -ll:cpu 4 -ll:util 4 \ - -tensor-parallelism-degree 2 \ + -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 1 \ -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ./inference/prompt/peft_dataset.json \ @@ -55,7 +55,7 @@ echo "C++ test" --use-full-precision \ --inference-debugging # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 2 +python ./tests/peft/peft_alignment_test.py -tp 1 -lr 1.0 # Print succeess message echo "" From b56ebd3aab4f7eb4ae77890869437258b6bbe150 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 8 Nov 2024 06:19:18 +0000 Subject: [PATCH 26/37] fix reset input grad for non-activated loras --- include/flexflow/operator.h | 2 +- include/flexflow/ops/kernels/linear_kernels.h | 2 + .../ops/kernels/lora_linear_kernels.h | 17 ------- src/ops/fused.cu | 1 + src/ops/kernels/linear_kernels.cu | 45 +++++++++++++++++++ src/ops/kernels/lora_linear_kernels.cu | 34 +++++++------- src/ops/linear.cc | 1 + src/ops/lora_linear.cc | 3 +- src/ops/lora_linear_params.cc | 3 +- src/runtime/inference_manager.cc | 4 +- src/runtime/peft_weight_allocator.cc | 9 ++-- src/runtime/request_manager.cc | 6 ++- tests/peft/peft_alignment_test.py | 43 +++++++++++------- 13 files changed, 111 insertions(+), 
59 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 007314797a..c108740ef3 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -280,7 +280,7 @@ class Op { // get operator name and print it std::string op_name_without_uid = get_op_name_without_uid(m); std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid - << std::endl; + << (before_kernel ? " (before kernel)" : "") << std::endl; // build the path to save the tensor fs::path dst_filepath; if (fwd_pass) { diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 90e50a0c9a..aaa845db23 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -61,6 +61,7 @@ void inference_kernel_wrapper(LinearMeta *m, int out_dim, int batch_size); void peft_bwd_kernel_wrapper(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, @@ -94,6 +95,7 @@ void forward_kernel(LinearMeta const *m, ffStream_t stream); template void peft_bwd_kernel(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index b17868fb96..fd86dc68c0 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -13,27 +13,10 @@ namespace FlexFlow { using Legion::Context; using Legion::Runtime; -#ifdef DEADCODE -struct LoraLinearModelState { - LoraLinearWeight weights; - LoraOptimizerConfig const *optimizer_config; - float lora_alpha; - std::string cache_folder; - // Huggingface model ID (for download and/or upload) - std::string peft_model_id; -}; -#endif - class LoraLinearMeta : public OpMeta { public: LoraLinearMeta(FFHandler handle, LoraLinear const *li); ~LoraLinearMeta(void); - // PEFT related fields - // void *low_rank_activation; - // void *input_activation; - // std::unordeded_map model_state; - // std::unordered_map model_state; - // size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; PEFTMemoryManager *peft_memory_manager; }; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index c615a104d2..8635fd6a87 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -862,6 +862,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); Kernels::Linear::peft_bwd_kernel_wrapper(m, + bc, my_input_grad_accessor[0].ptr, my_output_grad_accessor[0].ptr, my_weight_accessor[0].ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 3832428c64..51954597d7 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -16,6 +16,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/lora_linear_params.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -73,6 +74,17 @@ LinearMeta::~LinearMeta(void) { } } +bool lora_applies_to_this_layer(LinearMeta const *m, + LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + namespace Kernels { namespace Linear { @@ -285,6 +297,7 @@ 
void inference_kernel_wrapper(LinearMeta *m, } void peft_bwd_kernel_wrapper(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *weight_ptr, @@ -302,6 +315,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, } if (m->input_type[0] == DT_FLOAT) { Internal::peft_bwd_kernel(m, + bc, input_grad_ptr, output_grad_ptr, weight_ptr, @@ -312,6 +326,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, stream); } else if (m->input_type[0] == DT_HALF) { Internal::peft_bwd_kernel(m, + bc, input_grad_ptr, output_grad_ptr, weight_ptr, @@ -568,6 +583,7 @@ void forward_kernel(LinearMeta const *m, template void peft_bwd_kernel(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, @@ -611,6 +627,35 @@ void peft_bwd_kernel(LinearMeta const *m, // NOTE: we use beta=1 for input_grad to accumulate gradients when needed DT alpha = 1.0f; DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + + // ensure that we only have one finetuning request, with a single lora + int num_peft_requests = 0; + bool lora_applies = false; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + continue; + } + num_peft_requests++; + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + lora_applies = true; + } + assert(num_peft_requests == 1 && + "Exactly one PEFT finetuning request is required"); + // if the request does not have any active lora in the current layer, reset + // beta to 0 std::cout << m->op_name << " original beta: " << (float)beta << " + // lora_applies: " << lora_applies << std::endl; + if (lora_applies) { + beta = 1.0f; + } + if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index dabe40c501..40095484b5 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -23,14 +23,12 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) - : OpMeta(handler, li) { -} + : OpMeta(handler, li) {} LoraLinearMeta::~LoraLinearMeta(void) {} -std::string get_peft_dbg_folder(LoraLinearMeta const *m, - int shard_id, - bool is_fwd) { +std::string + get_peft_dbg_folder(LoraLinearMeta const *m, int shard_id, bool is_fwd) { std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); fs::path dst_filepath; if (is_fwd) { @@ -51,8 +49,6 @@ std::string get_peft_dbg_folder(LoraLinearMeta const *m, namespace Kernels { namespace LoraLinear { - - void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, @@ -174,7 +170,6 @@ bool lora_applies_to_this_layer(LoraLinearMeta *m, namespace Internal { - template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -208,8 +203,8 @@ void inference_kernel(LoraLinearMeta *m, if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - std::cout << "Lora layer activated!" << std::endl; - std::cout << "Lora Config: " << peft_model_config_str << std::endl; + // std::cout << "Lora layer activated!" 
<< std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; @@ -311,7 +306,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, - int shard_id, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, @@ -340,8 +335,8 @@ void peft_bwd_kernel(Context ctx, if (!lora_applies_to_this_layer(m, lora_config)) { continue; } - std::cout << "Lora layer activated!" << std::endl; - std::cout << "Lora Config: " << peft_model_config_str << std::endl; + // std::cout << "Lora layer activated!" << std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && "Trainable flag mismatch"); m->peft_memory_manager->check_ft_model_id( @@ -359,12 +354,17 @@ void peft_bwd_kernel(Context ctx, DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) ? 0.0f : 1.0f; - std::cout << "Lora B gradient computation, beta = " << (float) beta << std::endl; + // std::cout << "Lora B gradient computation, beta = " << (float) beta << + // std::endl; if (m->inference_debugging) { // save result to file for checking - std::string filename = get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; - std::cout << "Save low_rank_activation (" << lora_config.rank << ", " << num_peft_tokens << ") to " << filename << std::endl; - save_tensor(static_cast(weight.low_rank_activation), lora_config.rank*num_peft_tokens, filename.c_str()); + std::string filename = + get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; + std::cout << "Save low_rank_activation (" << lora_config.rank << ", " + << num_peft_tokens << ") to " << filename << std::endl; + save_tensor(static_cast(weight.low_rank_activation), + lora_config.rank * num_peft_tokens, + filename.c_str()); } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 09170d3c28..8c2120e283 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -769,6 +769,7 @@ void Linear::peft_bwd_task(Task const *task, num_peft_tokens); } peft_bwd_kernel_wrapper(m, + bc, input_grad.ptr, output_grad.ptr, weight.ptr, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 5f67709358..68605160a5 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -846,7 +846,8 @@ void LoraLinear::peft_bwd_task(Task const *task, int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(ctx, runtime, m, bc, shard_id, input_grad, output_grad); + peft_bwd_kernel_wrapper( + ctx, runtime, m, bc, shard_id, input_grad, output_grad); save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 4bc75d17e4..69c0081ec9 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -291,7 +291,8 @@ LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( config.target_modules = j["target_modules"].get>(); config.trainable = j["trainable"].get(); config.init_lora_weights = j["init_lora_weights"].get(); - config.base_model_name_or_path = j["base_model_name_or_path"].get(); + config.base_model_name_or_path = + j["base_model_name_or_path"].get(); 
config.precision = j["precision"].get(); config.optimizer_config = optimizer_config_; return config; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index f39ea91f28..45b6ba0db8 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -273,7 +273,9 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } reset_inputs.insert(op->inputs[i]->region); } else { - reset_inputs.insert(op->inputs[i]->region); + if (op->op_type != OP_LORA) { + reset_inputs.insert(op->inputs[i]->region); + } } } } diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index bd33076309..1fcef3678e 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -22,8 +22,9 @@ using Legion::TaskLauncher; void PEFTMemoryManager::allocate_inference_memory() { // allocate chunk of memory for all the PEFT adapters - Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(max_lora_size*max_concurrent_adapters - 1)); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size * max_concurrent_adapters - 1)); std::vector field_sizes; field_sizes.push_back(sizeof(char)); Realm::RegionInstance::create_instance(peftLegionInst, @@ -38,8 +39,8 @@ void PEFTMemoryManager::allocate_inference_memory() { void PEFTMemoryManager::allocate_finetuning_memory() { size_t ft_size = max_lora_size * 3; // weights, gradients, momentum values - ft_size += - max_peft_tokens * (in_dim + max_rank) * data_type_size(dt); // input, low-rank activations + ft_size += max_peft_tokens * (in_dim + max_rank) * + data_type_size(dt); // input, low-rank activations // allocate chunk of memory for PEFT adapter Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(ft_size - 1)); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7d1e338d8f..798da75b01 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -268,7 +268,8 @@ void RequestManager::set_peft_config(PEFTModelID const &peft_model_id, // check that peft_model_id is not already in use assert(peft_configs.find(peft_model_id) == peft_configs.end() && "PEFT model ID already in use"); - // LoraLinearConfig new_config = LoraLinearConfig::deserialize_from_json_string( + // LoraLinearConfig new_config = + // LoraLinearConfig::deserialize_from_json_string( // peft_config.serialize_to_json_string()); peft_configs[peft_model_id] = peft_config; } @@ -305,7 +306,8 @@ PEFTModelID * std::cout << peft_config << std::endl; assert(false); } - std::cout << "Registering PEFT adapter" << peft_config.serialize_to_json_string() << std::endl; + std::cout << "Registering PEFT adapter" + << peft_config.serialize_to_json_string() << std::endl; // go over base_layer_to_peft_layer and check that you can find at least one // match for (int i = 0; i < peft_config.target_modules.size(); i++) { diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index bc9d8d9d24..ee82b298e0 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -485,12 +485,16 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient output") + 
down_proj_grad_output_pre = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE, pre=True) + down_proj_grad_output = ff_tensor.clone() + compare_loaded_tensors(down_proj_grad_output, down_proj_grad_output_pre) # LoRA_B hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + lora_grad_output = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) * self.lora_scaling_factor compare(hf_tensor, ff_tensor, label=f"LoRA_B {i} gradient output") @@ -501,6 +505,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} gradient input") + lora_a_grad_input = ff_tensor.clone() # W2 (down_proj) input hf_tensor_name = f"layers.{i}.mlp.down_proj" @@ -508,7 +513,15 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + down_proj_grad_input_pre = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, pre=True) compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient input") + + # down proj output (before/after kernel) should match output of lora_b + compare_loaded_tensors(down_proj_grad_output, lora_grad_output) + # down proj input (before kernel) should match input of lora_a + compare_loaded_tensors(down_proj_grad_input_pre, lora_a_grad_input) + # compare_loaded_tensors(down_proj_grad_input_pre.squeeze(), ff_tensor.squeeze()) + # W2 input (HF) and SigmoidSiluMulti output (FF) hf_w2_input = hf_tensor.clone() @@ -538,11 +551,11 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) - print(f"w3 {i} grad output") - print("flexflow tensor shape:", ff_tensor.squeeze().shape) - print(ff_tensor.squeeze()) - print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) - print(hf_tensor.squeeze().T) + # print(f"w3 {i} grad output") + # print("flexflow tensor shape:", ff_tensor.squeeze().shape) + # print(ff_tensor.squeeze()) + # print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + # print(hf_tensor.squeeze().T) compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") # print(f"W3 {i} output matches!") # print(f"FF shape: {ff_tensor.shape}") @@ -573,11 +586,11 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # simulated_w3_input = torch.matmul(hf_w3_output.squeeze(), hf_up_proj_weight) # 
print("simulated W3 input shape:", simulated_w3_input.T.shape) # print(simulated_w3_input.T) - print(f"w3 {i} grad input") - print("flexflow tensor shape:", ff_tensor.squeeze().shape) - print(ff_tensor.squeeze()) - print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) - print(hf_tensor.squeeze().T) + # print(f"w3 {i} grad input") + # print("flexflow tensor shape:", ff_tensor.squeeze().shape) + # print(ff_tensor.squeeze()) + # print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + # print(hf_tensor.squeeze().T) compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") @@ -740,11 +753,11 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): lora_low_rank_activation_bwd = torch.from_numpy(lora_low_rank_activation_bwd) torch.testing.assert_close(lora_low_rank_activation_fwd, lora_low_rank_activation_bwd, rtol=1.3e-6, atol=1e-5) - print(f"LoRA_B {i} gradient") - print("FlexFlow shape: ", ff_gradient.shape) - print(ff_gradient) - print("HuggingFace shape: ", hf_gradient.shape) - print(hf_gradient.squeeze().T) + # print(f"LoRA_B {i} gradient") + # print("FlexFlow shape: ", ff_gradient.shape) + # print(ff_gradient) + # print("HuggingFace shape: ", hf_gradient.shape) + # print(hf_gradient.squeeze().T) compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") From 3632754422355eb3cd7c630e6dcdaa6944530972 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 8 Nov 2024 16:35:37 +0000 Subject: [PATCH 27/37] fix --- src/runtime/model.cc | 57 +++---------------------------- tests/peft/peft_alignment_test.py | 3 +- tests/peft_test.sh | 6 ++-- 3 files changed, 10 insertions(+), 56 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index de798890ef..465ee21fc9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3420,63 +3420,16 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ( - // l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - // l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (std::string(l->name).find("attn.o_proj") != std::string::npos) || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_LINEAR && std::string(l->name).find("attn.o_proj") != std::string::npos) || + is_mlp_block(layer_idx) || + (l->op_type == OP_LINEAR && std::string(l->name).find("mlp.down_proj") != std::string::npos) + )) { return true; } return false; } -#ifdef DEADCODE -bool FFModel::need_to_add_parallel_identity(int layer_idx) const { - auto const &l = layers[layer_idx]; - // add 
parallel identity (allreduce in the backward pass) before the lm head - // we find the lm head by looking for the linear layer right after a residual - // rms norm / layer norm, and before a softmax, followed by - // argmax/argtopk/sampling - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ((l->op_type == OP_RESIDUAL_RMS_NORM || - l->op_type == OP_RESIDUAL_LAYERNORM) && - // there are at least 2 layers before the norm, and at least 3 following - // the norm - layer_idx >= 2 && layer_idx < layers.size() - 3 && - // norm is followed by linear layer (lm head) - layers[layer_idx + 1]->op_type == OP_LINEAR && - // lm head is followed by softmax - layers[layer_idx + 2]->op_type == OP_SOFTMAX && - // softmax is followed by argmax/argtopk/sampling - (layers[layer_idx + 3]->op_type == OP_ARG_TOPK || - layers[layer_idx + 3]->op_type == OP_SAMPLING || - layers[layer_idx + 3]->op_type == OP_ARGMAX || - layers[layer_idx + 3]->op_type == OP_SCALAR_TRUE_DIV))) { - return true; - } - return false; -} -#endif bool FFModel::need_to_add_parallel_identity(int layer_idx) const { auto const &l = layers[layer_idx]; // add parallel identity (allreduce in the backward pass) before the lm head diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index ee82b298e0..c4db87c099 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -655,7 +655,8 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance ff_tensor_name = f"layers.{i}.layers.{i}.input_layernorm" _output_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=1) input_layernorm_out1 = get_ff_tensor(ff_tensor_name, _output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) - torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) + compare_loaded_tensors(attn_input, input_layernorm_out1, tolerance=1e-5) + # torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) # Input layernorm diff --git a/tests/peft_test.sh b/tests/peft_test.sh index b7adce8028..6152844f5e 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -45,8 +45,8 @@ echo "Python test" # C++ test echo "C++ test" ./build/inference/peft/peft \ - -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ - -tensor-parallelism-degree 1 \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ./inference/prompt/peft_dataset.json \ @@ -55,7 +55,7 @@ echo "C++ test" --use-full-precision \ --inference-debugging # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 1 -lr 1.0 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 1.0 # Print succeess message echo "" From fca3d95db5b23da604734dd7705a1be33f32e2fa Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 8 Nov 2024 21:14:55 +0000 Subject: [PATCH 28/37] update --- inference/python/streamlit/app.py | 21 +++++---------- inference/python/streamlit/fastapi_incr.py | 31 +++++++++++++--------- python/flexflow/core/flexflow_cffi.py | 1 + 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py index 4d8633e167..9788765a3a 100644 --- a/inference/python/streamlit/app.py +++ b/inference/python/streamlit/app.py @@ -8,7 +8,7 @@ st.set_page_config(page_title="🚀💻 FlexLLM Server", layout="wide") # FastAPI server URL 
-FASTAPI_URL = "http://localhost:8000/generate/" # Adjust the port if necessary +FASTAPI_URL = "http://localhost:8000/chat/completions" # Adjust the port if necessary FINETUNE_URL = "http://localhost:8000/finetuning" # Initialize session state variables @@ -30,18 +30,11 @@ def clear_chat_history(): st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] # Function for generating LLaMA2 response -def generate_llama2_response(prompt_input): - string_dialogue = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'." - for dict_message in st.session_state.messages: - if dict_message["role"] == "user": - string_dialogue += "User: " + dict_message["content"] + "\n\n" - else: - string_dialogue += "Assistant: " + dict_message["content"] + "\n\n" - - full_prompt = f"{string_dialogue} {prompt_input} Assistant: " +def generate_llama3_response(prompt_input): + system_prompt="You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." # Send request to FastAPI server - response = requests.post(FASTAPI_URL, json={"prompt": full_prompt}) + response = requests.post(FASTAPI_URL, json={"max_new_tokens": 1024, "messages": [{"role": "system", "content": system_prompt}] + st.session_state.messages + [{"role": "user", "content": prompt_input}]}) if response.status_code == 200: return response.json()["response"] @@ -58,7 +51,7 @@ def generate_llama2_response(prompt_input): st.sidebar.button('Clear Chat History', on_click=clear_chat_history) st.subheader('Generation parameters') - max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8) + max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=2048, value=1024, step=8) # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') @@ -181,8 +174,8 @@ def generate_llama2_response(prompt_input): # Generate a new response if last message is not from assistant if st.session_state.messages[-1]["role"] != "assistant": with st.chat_message("assistant"): - with st.spinner("Thinking..."): - response = generate_llama2_response(prompt) + with st.spinner("Running..."): + response = generate_llama3_response(prompt) placeholder = st.empty() full_response = '' for item in response: diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py index a1095e13dc..6ac7f4149a 100644 --- a/inference/python/streamlit/fastapi_incr.py +++ b/inference/python/streamlit/fastapi_incr.py @@ -46,12 +46,16 @@ class Message(BaseModel): content: str +# class ChatCompletionRequest(BaseModel): +# model: Optional[str] = "mock-gpt-model" +# messages: List[Message] +# max_tokens: Optional[int] = 512 +# temperature: Optional[float] = 0.1 +# stream: Optional[bool] = False + class ChatCompletionRequest(BaseModel): - model: Optional[str] = "mock-gpt-model" + max_new_tokens: Optional[int] = 1024 messages: List[Message] - max_tokens: Optional[int] = 512 - temperature: Optional[float] = 0.1 - stream: Optional[bool] = False # Global variable to 
store the LLM model llm = None @@ -76,12 +80,12 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 4, + "num_gpus": 8, "memory_per_gpu": 20000, "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, - "legion_utility_processors": 4, + "legion_utility_processors": 8, "data_parallelism_degree": 1, "tensor_parallelism_degree": 4, "pipeline_parallelism_degree": 1, @@ -98,7 +102,7 @@ def get_configs(): } llm_configs = { # required parameters - "llm_model": "meta-llama/Meta-Llama-3.1-8B", + "llm_model": "meta-llama/Llama-3.1-8B-Instruct", # optional parameters "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, @@ -139,7 +143,7 @@ async def startup_event(): generation_config, max_requests_per_batch=16, max_seq_length=2048, - max_tokens_per_batch=64, + max_tokens_per_batch=1024, ) llm.start_server() @@ -171,11 +175,12 @@ async def chat_completions(request: ChatCompletionRequest): if llm is None: raise HTTPException(status_code=503, detail="LLM model is not initialized.") - if request.messages and request.messages[0].role == 'user': - resp_content = "As a mock AI Assitant, I can only echo your last message:" + request.messages[-1].content - else: - resp_content = "As a mock AI Assitant, I can only echo your last message, but there were no messages!" - + print("received request:", request) + result = llm.generate([message.dict() for message in request.messages], max_new_tokens=request.max_new_tokens)[0].output_text.decode('utf-8') + print("returning response:", result) + return { + "response": result + } return { "id": "1337", "object": "chat.completion", diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 151b01b873..4ff8348f46 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -4759,6 +4759,7 @@ def generate(self, requests_list: List[Request]): finetuning_losses=finetuning_losses, ) ) + return results def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) From 9a1eae589ab2283d8583bc59190394db4b840a21 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 8 Nov 2024 22:03:33 +0000 Subject: [PATCH 29/37] demo fixes & readme --- inference/python/streamlit/README.md | 18 ++++++++++++++++++ python/flexflow/serve/serve.py | 16 ++++++++++++---- src/runtime/request_manager.cc | 4 ++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/inference/python/streamlit/README.md b/inference/python/streamlit/README.md index e69de29bb2..86a15e2d6d 100644 --- a/inference/python/streamlit/README.md +++ b/inference/python/streamlit/README.md @@ -0,0 +1,18 @@ +# Streamlit demo + +## Instructions + +1. Build and install FlexFlow, or build and run `source ./set_python_envs.sh` from the build folder +2. Edit the FlexFlow/inference/python/streamlit/fastapi_incr.py to configure the model to run and the system configs (num gpus, amount of memory, etc) +3. In one terminal, launch the LLM engine with the commands below, and wait until the model's weights loading completes +``` +cd FlexFlow/inference/python/streamlit +python fastapi_incr.py +``` +4. In another terminal, launch the streamlit app: +``` +cd FlexFlow/inference/python/streamlit +streamlit run app.py +``` +5. Open the URL printed to the terminal, e.g. 
`http://localhost:8501` and interact with the app via browser + diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index d06d59b8c9..9d3fa19706 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -521,7 +521,7 @@ def compile( atexit.register(self.rm.stop_server) - def _generate(self, requests: List[Request]): + def _generate(self, requests: List[Request]) -> List[GenerationResult]: if len(requests) == 0: return [] for req in requests: @@ -554,7 +554,7 @@ def _generate(self, requests: List[Request]): ) return self.model.ffmodel.generate(requests) - def __chat2prompt(self, messages: List[dict]): + def __chat2prompt(self, messages: List[dict]) -> str: """Convert a list of messages to a single prompt string :param messages: The list of messages to convert @@ -573,6 +573,12 @@ def __chat2prompt(self, messages: List[dict]): if self.tokenizer.chat_template is None: raise ValueError(f"Model {self.model_name} does not support chat completion") return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + + def __output2chat_response(self, requests: List[Request], outputs: List[GenerationResult]) -> List[GenerationResult]: + assert(len(requests) == len(outputs)) + for i in range(len(outputs)): + outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt):] + return outputs def generate( self, @@ -626,7 +632,8 @@ def generate( max_new_tokens=max_new_tokens, add_special_tokens=False, ) - return self._generate([request]) + outputs = self._generate([request]) + return self.__output2chat_response([request], outputs) elif type(requests_or_prompts[0]) == list: prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts] requests = [ @@ -639,7 +646,8 @@ def generate( ) for prompt in prompts ] - return self._generate(requests) + outputs = self._generate(requests) + return self.__output2chat_response(requests, outputs) elif type(requests_or_prompts[0]) == Request: print(requests_or_prompts) return self._generate(requests_or_prompts) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 798da75b01..d98d327dba 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -765,6 +765,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(processed_tokens < request.tokens.size()); bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { + if (is_eos_token(request.tokens.back())) { + // remove the EOS token + request.tokens.pop_back(); + } std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token From c71c6b319d1d71bb4e0da16da9aebc05d1a160f8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 03:38:47 +0000 Subject: [PATCH 30/37] load weights in parallel --- include/flexflow/model.h | 3 + include/flexflow/utils/file_loader.h | 28 +++++++++ inference/python/chat.py | 22 ++++--- src/c/flexflow_c.cc | 5 +- src/mapper/mapper.cc | 6 ++ src/runtime/file_loader.cc | 91 ++++++++++++++++++++++++++++ src/runtime/model.cc | 57 +++++++++++++++-- src/runtime/request_manager.cc | 6 +- 8 files changed, 200 insertions(+), 18 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index e50c5f9578..3a80aa6b12 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -278,6 +278,9 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, 
RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, + LOAD_FLOAT_WEIGHT_TASK_ID, + LOAD_HALF_WEIGHT_TASK_ID, + LOAD_QUANT_WEIGHT_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 646eb18da2..44cb15d10f 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -39,7 +39,26 @@ class FileDataLoader { void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); +#ifdef DEADCODE void load_weights(FFModel *ff); +#endif + + static void + load_float_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void + load_half_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void + load_quant_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime); void load_positions(FFModel *ff, Tensor pt, @@ -54,3 +73,12 @@ class FileDataLoader { std::string weights_folder; bool use_full_precision; }; + +struct WeightLoadTaskArgs { + FFModel *ff; + FileDataLoader *loader; + Layer *layer; + int weight_idx; + WeightLoadTaskArgs(FFModel *_ff, FileDataLoader *_loader, Layer *_l, int _idx) + : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx) {} +}; diff --git a/inference/python/chat.py b/inference/python/chat.py index 70b8ee0067..95132443a2 100644 --- a/inference/python/chat.py +++ b/inference/python/chat.py @@ -21,14 +21,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 1, - "memory_per_gpu": 30000, - "zero_copy_memory_per_node": 60000, + "num_gpus": 8, + "memory_per_gpu": 34000, + "zero_copy_memory_per_node": 200000, # optional parameters - "num_cpus": 4, - "legion_utility_processors": 4, + "num_cpus": 16, + "legion_utility_processors": 16, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 1, + "tensor_parallelism_degree": 8, "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB @@ -43,7 +43,7 @@ def get_configs(): } llm_configs = { # required parameters - "llm_model": "meta-llama/Meta-Llama-3-8B-Instruct", + "llm_model": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", # optional parameters "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, @@ -85,11 +85,15 @@ def main(): llm.start_server() + nemotron_system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." + llama_generic_system = "You are a helpful an honest programming assistant." 
+ + messages=[ - {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "system", "content": nemotron_system}, {"role": "user", "content": "Is Rust better than Python?"}, ] - llm.generate(messages, max_new_tokens=256) + llm.generate(messages, max_new_tokens=1024) llm.stop_server() diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 837608c9f6..b4056960f4 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2929,7 +2929,10 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - handle->load_weights(model); + // handle->load_weights(model); + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + handle->load_weights_parallel(model, ctx, runtime); } // // ----------------------------------------------------------------------- diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d7b9a5e99d..e79bf5e371 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -288,6 +288,12 @@ void FFMapper::select_task_options(const MapperContext ctx, output.initial_proc = all_cpus[0]; return; } + if ((task.task_id == LOAD_FLOAT_WEIGHT_TASK_ID) || + (task.task_id == LOAD_HALF_WEIGHT_TASK_ID) || + (task.task_id == LOAD_QUANT_WEIGHT_TASK_ID)) { + output.initial_proc = all_cpus[0]; + return; + } if (task.task_id == TOP_LEVEL_TASK_ID) { output.initial_proc = all_cpus[0]; // control replicate top level task diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index e73893475c..1c1dba32c8 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -16,6 +16,7 @@ #include "flexflow/utils/file_loader.h" #include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/model.h" #include using namespace std; @@ -851,6 +852,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, delete data; } +#ifdef DEADCODE void FileDataLoader::load_weights(FFModel *ff) { for (Layer *l : ff->layers) { if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { @@ -883,3 +885,92 @@ void FileDataLoader::load_weights(FFModel *ff) { } } } +#endif + +void FileDataLoader::load_float_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); +} + +void FileDataLoader::load_half_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); +} + +void FileDataLoader::load_quant_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + args->loader->load_quantization_weight( + args->ff, args->layer, args->weight_idx); +} + +void FileDataLoader::load_weights_parallel(FFModel *ff, + Context ctx, + Runtime *runtime) { + std::vector futures; + + for (Layer *l : ff->layers) { + if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { + continue; + } + + for (int i = 0; i < l->numWeights; i++) { + Tensor weight = 
l->weights[i]; + if (weight == NULL) { + continue; + } + + if (l->op_type == OP_LORA) { + continue; + } + + // Create task arguments + WeightLoadTaskArgs args(ff, this, l, i); + + switch (weight->data_type) { + case DT_HALF: { + TaskLauncher launcher( + LOAD_HALF_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); + break; + } + case DT_FLOAT: { + TaskLauncher launcher( + LOAD_FLOAT_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); + break; + } + case DT_INT4: + case DT_INT8: { + TaskLauncher launcher( + LOAD_QUANT_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); + break; + } + default: + assert(false && "Unsupported data type"); + } + } + } + + // Wait for all tasks to complete + for (Future &f : futures) { + f.get_void_result(); + } +} diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 465ee21fc9..6bb11b6fa5 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3420,11 +3420,13 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; - if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && - ((l->op_type == OP_LINEAR && std::string(l->name).find("attn.o_proj") != std::string::npos) || - is_mlp_block(layer_idx) || - (l->op_type == OP_LINEAR && std::string(l->name).find("mlp.down_proj") != std::string::npos) - )) { + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_LINEAR && + std::string(l->name).find("attn.o_proj") != std::string::npos) || + is_mlp_block(layer_idx) || + (l->op_type == OP_LINEAR && + std::string(l->name).find("mlp.down_proj") != std::string::npos))) { return true; } return false; @@ -4798,6 +4800,51 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(LOAD_FLOAT_WEIGHT_TASK_ID, + "load_float_weight_task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "load_float_weight_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LOAD_HALF_WEIGHT_TASK_ID, + "load_half_weight_task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "load_half_weight_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LOAD_QUANT_WEIGHT_TASK_ID, + "load_quant_weight_task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "load_quant_weight_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } #endif // ElementUnary task { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d98d327dba..fddaae09ce 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -3025,7 +3025,7 @@ void 
RequestManager::serve_incr_decoding(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); // Legion futures for inc_decoding and spec_infer @@ -3087,7 +3087,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); } @@ -3098,7 +3098,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[ssm]->load_weights(ssm); + im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime); // init operators im->init_operators_inference(ssm); } From d54fcf292c6a59204b3c4a8f36098f1c29e74b1f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 15:21:23 +0000 Subject: [PATCH 31/37] cleanup --- include/flexflow/model.h | 4 +- include/flexflow/utils/file_loader.h | 31 +++----- src/c/flexflow_c.cc | 1 - src/mapper/mapper.cc | 4 +- src/runtime/file_loader.cc | 115 +++++++-------------------- src/runtime/model.cc | 39 +-------- 6 files changed, 48 insertions(+), 146 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 3a80aa6b12..e352159af0 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -278,9 +278,7 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, - LOAD_FLOAT_WEIGHT_TASK_ID, - LOAD_HALF_WEIGHT_TASK_ID, - LOAD_QUANT_WEIGHT_TASK_ID, + LOAD_WEIGHT_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 44cb15d10f..8735f23571 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -39,25 +39,12 @@ class FileDataLoader { void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); -#ifdef DEADCODE - void load_weights(FFModel *ff); -#endif static void - load_float_weight_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - static void - load_half_weight_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - static void - load_quant_weight_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); + load_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime); void load_positions(FFModel *ff, @@ -79,6 +66,12 @@ struct WeightLoadTaskArgs { FileDataLoader *loader; Layer *layer; int weight_idx; - WeightLoadTaskArgs(FFModel *_ff, FileDataLoader *_loader, Layer *_l, int _idx) - : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx) {} + DataType data_type; + WeightLoadTaskArgs(FFModel *_ff, + FileDataLoader *_loader, + Layer *_l, + int _idx, + DataType _data_type) + : ff(_ff), loader(_loader), 
layer(_l), weight_idx(_idx), + data_type(_data_type) {} }; diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index b4056960f4..4094fb7b44 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2929,7 +2929,6 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - // handle->load_weights(model); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; handle->load_weights_parallel(model, ctx, runtime); diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index e79bf5e371..c02f70f752 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -288,9 +288,7 @@ void FFMapper::select_task_options(const MapperContext ctx, output.initial_proc = all_cpus[0]; return; } - if ((task.task_id == LOAD_FLOAT_WEIGHT_TASK_ID) || - (task.task_id == LOAD_HALF_WEIGHT_TASK_ID) || - (task.task_id == LOAD_QUANT_WEIGHT_TASK_ID)) { + if (task.task_id == LOAD_WEIGHT_TASK_ID) { output.initial_proc = all_cpus[0]; return; } diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 1c1dba32c8..3ebe6cf095 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -852,69 +852,33 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, delete data; } -#ifdef DEADCODE -void FileDataLoader::load_weights(FFModel *ff) { - for (Layer *l : ff->layers) { - if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) { - continue; - } - for (int i = 0; i < l->numWeights; i++) { - Tensor weight = l->weights[i]; - if (weight == NULL) { - continue; - } - // TODO: currently skip Lora layers - if (l->op_type == OP_LORA) { - continue; - } - switch (weight->data_type) { - case DT_HALF: - load_single_weight_tensor(ff, l, i); - break; - case DT_FLOAT: - load_single_weight_tensor(ff, l, i); - break; - case DT_INT4: - case DT_INT8: - // load weights in quantization - load_quantization_weight(ff, l, i); - break; - default: - assert(false && "Unsupported data type"); - } - } - } -} -#endif - -void FileDataLoader::load_float_weight_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime) { - WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; - args->loader->load_single_weight_tensor( - args->ff, args->layer, args->weight_idx); -} - -void FileDataLoader::load_half_weight_task( +void FileDataLoader::load_weight_task( Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime) { WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; - args->loader->load_single_weight_tensor( - args->ff, args->layer, args->weight_idx); -} -void FileDataLoader::load_quant_weight_task( - Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime) { - WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; - args->loader->load_quantization_weight( - args->ff, args->layer, args->weight_idx); + switch (args->data_type) { + case DT_HALF: { + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); + break; + } + case DT_FLOAT: { + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); + break; + } + case DT_INT4: + case DT_INT8: { + args->loader->load_quantization_weight( + args->ff, args->layer, args->weight_idx); + break; + } + default: + assert(false && 
"Unsupported data type"); + } } void FileDataLoader::load_weights_parallel(FFModel *ff, @@ -937,35 +901,16 @@ void FileDataLoader::load_weights_parallel(FFModel *ff, continue; } - // Create task arguments - WeightLoadTaskArgs args(ff, this, l, i); - - switch (weight->data_type) { - case DT_HALF: { - TaskLauncher launcher( - LOAD_HALF_WEIGHT_TASK_ID, - TaskArgument(&args, sizeof(WeightLoadTaskArgs))); - futures.push_back(runtime->execute_task(ctx, launcher)); - break; - } - case DT_FLOAT: { - TaskLauncher launcher( - LOAD_FLOAT_WEIGHT_TASK_ID, - TaskArgument(&args, sizeof(WeightLoadTaskArgs))); - futures.push_back(runtime->execute_task(ctx, launcher)); - break; - } - case DT_INT4: - case DT_INT8: { - TaskLauncher launcher( - LOAD_QUANT_WEIGHT_TASK_ID, - TaskArgument(&args, sizeof(WeightLoadTaskArgs))); - futures.push_back(runtime->execute_task(ctx, launcher)); - break; - } - default: - assert(false && "Unsupported data type"); + if (weight->data_type != DT_FLOAT && weight->data_type != DT_HALF && + weight->data_type != DT_INT4 && weight->data_type != DT_INT8) { + assert(false && "Unsupported data type"); } + + // Create task arguments + WeightLoadTaskArgs args(ff, this, l, i, weight->data_type); + TaskLauncher launcher(LOAD_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); } } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 6bb11b6fa5..ca947039d0 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4801,47 +4801,16 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(LOAD_FLOAT_WEIGHT_TASK_ID, - "load_float_weight_task"); + TaskVariantRegistrar registrar(LOAD_WEIGHT_TASK_ID, "load_weight_task"); registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "load_float_weight_task"); + Runtime::preregister_task_variant( + registrar, "load_weight_task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant( - registrar); - } - } - { - TaskVariantRegistrar registrar(LOAD_HALF_WEIGHT_TASK_ID, - "load_half_weight_task"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "load_half_weight_task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( - registrar); - } - } - { - TaskVariantRegistrar registrar(LOAD_QUANT_WEIGHT_TASK_ID, - "load_quant_weight_task"); - registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "load_quant_weight_task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( + runtime->register_task_variant( registrar); } } From f7485151e75bc8244f45920723d40b0fd965503b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:00:54 +0000 Subject: [PATCH 32/37] cleanup --- include/flexflow/flexflow_c.h | 3 + inference/peft/peft.cc | 2 +- inference/python/ff_peft.py | 50 +-- inference/python/peft_demo/INSTRUCTIONS.md | 2 +- inference/python/peft_demo/demo.ipynb | 4 +- inference/python/peft_demo/demo.py | 4 +- inference/utils/download_peft_model.py | 32 +- python/flexflow/core/flexflow_cffi.py | 5 + python/flexflow/serve/serve.py | 459 ++++++++++----------- 
src/c/flexflow_c.cc | 8 + tests/peft_test.sh | 10 +- 11 files changed, 291 insertions(+), 288 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 906cacb920..677f9915cd 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -1029,6 +1029,9 @@ void flexflow_request_manager_set_max_sequence_length( int flexflow_request_manager_get_max_sequence_length( flexflow_request_manager_t handle_); +void flexflow_request_manager_set_max_concurrent_adapters( + flexflow_request_manager_t handle_, int max_concurrent_adapters); + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index da2993187c..4f2d47055a 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -256,7 +256,7 @@ void FlexFlow::top_level_task(Task const *task, LoraOptimizerConfig *optim_config = nullptr; if (enable_peft_finetuning) { // float sgd_learning_rate = 2e-1; - float sgd_learning_rate = 1.0f; + float sgd_learning_rate = 0.001f; optim_config = new LoraSGDOptimizerConfig(sgd_learning_rate); } LoraLinearConfig peft_config_finetuning = diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 35338f5227..0167cecebc 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -41,14 +41,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 2, + "num_gpus": 4, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 10000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 2, + "tensor_parallelism_degree": 4, "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB @@ -102,6 +102,23 @@ def main(): refresh_cache=configs.refresh_cache, output_file=configs.output_file, ) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + enable_peft_finetuning = len(configs.finetuning_dataset) > 0 + llm.compile( + generation_config, + max_requests_per_batch=1 if not enable_peft_finetuning else 2, + max_seq_length=256, + max_tokens_per_batch=128, + max_concurrent_adapters=1 if not enable_peft_finetuning else 2, + enable_peft_finetuning=enable_peft_finetuning, + ) + + llm.start_server() + # Add inference and/or finetuning lora lora_inference_config = None lora_finetuning_config = None @@ -111,18 +128,8 @@ def main(): configs.inference_peft_model_id, base_model_name_or_path=configs.base_model, ) - llm.add_peft(lora_inference_config) + llm.register_peft_adapter(lora_inference_config) if len(configs.finetuning_dataset) > 0: - # lora_finetuning_config = ff.LoraLinearConfig( - # llm.cache_path, - # configs.finetuning_peft_model_id, - # target_modules=["down_proj"], - # rank=16, - # lora_alpha=16, - # trainable=True, - # init_lora_weights=True, - # optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, - # ) lora_finetuning_config = ff.LoraLinearConfig( llm.cache_path, configs.inference_peft_model_id, @@ -136,22 +143,7 @@ def main(): "nesterov": False, }, ) - llm.add_peft(lora_finetuning_config) - - # Compile the LLM for inference and load the weights into memory - generation_config = ff.GenerationConfig( - do_sample=False, temperature=0.9, topp=0.8, topk=1 - ) - enable_peft_finetuning = len(configs.finetuning_dataset) > 0 - llm.compile( 
- generation_config, - enable_peft_finetuning=enable_peft_finetuning, - max_requests_per_batch=1 if not enable_peft_finetuning else 2, - max_seq_length=256, - max_tokens_per_batch=128, - ) - - llm.start_server() + llm.register_peft_adapter(lora_finetuning_config) requests = [] # Serving diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md index 9b2a7a53b2..0f78efdea9 100644 --- a/inference/python/peft_demo/INSTRUCTIONS.md +++ b/inference/python/peft_demo/INSTRUCTIONS.md @@ -13,7 +13,7 @@ * `export HUGGINGFACE_TOKEN="[Your token]"` * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` - * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full"` * Run the demo ``` diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb index d29ad5ad2f..ea2b8417b6 100644 --- a/inference/python/peft_demo/demo.ipynb +++ b/inference/python/peft_demo/demo.ipynb @@ -194,7 +194,7 @@ } ], "source": [ - "args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]\n", + "args = [configs.inference_peft_model_id]\n", "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)" ] }, @@ -1813,7 +1813,7 @@ "configs = SimpleNamespace(**configs_dict)\n", "\n", "\n", - "args = [configs.finetuning_peft_model_id+\"-dolly\", '--base_model_name', configs.base_model]\n", + "args = [configs.finetuning_peft_model_id+\"-dolly\"]\n", "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)\n", "\n", "# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs\n", diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py index 34b15b9a76..b70f3c8966 100644 --- a/inference/python/peft_demo/demo.py +++ b/inference/python/peft_demo/demo.py @@ -98,7 +98,7 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data file.write('') # Download base and peft inference models -args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model] +args = [configs.inference_peft_model_id] # hf_token = input("Please enter your HuggingFace personal access token: ") # subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) subprocess.run(['python', '../../utils/download_peft_model.py'] + args) @@ -206,7 +206,7 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data ) llm.add_peft(lora_inference_config) -args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model] +args = [configs.finetuning_peft_model_id] #hf_token = input("Please enter your HuggingFace personal access token: ") # subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) # subprocess.run(['python', '../../utils/download_peft_model.py'] + args) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 38dd577574..2ee63b10bc 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -1,13 +1,11 @@ #!/usr/bin/env python import flexflow.serve as ff import argparse, os +from peft import PeftConfig def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--base_model_name", type=str, help="Name of the model to download" - ) parser.add_argument( "peft_model_ids", type=str, @@ -48,19 +46,21 @@ def main(args): else: data_types = 
(ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) - for data_type in data_types: - llm = ff.LLM( - args.base_model_name, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - for peft_model_id in args.peft_model_ids: - lora_config = ff.LoraLinearConfig(llm.cache_path, peft_model_id) - llm.add_peft(lora_config) - llm.download_hf_weights_if_needed() - llm.download_hf_config() - llm.download_hf_tokenizer_if_needed() + for peft_model_id in args.peft_model_ids: + hf_config = PeftConfig.from_pretrained(peft_model_id) + for data_type in data_types: + llm = ff.LLM( + hf_config.base_model_name_or_path, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + # Download base model config, weights and tokenizer + llm.download_hf_config() + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + # Download PEFT adapter + llm.download_peft_adapter_if_needed(peft_model_id) if __name__ == "__main__": diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 4ff8348f46..02eff0ca76 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1633,6 +1633,11 @@ def set_max_sequence_length(self, max_length): def get_max_sequence_length(self): return ffc().flexflow_request_manager_get_max_sequence_length(self.handle) + + def set_max_concurrent_adapters(self, max_adapters): + return ffc().flexflow_request_manager_set_max_concurrent_adapters( + self.handle, max_adapters + ) def set_enable_peft_finetuning(self, enable_peft_finetuning): return ffc().flexflow_request_manager_set_enable_peft_finetuning( diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 9d3fa19706..7932441c81 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -31,9 +31,17 @@ from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi import torch, shutil, hashlib, json, gc -from typing import Union, List +from typing import Union, List, Tuple +from safetensors import safe_open from huggingface_hub import snapshot_download +from enum import Enum + + +class CachedResourceType(Enum): + TOKENIZER = "tokenizer" + WEIGHTS = "weights" + class _SupportedModels: def __init__( @@ -104,14 +112,14 @@ def __init__( self.output_file = output_file self.rm = None self.pefts = {} - self.tokenizer=None + self.tokenizer = None def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() - def add_peft(self, lora_config: LoraLinearConfig): + def register_peft_adapter(self, lora_config: LoraLinearConfig): """Add a PEFT adapter to the LLM""" if lora_config is None: raise ValueError("lora_config cannot be None") @@ -145,9 +153,12 @@ def add_peft(self, lora_config: LoraLinearConfig): f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" ) + lora_config.ff_compile() + self.pefts[lora_config] = { "peft_config": peft_config, "peft_type": peft_config.peft_type, + "ff_peft_model_id": self.model.ffmodel.register_peft_adapter(lora_config), } def get_ff_peft_id(self, lora_config: LoraLinearConfig) -> PEFTModelID: @@ -175,34 +186,33 @@ def download_hf_config(self): os.makedirs(config_dir, exist_ok=True) print(f"Creating directory {config_dir} (if it doesn't exist)...") print(f"Saving {self.model_name} configs to file {config_path}...") - self.hf_config.to_json_file(config_path) - - # 
Save PEFT configs if the LLM has any registered PEFTs - for ff_peft_config, peft_dict in self.pefts.items(): - peft_config = peft_dict["peft_config"] - peft_model_id = ff_peft_config.peft_model_id - peft_config_dir = os.path.join( - os.path.expanduser(self.cache_path), "configs", peft_model_id.lower() - ) - os.makedirs(peft_config_dir, exist_ok=True) - peft_config_path = os.path.join(peft_config_dir, "config.json") - print(f"Saving {peft_model_id} configs to file {peft_config_path}...") - with open(peft_config_path, "w") as json_file: - - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return super().default(obj) - - json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - def __get_revision_hashes(self, model_name: str, folder: str): + # self.hf_config.to_json_file(config_path) + src_folder = snapshot_download( + repo_id=self.model_name, allow_patterns="config.json" + ) + src_path = os.path.join(src_folder, "config.json") + if os.path.exists(src_path): + shutil.copy(src_path, config_path) + + def __get_revision_hashes( + self, model_name: str, folder: str + ) -> Tuple[Union[str, None], str, str]: + """Return the commit hash of the object (weight, tokenizer, etc) cached by FlexFlow and the latest commit hash of the object from HuggingFace (or other source) + + Args: + model_name (str): Name of the model cached by FlexFlow + folder (str): Folder where the cached object is stored + + Returns: + ff_revision: Commit hash of the object cached by FlexFlow + ff_revision_filepath: Path to the file containing the commit hash of the object cached by FlexFlow + latest_revision: Latest commit hash of the object from HuggingFace (or other source) + """ ff_revision = None - ff_revision_file = os.path.join(folder, "rev_sha.txt") + ff_revision_filepath = os.path.join(folder, "rev_sha.txt") - if os.path.exists(ff_revision_file): - ff_revision = "".join(open(ff_revision_file).read().split()) + if os.path.exists(ff_revision_filepath): + ff_revision = "".join(open(ff_revision_filepath).read().split()) if os.path.exists(model_name) and os.path.isdir(model_name): # Local model @@ -215,16 +225,21 @@ def __get_revision_hashes(self, model_name: str, folder: str): # Remote HuggingFace model hf_api = HfApi() latest_revision = hf_api.model_info(self.model_name).sha - return ff_revision, ff_revision_file, latest_revision + return ff_revision, latest_revision - def download_hf_weights_if_needed(self): - """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. - If not, or if the refresh_cache parameter is set to True, download new weights. + def __get_resource_path( + self, model_name: str, resource_type: CachedResourceType + ) -> str: + """Returns the path to the folder where the model weights or tokenizer files are stored - If any PEFT adapter is registered, perform the same operation for PEFT. 
- """ + Args: + model_name (str): Name of the model + resource_type (CachedResourceType): Whether to get the path to the weights or the tokenizer - def get_weights_path(model_name): + Returns: + str: Path to the folder where the model weights or tokenizer files are stored + """ + if resource_type == CachedResourceType.WEIGHTS: return os.path.join( os.path.expanduser(self.cache_path), "weights", @@ -235,19 +250,56 @@ def get_weights_path(model_name): else "half-precision" ), ) + elif resource_type == CachedResourceType.TOKENIZER: + return os.path.join( + os.path.expanduser(self.cache_path), "tokenizers", model_name.lower() + ) + else: + raise ValueError(f"Invalid resource type {resource_type}") - def refresh_cache_if_needed(model_name): - weights_path = get_weights_path(model_name) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." - ) - if os.path.exists(weights_path): - shutil.rmtree(weights_path) - os.makedirs(weights_path, exist_ok=True) + def __need_cache_refresh( + self, model_name: str, resource_type: CachedResourceType + ) -> bool: + """Check whether the model weights or tokenizer files are available and up to date. + If they need a refresh, create the folder for the resource, save the new commit hash to the rev_sha.txt file, delete any existing files, and return true. - def get_hf_llm(model_name): - return AutoModelForCausalLM.from_pretrained( + Args: + model_name (str): Name of the model to check + resource_type (CachedResourceType): Whether to check the weights or the tokenizer + + Returns: + bool: True if the weights or tokenizer need a refresh, False otherwise + """ + need_refresh = False + resource_path = self.__get_resource_path(model_name, resource_type) + if self.refresh_cache or not os.path.exists(resource_path): + need_refresh = True + else: + ff_revision, latest_revision = self.__get_revision_hashes( + self.model_name, resource_path + ) + if ff_revision != latest_revision: + need_refresh = True + if need_refresh: + print( + f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ..." + ) + if os.path.exists(resource_path): + shutil.rmtree(resource_path) + os.makedirs(resource_path, exist_ok=True) + ff_revision_file = os.path.join(resource_path, "rev_sha.txt") + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + return need_refresh + + def download_hf_weights_if_needed(self) -> None: + """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights and convert them. + """ + + # TODO: edit this to download the weights using snapshot_download and convert them to FlexFlow format without loading them to GPU + def download_and_convert_llm_weights(model_name): + hf_model = AutoModelForCausalLM.from_pretrained( model_name, trust_remote_code=True, torch_dtype=( @@ -256,73 +308,26 @@ def get_hf_llm(model_name): else torch.float16 ), ) - - def download_llm_weights(): - refresh_cache_if_needed(self.model_name) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, self.weights_path + # Convert the model to FlexFlow format + weights_path = self.__get_resource_path( + model_name, CachedResourceType.WEIGHTS ) - if ff_revision != latest_revision: - print( - f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." 
- ) - hf_model = get_hf_llm(self.model_name) - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print(f"Done converting the weights for model {self.model_name}") - # Deallocate hf model - del hf_model - gc.collect() - torch.cuda.empty_cache() - - def convert_peft_model(hf_peft_model, peft_type, weights_path): - for name, params in hf_peft_model.named_parameters(): - if peft_type.lower() in name: - name = name.replace("base_model.model.model.", "").replace( - ".default", "" - ) - name = self.model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - - def download_peft_weights(): - for ff_peft_config, peft_dict in self.pefts.items(): - if not ff_peft_config.init_lora_weights: - peft_config = peft_dict["peft_config"] - peft_type = peft_dict["peft_type"] - peft_model_id = ff_peft_config.peft_model_id - - weights_path = get_weights_path(peft_model_id) - refresh_cache_if_needed(peft_model_id) - ff_revision, ff_revision_file, latest_revision = ( - self.__get_revision_hashes(peft_model_id, weights_path) - ) - - if ff_revision != latest_revision: - print( - f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." - ) - hf_model = get_hf_llm(peft_model_id) - hf_peft_model = PeftModel.from_pretrained( - hf_model, peft_model_id, config=peft_config - ) - # Convert the model to FlexFlow format - convert_peft_model(hf_peft_model, peft_type, weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print(f"Done converting the weights for model {peft_model_id}") - # Deallocate hf model - del hf_peft_model - del hf_model - gc.collect() - torch.cuda.empty_cache() - - self.weights_path = get_weights_path(self.model_name) - download_llm_weights() - download_peft_weights() + self.model_class.convert_hf_model(hf_model, weights_path) + # Save new revision hash to file + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + need_refresh = self.__need_cache_refresh( + self.model_name, CachedResourceType.WEIGHTS + ) + if need_refresh: + print( + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." + ) + download_and_convert_llm_weights(self.model_name) def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -331,25 +336,10 @@ def download_hf_tokenizer_if_needed(self): print("Loading tokenizer...") # Use local cache, or download new version - self.tokenizer_path = os.path.join( - os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower() + need_refresh = self.__need_cache_refresh( + self.model_name, CachedResourceType.TOKENIZER ) - if self.refresh_cache: - print( - f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." 
- ) - if os.path.exists(self.tokenizer_path): - shutil.rmtree(self.tokenizer_path) - if not os.path.exists(self.tokenizer_path): - print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") - os.makedirs(self.tokenizer_path, exist_ok=True) - - # Get local revision SHA, check if it matches latest one on huggingface - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, self.tokenizer_path - ) - - if ff_revision != latest_revision: + if need_refresh: print( f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." ) @@ -367,15 +357,76 @@ def download_hf_tokenizer_if_needed(self): hf_tokenizer_path = snapshot_download( repo_id=self.model_name, allow_patterns=target_tokenizer_files ) + tokenizer_path = self.__get_resource_path( + self.model_name, CachedResourceType.TOKENIZER + ) for file in target_tokenizer_files: src_path = os.path.join(hf_tokenizer_path, file) - dst_path = os.path.join(self.tokenizer_path, file) + dst_path = os.path.join(tokenizer_path, file) if os.path.exists(src_path): shutil.copy(src_path, dst_path) print("Done updating HF tokenizer.") - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) + + def download_peft_adapter_if_needed(self, hf_peft_model_id: str): + """Check in the folder specified by the cache_path whether the PEFT model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights and convert them. + """ + + def download_and_convert_peft_model(hf_peft_model_id: str): + if ( + self.data_type != DataType.DT_FLOAT + and self.data_type != DataType.DT_HALF + ): + raise ValueError( + "data_type must be either DataType.DT_FLOAT or DataType.DT_HALF" + ) + + # Save peft config to file + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", hf_peft_model_id.lower() + ) + dst_path = os.path.join(peft_config_dir, "config.json") + os.makedirs(peft_config_dir, exist_ok=True) + print(f"Saving {hf_peft_model_id} configs to file {dst_path}...") + config_path = snapshot_download( + repo_id=hf_peft_model_id, allow_patterns="adapter_config.json" + ) + src_path = os.path.join(config_path, "adapter_config.json") + if os.path.exists(src_path): + shutil.copy(src_path, dst_path) + + # Save peft weights to file + adapter_path = snapshot_download( + repo_id=hf_peft_model_id, allow_patterns="adapter_model.safetensors" + ) + weights_path = self.__get_resource_path( + hf_peft_model_id.lower(), CachedResourceType.WEIGHTS + ) + with safe_open(adapter_path, framework="pt", device="cpu") as f: + for tensor_name in f.keys(): + tensor = f.get_tensor(tensor_name) + if self.data_type == DataType.DT_HALF: + tensor = tensor.half() + else: + tensor = tensor.float() + tensor_name = tensor_name.replace( + "base_model.model.model.", "" + ).replace(".default", "") + print(tensor_name) + + tensor_name = self.model_class.convert_hf_weight_name(tensor_name) + tensor.detach().cpu().numpy().tofile( + f"{weights_path}/{tensor_name}" + ) + + need_refresh = self.__need_cache_refresh( + hf_peft_model_id, CachedResourceType.WEIGHTS + ) + if need_refresh: + print( + f"'{hf_peft_model_id}' local model weights need updating! Downloading/converting new weights now..." 
+ ) + download_and_convert_peft_model(hf_peft_model_id) def compile( self, @@ -383,10 +434,8 @@ def compile( max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - model_specific_data_parallelism_degree: int = None, - model_specific_tensor_parallelism_degree: int = None, - model_specific_pipeline_parallelism_degree: int = None, ssms: list = [], ): """Compile the LLM for inference and load the weights into memory @@ -399,14 +448,10 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1 + :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_data_parallelism_degree: int, optional - :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_tensor_parallelism_degree: int, optional - :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the LLM a different pipeline parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_pipeline_parallelism_degree: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ @@ -422,20 +467,6 @@ def compile( assert type(self) == LLM mode = InferenceMode.INC_DECODING_MODE - # Apply model-specific parallelism degrees, if needed - if model_specific_data_parallelism_degree: - self.ffconfig.data_parallelism_degree = ( - model_specific_data_parallelism_degree - ) - if model_specific_tensor_parallelism_degree: - self.ffconfig.tensor_parallelism_degree = ( - model_specific_tensor_parallelism_degree - ) - if model_specific_pipeline_parallelism_degree: - self.ffconfig.pipeline_parallelism_degree = ( - model_specific_pipeline_parallelism_degree - ) - self.max_seq_length = max_seq_length # Create request manager and set serving configuration @@ -443,6 +474,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_max_concurrent_adapters(max_concurrent_adapters) self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate the relevant model @@ -473,8 +505,11 @@ def compile( else 20 ) + weights_path = self.__get_resource_path( + self.model_name, CachedResourceType.WEIGHTS + ) self.fileloader = FileDataLoader( - self.weights_path, + weights_path, model_configs.num_attention_heads, model_configs.num_key_value_heads, model_configs.hidden_size, @@ -498,21 +533,17 @@ def compile( eos_token_id = [eos_token_id] elif type(eos_token_id) != list: raise ValueError("eos_token_id must be an integer or a list of integers") + tokenizer_path = self.__get_resource_path( + self.model_name, CachedResourceType.TOKENIZER + ) 
self.rm.register_tokenizer( - self.model_type, bos_token_id, eos_token_id, self.tokenizer_path + self.model_type, bos_token_id, eos_token_id, tokenizer_path ) self.rm.register_output_filepath(self.output_file) for ssm in self.ssms: self.rm.register_ssm_model(ssm.model.ffmodel) - # Add PEFT layer if registered - for ff_peft_config, peft_dict in self.pefts.items(): - ff_peft_config.ff_compile() - ff_peft_model_id = self.model.ffmodel.register_peft_adapter(ff_peft_config) - peft_dict["ff_peft_model_id"] = ff_peft_model_id - - # start background server if (mode == InferenceMode.TREE_VERIFY_MODE) or ( mode == InferenceMode.INC_DECODING_MODE @@ -528,7 +559,7 @@ def _generate(self, requests: List[Request]) -> List[GenerationResult]: if req.req_type == RequestType.REQ_INFERENCE: # check max_length and max_new_tokens parameters if req.max_length == -1 and req.max_new_tokens == -1: - req.max_length = self.max_seq_length -1 + req.max_length = self.max_seq_length - 1 elif req.max_length != -1 and req.max_new_tokens != -1: warnings.warn( f"Both `max_new_tokens` (={req.max_new_tokens}) and `max_length`(={req.max_length}) seem to have been set. `max_new_tokens` will take precedence." @@ -547,7 +578,7 @@ def _generate(self, requests: List[Request]) -> List[GenerationResult]: f"max_new_tokens ({req.max_new_tokens}) is not allowed for finetuning requests." ) if req.max_length == -1: - req.max_length = self.max_seq_length -1 + req.max_length = self.max_seq_length - 1 if req.max_length >= self.max_seq_length: raise ValueError( f"max_length ({req.max_length}) exceeds the maximum sequence length ({self.max_seq_length})" @@ -564,20 +595,30 @@ def __chat2prompt(self, messages: List[dict]) -> str: """ # ensure that each element is a dictionary, containing the "role" and "content" keys for message in messages: - if type(message) != dict or "role" not in message or "content" not in message: + if ( + type(message) != dict + or "role" not in message + or "content" not in message + ): raise ValueError( "Each element in the list must be a dictionary with the keys 'role' and 'content'" ) if self.tokenizer is None: self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) if self.tokenizer.chat_template is None: - raise ValueError(f"Model {self.model_name} does not support chat completion") - return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - - def __output2chat_response(self, requests: List[Request], outputs: List[GenerationResult]) -> List[GenerationResult]: - assert(len(requests) == len(outputs)) + raise ValueError( + f"Model {self.model_name} does not support chat completion" + ) + return self.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + def __output2chat_response( + self, requests: List[Request], outputs: List[GenerationResult] + ) -> List[GenerationResult]: + assert len(requests) == len(outputs) for i in range(len(outputs)): - outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt):] + outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt) :] return outputs def generate( @@ -635,7 +676,9 @@ def generate( outputs = self._generate([request]) return self.__output2chat_response([request], outputs) elif type(requests_or_prompts[0]) == list: - prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts] + prompts = [ + self.__chat2prompt(messages) for messages in requests_or_prompts + ] requests = [ Request( req_type=RequestType.REQ_INFERENCE, @@ -652,7 +695,9 @@ def 
generate( print(requests_or_prompts) return self._generate(requests_or_prompts) else: - assert False, "Please pass a string, list of strings, Request, or list of Requests" + assert ( + False + ), "Please pass a string, list of strings, Request, or list of Requests" def start_server(self): self.rm.start_server(self.model.ffmodel) @@ -695,10 +740,8 @@ def compile( max_requests_per_batch: int = 16, max_seq_length: int = 256, max_tokens_per_batch: int = 2048, + max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - model_specific_data_parallelism_degree: int = 1, - model_specific_tensor_parallelism_degree: int = 1, - model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory @@ -710,14 +753,10 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 2048 :type max_tokens_per_batch: int, optional + :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1 + :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 - :type model_specific_data_parallelism_degree: int, optional - :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults to 1 - :type model_specific_tensor_parallelism_degree: int, optional - :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the SSM a different pipeline parallelism degree than the default one, defaults to 1 - :type model_specific_pipeline_parallelism_degree: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ @@ -726,51 +765,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + max_concurrent_adapters, enable_peft_finetuning, - model_specific_data_parallelism_degree, - model_specific_tensor_parallelism_degree, - model_specific_pipeline_parallelism_degree, ssms, ) - -from safetensors import safe_open -from huggingface_hub import hf_hub_download -def download_and_convert_peft_model(peft_model_id: str, data_type: DataType = DataType.DT_HALF, cache_path: str = "", refresh_cache: bool = False): - if data_type != DataType.DT_FLOAT and data_type != DataType.DT_HALF: - raise ValueError("data_type must be either DataType.DT_FLOAT or DataType.DT_HALF") - adapter_path = hf_hub_download(repo_id=peft_model_id, filename="adapter_model.safetensors") - peft_config = PeftConfig.from_pretrained(peft_model_id) - base_model_name_or_path = peft_config.base_model_name_or_path - llm = LLM(base_model_name_or_path, data_type, cache_path, refresh_cache) - - # Save peft config to file - peft_config_dir = os.path.join( - os.path.expanduser(llm.cache_path), "configs", peft_model_id.lower() - ) - os.makedirs(peft_config_dir, exist_ok=True) - peft_config_path = os.path.join(peft_config_dir, "config.json") - print(f"Saving {peft_model_id} configs to file {peft_config_path}...") - with open(peft_config_path, "w") as json_file: - - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - 
return list(obj) - return super().default(obj) - - json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - # Save peft weights to file - with safe_open(adapter_path, framework="pt", device="cpu") as f: - for tensor_name in f.keys(): - tensor = f.get_tensor(tensor_name) - if data_type == DataType.DT_HALF: - tensor = tensor.half() - else: - tensor = tensor.float() - tensor_name = tensor_name.replace("base_model.model.model.", "").replace(".default", "") - print(tensor_name) - - tensor_name = llm.model_class.convert_hf_weight_name(tensor_name) - tensor.detach().cpu().numpy().tofile(f"{llm.weights_path}/{tensor_name}") - diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 4094fb7b44..e16b0e87bd 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2785,6 +2785,14 @@ int flexflow_request_manager_get_max_sequence_length( return handle->get_max_sequence_length(); } +void flexflow_request_manager_set_max_concurrent_adapters( + flexflow_request_manager_t handle_, int max_concurrent_adapters) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_concurrent_adapters(max_concurrent_adapters); + DEBUG_PRINT("[RequestManager] set max_concurrent_adapters %d", + max_concurrent_adapters); +} + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 6152844f5e..e497d4224e 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -31,16 +31,16 @@ mkdir -p ./inference/output export LEGION_BACKTRACE=1 # Download test model -python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m +python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora # Run PEFT in Huggingface to get ground truth tensors -python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision -lr 1.0 +python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision -lr 0.001 # Python test echo "Python test" -# python ./inference/python/ff_peft.py +python ./inference/python/ff_peft.py # Check alignment -# python ./tests/peft/peft_alignment_test.py -tp 2 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 0.001 # C++ test echo "C++ test" @@ -55,7 +55,7 @@ echo "C++ test" --use-full-precision \ --inference-debugging # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 4 -lr 1.0 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 0.001 # Print succeess message echo "" From 266a1edd990d100b59bfc618c00b200d6b00d857 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:06:25 +0000 Subject: [PATCH 33/37] load weights faster in inference test --- tests/inference/python_test_configs/generate_configs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 4f7929e2db..637198f6ff 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -4,12 +4,12 @@ # Base configs dictionaries ff_init_configs = { # required parameters - "num_gpus": 4, + "num_gpus": 8, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters - "num_cpus": 4, - "legion_utility_processors": 4, + "num_cpus": 
8, + "legion_utility_processors": 8, "data_parallelism_degree": 1, "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, From d771f6bb1b767dd87ac1836d682a8d8eddeea7bc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:10:47 +0000 Subject: [PATCH 34/37] fix --- tests/inference/python_test_configs/generate_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 637198f6ff..2d6f115542 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -4,7 +4,7 @@ # Base configs dictionaries ff_init_configs = { # required parameters - "num_gpus": 8, + "num_gpus": 4, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # optional parameters From fc626c67a6a88e29b7eb36653ea63c523b549857 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:38:25 +0000 Subject: [PATCH 35/37] cleanup and fixes --- inference/models/opt.cc | 3 --- python/flexflow/serve/serve.py | 3 +++ src/runtime/model.cc | 26 ++++++++++++++----- .../python_test_configs/generate_configs.py | 7 +++-- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 03bb6600de..cb3d5290cf 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -243,9 +243,6 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers." + std::to_string(i) + ".fc2").c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("fc2"), std::string("layers." + - // std::to_string(i) + ".fc2.lora").c_str()); } // final diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 7932441c81..498fb4b616 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -463,6 +463,9 @@ def compile( mode = InferenceMode.TREE_VERIFY_MODE elif type(self) == SSM: mode = InferenceMode.BEAM_SEARCH_MODE + self.ffconfig.data_parallelism_degree = 1 + self.ffconfig.tensor_parallelism_degree = 1 + self.ffconfig.pipeline_parallelism_degree = 1 else: assert type(self) == LLM mode = InferenceMode.INC_DECODING_MODE diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ca947039d0..2a76415818 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3420,13 +3420,25 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ((l->op_type == OP_LINEAR && - std::string(l->name).find("attn.o_proj") != std::string::npos) || - is_mlp_block(layer_idx) || - (l->op_type == OP_LINEAR && - std::string(l->name).find("mlp.down_proj") != std::string::npos))) { + if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && l->op_type == OP_LINEAR && + ( /*llama/mpt attention*/ + (std::string(l->name).find("attn.o_proj") != std::string::npos) || + /*opt/starcoder attention*/ + (std::string(l->name).find("self_attn.o_proj") != std::string::npos) || + /*falcon attention*/ + (std::string(l->name).find("self_attention.o_proj") != std::string::npos) || + /*llama mlp*/ + (std::string(l->name).find("mlp.down_proj") != std::string::npos) || + /*opt mlp*/ + (std::string(l->name).find("fc2") != std::string::npos) || + /*falcon mlp*/ + 
(std::string(l->name).find("mlp.dense_4h_to_h") != std::string::npos) || + /*mpt mlp*/ + (std::string(l->name).find("ffn.down_proj") != std::string::npos) || + /*starcoder mlp*/ + (std::string(l->name).find("mlp.c_proj") != std::string::npos) + ) + ) { return true; } return false; diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 2d6f115542..afb7ffb9a7 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -62,15 +62,14 @@ # starcoder_models = ["bigcode/starcoderbase-7b",] parallelism_settings = [(1, 4), (2, 2), (4, 1)] -# The paths below should be with respect to the folder from which the tests are launched (FF_HOME/tests/inference) -prompt_file = "../../inference/prompt/test.json" -output_folder = "../../inference/output" - # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) os.chdir(dname) +prompt_file = os.path.abspath("../../../inference/prompt/test.json") +output_folder = os.path.abspath("../../../inference/output") + # Generate incremental decoding configs all_models = llama_models + opt_models + falcon_models + mpt_models From ab5aa4bb638aad62d6593a512a450f8f806a446e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 9 Nov 2024 23:42:16 +0000 Subject: [PATCH 36/37] linting --- src/runtime/model.cc | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2a76415818..2a95caf6cb 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3420,25 +3420,25 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; - if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && l->op_type == OP_LINEAR && - ( /*llama/mpt attention*/ - (std::string(l->name).find("attn.o_proj") != std::string::npos) || - /*opt/starcoder attention*/ - (std::string(l->name).find("self_attn.o_proj") != std::string::npos) || - /*falcon attention*/ - (std::string(l->name).find("self_attention.o_proj") != std::string::npos) || - /*llama mlp*/ - (std::string(l->name).find("mlp.down_proj") != std::string::npos) || - /*opt mlp*/ - (std::string(l->name).find("fc2") != std::string::npos) || - /*falcon mlp*/ - (std::string(l->name).find("mlp.dense_4h_to_h") != std::string::npos) || - /*mpt mlp*/ - (std::string(l->name).find("ffn.down_proj") != std::string::npos) || - /*starcoder mlp*/ - (std::string(l->name).find("mlp.c_proj") != std::string::npos) - ) - ) { + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && l->op_type == OP_LINEAR && + (/*llama/mpt attention*/ + (std::string(l->name).find("attn.o_proj") != std::string::npos) || + /*opt/starcoder attention*/ + (std::string(l->name).find("self_attn.o_proj") != std::string::npos) || + /*falcon attention*/ + (std::string(l->name).find("self_attention.o_proj") != + std::string::npos) || + /*llama mlp*/ + (std::string(l->name).find("mlp.down_proj") != std::string::npos) || + /*opt mlp*/ + (std::string(l->name).find("fc2") != std::string::npos) || + /*falcon mlp*/ + (std::string(l->name).find("mlp.dense_4h_to_h") != std::string::npos) || + /*mpt mlp*/ + (std::string(l->name).find("ffn.down_proj") != std::string::npos) || + /*starcoder mlp*/ + 
(std::string(l->name).find("mlp.c_proj") != std::string::npos))) { return true; } return false; From 7d99cf777f0bcf15e00ee2b59b849fb3771dc61d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 11 Nov 2024 02:11:14 +0000 Subject: [PATCH 37/37] fix --- python/flexflow/serve/serve.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 498fb4b616..c2804b6966 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -270,27 +270,20 @@ def __need_cache_refresh( Returns: bool: True if the weights or tokenizer need a refresh, False otherwise """ - need_refresh = False resource_path = self.__get_resource_path(model_name, resource_type) - if self.refresh_cache or not os.path.exists(resource_path): - need_refresh = True - else: - ff_revision, latest_revision = self.__get_revision_hashes( - self.model_name, resource_path - ) - if ff_revision != latest_revision: - need_refresh = True - if need_refresh: + ff_revision, latest_revision = self.__get_revision_hashes(self.model_name, resource_path) + if self.refresh_cache or not os.path.exists(resource_path) or ff_revision != latest_revision: print( f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ..." ) if os.path.exists(resource_path): shutil.rmtree(resource_path) - os.makedirs(resource_path, exist_ok=True) - ff_revision_file = os.path.join(resource_path, "rev_sha.txt") - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - return need_refresh + os.makedirs(resource_path, exist_ok=True) + ff_revision_file = os.path.join(resource_path, "rev_sha.txt") + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + return True + return False def download_hf_weights_if_needed(self) -> None: """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date.
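
Note on the chat-completion path: the serve.py hunks above let generate() accept a list of {"role", "content"} message dicts (or a list of such lists); __chat2prompt() applies the tokenizer's chat template and __output2chat_response() strips the echoed prompt from each GenerationResult. Below is a minimal usage sketch of that path, not part of the patch: the init values, model name, and messages are placeholders, and it assumes the flexflow.serve entry points (ff.init, LLM, compile, start_server, generate, stop_server) keep the signatures shown in this series.

    # Minimal sketch (not part of the patch) of the chat-completion path in serve.py.
    # Config values, model name, and messages are illustrative placeholders.
    import flexflow.serve as ff

    # Same dict-style init used by the Python entrypoints; values are placeholders.
    ff.init(
        {
            "num_gpus": 4,
            "memory_per_gpu": 20000,
            "zero_copy_memory_per_node": 40000,
            "tensor_parallelism_degree": 4,
            "pipeline_parallelism_degree": 1,
        }
    )

    llm = ff.LLM("meta-llama/Meta-Llama-3.1-8B")
    llm.compile(
        max_requests_per_batch=16,
        max_seq_length=256,
        max_tokens_per_batch=128,
        max_concurrent_adapters=1,  # parameter introduced by this series
    )
    llm.start_server()

    # A single conversation: each element must be a dict with "role" and "content",
    # exactly as __chat2prompt() validates.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Give a one-line summary of FlexFlow Serve."},
    ]

    # The chat template is applied internally, and the prompt prefix is removed
    # from the returned output_text by __output2chat_response().
    results = llm.generate(messages)
    print(results[0].output_text)

    llm.stop_server()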
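
Note on the model.cc change: FFModel::need_to_add_allreduce() is generalized from the old is_mlp_block() check to explicit layer-name matching, so the tensor-parallel AllReduce lands after the attention output projection and the MLP down-projection for every supported model family (LLaMA, OPT, Falcon, MPT, StarCoder). The sketch below only restates that predicate for readability; the suffix list is copied from the diff and is illustrative, not an exhaustive or authoritative API.

    # Hedged restatement (not part of the patch) of the layer-name predicate in
    # FFModel::need_to_add_allreduce(); suffixes are copied from the diff above.
    ALLREDUCE_LINEAR_SUFFIXES = [
        "attn.o_proj",            # llama / mpt attention output projection
        "self_attn.o_proj",       # opt / starcoder attention output projection
        "self_attention.o_proj",  # falcon attention output projection
        "mlp.down_proj",          # llama MLP down-projection
        "fc2",                    # opt MLP second linear
        "mlp.dense_4h_to_h",      # falcon MLP down-projection
        "ffn.down_proj",          # mpt MLP down-projection
        "mlp.c_proj",             # starcoder MLP down-projection
    ]


    def needs_allreduce(layer_name: str, is_linear: bool,
                        tensor_parallelism_degree: int, inference_mode: bool) -> bool:
        """AllReduce is added only for inference, only under tensor parallelism,
        and only after the row-parallel linear that closes an attention or MLP block."""
        return (
            inference_mode
            and tensor_parallelism_degree > 1
            and is_linear
            and any(suffix in layer_name for suffix in ALLREDUCE_LINEAR_SUFFIXES)
        )


    # Example: under TP=4, the OPT feed-forward output layer triggers an AllReduce,
    # while the first feed-forward linear does not.
    assert needs_allreduce("layers.0.fc2", True, 4, True)
    assert not needs_allreduce("layers.0.fc1", True, 4, True)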