diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py
index 4d8633e167..9788765a3a 100644
--- a/inference/python/streamlit/app.py
+++ b/inference/python/streamlit/app.py
@@ -8,7 +8,7 @@ st.set_page_config(page_title="🚀💻 FlexLLM Server", layout="wide")
 
 # FastAPI server URL
-FASTAPI_URL = "http://localhost:8000/generate/" # Adjust the port if necessary
+FASTAPI_URL = "http://localhost:8000/chat/completions" # Adjust the port if necessary
 FINETUNE_URL = "http://localhost:8000/finetuning"
 
 # Initialize session state variables
@@ -30,18 +30,11 @@ def clear_chat_history():
     st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}]
 
 # Function for generating LLaMA2 response
-def generate_llama2_response(prompt_input):
-    string_dialogue = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
-    for dict_message in st.session_state.messages:
-        if dict_message["role"] == "user":
-            string_dialogue += "User: " + dict_message["content"] + "\n\n"
-        else:
-            string_dialogue += "Assistant: " + dict_message["content"] + "\n\n"
-
-    full_prompt = f"{string_dialogue} {prompt_input} Assistant: "
+def generate_llama3_response(prompt_input):
+    system_prompt = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature."
 
     # Send request to FastAPI server
-    response = requests.post(FASTAPI_URL, json={"prompt": full_prompt})
+    response = requests.post(FASTAPI_URL, json={"max_new_tokens": 1024, "messages": [{"role": "system", "content": system_prompt}] + st.session_state.messages + [{"role": "user", "content": prompt_input}]})
 
     if response.status_code == 200:
         return response.json()["response"]
@@ -58,7 +51,7 @@ def generate_llama2_response(prompt_input):
     st.sidebar.button('Clear Chat History', on_click=clear_chat_history)
 
     st.subheader('Generation parameters')
-    max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8)
+    max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=2048, value=1024, step=8)
     # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model')
     decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method')
     temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)')
@@ -181,8 +174,8 @@ def generate_llama2_response(prompt_input):
 # Generate a new response if last message is not from assistant
 if st.session_state.messages[-1]["role"] != "assistant":
     with st.chat_message("assistant"):
-        with st.spinner("Thinking..."):
-            response = generate_llama2_response(prompt)
+        with st.spinner("Running..."):
+            response = generate_llama3_response(prompt)
             placeholder = st.empty()
             full_response = ''
             for item in response:
diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py
index a1095e13dc..6ac7f4149a 100644
--- a/inference/python/streamlit/fastapi_incr.py
+++ b/inference/python/streamlit/fastapi_incr.py
@@ -46,12 +46,16 @@ class Message(BaseModel):
     content: str
 
 
+# class ChatCompletionRequest(BaseModel):
+#     model: Optional[str] = "mock-gpt-model"
+#     messages: List[Message]
+#     max_tokens: Optional[int] = 512
+#     temperature: Optional[float] = 0.1
+#     stream: Optional[bool] = False
+
 class ChatCompletionRequest(BaseModel):
-    model: Optional[str] = "mock-gpt-model"
+    max_new_tokens: Optional[int] = 1024
     messages: List[Message]
-    max_tokens: Optional[int] = 512
-    temperature: Optional[float] = 0.1
-    stream: Optional[bool] = False
 
 # Global variable to store the LLM model
 llm = None
@@ -76,12 +80,12 @@ def get_configs():
     # Define sample configs
     ff_init_configs = {
         # required parameters
-        "num_gpus": 4,
+        "num_gpus": 8,
         "memory_per_gpu": 20000,
         "zero_copy_memory_per_node": 40000,
         # optional parameters
         "num_cpus": 4,
-        "legion_utility_processors": 4,
+        "legion_utility_processors": 8,
         "data_parallelism_degree": 1,
         "tensor_parallelism_degree": 4,
         "pipeline_parallelism_degree": 1,
@@ -98,7 +102,7 @@ def get_configs():
     }
     llm_configs = {
         # required parameters
-        "llm_model": "meta-llama/Meta-Llama-3.1-8B",
+        "llm_model": "meta-llama/Llama-3.1-8B-Instruct",
         # optional parameters
         "cache_path": os.environ.get("FF_CACHE_PATH", ""),
         "refresh_cache": False,
@@ -139,7 +143,7 @@ async def startup_event():
         generation_config,
         max_requests_per_batch=16,
         max_seq_length=2048,
-        max_tokens_per_batch=64,
+        max_tokens_per_batch=1024,
     )
 
     llm.start_server()
@@ -171,11 +175,12 @@ async def chat_completions(request: ChatCompletionRequest):
     if llm is None:
         raise HTTPException(status_code=503, detail="LLM model is not initialized.")
 
-    if request.messages and request.messages[0].role == 'user':
-        resp_content = "As a mock AI Assitant, I can only echo your last message:" + request.messages[-1].content
-    else:
-        resp_content = "As a mock AI Assitant, I can only echo your last message, but there were no messages!"
-
+    print("received request:", request)
+    result = llm.generate([message.dict() for message in request.messages], max_new_tokens=request.max_new_tokens)[0].output_text.decode('utf-8')
+    print("returning response:", result)
+    return {
+        "response": result
+    }
     return {
         "id": "1337",
         "object": "chat.completion",
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index 151b01b873..4ff8348f46 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -4759,6 +4759,7 @@ def generate(self, requests_list: List[Request]):
                     finetuning_losses=finetuning_losses,
                 )
             )
+        return results
 
     def set_position_offset(self, offset):
         ffc().flexflow_model_set_position_offset(self.handle, offset)
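
For reference, a minimal client-side sketch of the new request/response shape. The endpoint path, the max_new_tokens and messages fields, and the {"response": ...} return value are taken from the diff above; the host/port default and the example messages themselves are assumptions, not part of the change.

    import requests

    # Assumed default address; matches FASTAPI_URL in the updated app.py.
    url = "http://localhost:8000/chat/completions"

    # Payload mirrors the new ChatCompletionRequest model: a token budget plus
    # an OpenAI-style list of {"role", "content"} messages.
    payload = {
        "max_new_tokens": 1024,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello!"},
        ],
    }

    resp = requests.post(url, json=payload)
    resp.raise_for_status()
    # The updated handler returns {"response": <generated text>}.
    print(resp.json()["response"])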