Commit fca3d95

update

goliaro committed Nov 8, 2024
1 parent 39e47a5 commit fca3d95
Showing 3 changed files with 26 additions and 27 deletions.
21 changes: 7 additions & 14 deletions inference/python/streamlit/app.py
@@ -8,7 +8,7 @@
st.set_page_config(page_title="🚀💻 FlexLLM Server", layout="wide")

# FastAPI server URL
FASTAPI_URL = "http://localhost:8000/generate/" # Adjust the port if necessary
FASTAPI_URL = "http://localhost:8000/chat/completions" # Adjust the port if necessary
FINETUNE_URL = "http://localhost:8000/finetuning"

# Initialize session state variables
@@ -30,18 +30,11 @@ def clear_chat_history():
st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}]

# Function for generating LLaMA2 response
-def generate_llama2_response(prompt_input):
-string_dialogue = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
-for dict_message in st.session_state.messages:
-if dict_message["role"] == "user":
-string_dialogue += "User: " + dict_message["content"] + "\n\n"
-else:
-string_dialogue += "Assistant: " + dict_message["content"] + "\n\n"
-
-full_prompt = f"{string_dialogue} {prompt_input} Assistant: "
+def generate_llama3_response(prompt_input):
+system_prompt="You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature."

# Send request to FastAPI server
response = requests.post(FASTAPI_URL, json={"prompt": full_prompt})
response = requests.post(FASTAPI_URL, json={"max_new_tokens": 1024, "messages": [{"role": "system", "content": system_prompt}] + st.session_state.messages + [{"role": "user", "content": prompt_input}]})

if response.status_code == 200:
return response.json()["response"]
@@ -58,7 +51,7 @@ def generate_llama2_response(prompt_input):
st.sidebar.button('Clear Chat History', on_click=clear_chat_history)

st.subheader('Generation parameters')
-max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=4096, value=2048, step=8)
+max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=2048, value=1024, step=8)
# selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model')
decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method')
temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)')
@@ -181,8 +174,8 @@ def generate_llama2_response(prompt_input):
# Generate a new response if last message is not from assistant
if st.session_state.messages[-1]["role"] != "assistant":
with st.chat_message("assistant"):
with st.spinner("Thinking..."):
response = generate_llama2_response(prompt)
with st.spinner("Running..."):
response = generate_llama3_response(prompt)
placeholder = st.empty()
full_response = ''
for item in response:
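For reference, here is a minimal standalone client sketch for the renamed endpoint, mirroring the payload that generate_llama3_response() now builds (the prompts are placeholders, and the fastapi_incr.py server below is assumed to be running on localhost:8000):

    import requests

    FASTAPI_URL = "http://localhost:8000/chat/completions"

    payload = {
        "max_new_tokens": 1024,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is FlexLLM?"},
        ],
    }

    # The server replies with a JSON body containing a "response" field,
    # which is what the Streamlit app reads above.
    response = requests.post(FASTAPI_URL, json=payload)
    if response.status_code == 200:
        print(response.json()["response"])
    else:
        print(f"Request failed with status {response.status_code}")
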
31 changes: 18 additions & 13 deletions inference/python/streamlit/fastapi_incr.py
@@ -46,12 +46,16 @@ class Message(BaseModel):
content: str


+# class ChatCompletionRequest(BaseModel):
+# model: Optional[str] = "mock-gpt-model"
+# messages: List[Message]
+# max_tokens: Optional[int] = 512
+# temperature: Optional[float] = 0.1
+# stream: Optional[bool] = False
+
class ChatCompletionRequest(BaseModel):
-model: Optional[str] = "mock-gpt-model"
+max_new_tokens: Optional[int] = 1024
messages: List[Message]
-max_tokens: Optional[int] = 512
-temperature: Optional[float] = 0.1
-stream: Optional[bool] = False

# Global variable to store the LLM model
llm = None
@@ -76,12 +80,12 @@ def get_configs():
# Define sample configs
ff_init_configs = {
# required parameters
"num_gpus": 4,
"num_gpus": 8,
"memory_per_gpu": 20000,
"zero_copy_memory_per_node": 40000,
# optional parameters
"num_cpus": 4,
"legion_utility_processors": 4,
"legion_utility_processors": 8,
"data_parallelism_degree": 1,
"tensor_parallelism_degree": 4,
"pipeline_parallelism_degree": 1,
@@ -98,7 +102,7 @@
}
llm_configs = {
# required parameters
"llm_model": "meta-llama/Meta-Llama-3.1-8B",
"llm_model": "meta-llama/Llama-3.1-8B-Instruct",
# optional parameters
"cache_path": os.environ.get("FF_CACHE_PATH", ""),
"refresh_cache": False,
@@ -139,7 +143,7 @@ async def startup_event():
generation_config,
max_requests_per_batch=16,
max_seq_length=2048,
-max_tokens_per_batch=64,
+max_tokens_per_batch=1024,
)
llm.start_server()

@@ -171,11 +175,12 @@ async def chat_completions(request: ChatCompletionRequest):
if llm is None:
raise HTTPException(status_code=503, detail="LLM model is not initialized.")

-if request.messages and request.messages[0].role == 'user':
-resp_content = "As a mock AI Assitant, I can only echo your last message:" + request.messages[-1].content
-else:
-resp_content = "As a mock AI Assitant, I can only echo your last message, but there were no messages!"
-
+print("received request:", request)
+result = llm.generate([message.dict() for message in request.messages], max_new_tokens=request.max_new_tokens)[0].output_text.decode('utf-8')
+print("returning response:", result)
+return {
+"response": result
+}
return {
"id": "1337",
"object": "chat.completion",
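Pieced together from the hunks above, the updated endpoint now reads roughly as follows. This is a sketch, not the full file: the route decorator, the Message.role field, and the FastAPI app setup are not visible in this diff and are assumed, and the now-unreachable mock return block that remains after the new return is omitted.

    from typing import List, Optional

    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel

    app = FastAPI()
    llm = None  # initialized in the startup event, as in the diff above


    class Message(BaseModel):
        role: str  # assumed; only `content: str` is visible in the hunk
        content: str


    class ChatCompletionRequest(BaseModel):
        max_new_tokens: Optional[int] = 1024
        messages: List[Message]


    @app.post("/chat/completions")  # path assumed from the Streamlit client above
    async def chat_completions(request: ChatCompletionRequest):
        if llm is None:
            raise HTTPException(status_code=503, detail="LLM model is not initialized.")
        # Forward the chat messages to the FlexFlow LLM and return the decoded text.
        result = llm.generate(
            [message.dict() for message in request.messages],
            max_new_tokens=request.max_new_tokens,
        )[0].output_text.decode("utf-8")
        return {"response": result}
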
1 change: 1 addition & 0 deletions python/flexflow/core/flexflow_cffi.py
@@ -4759,6 +4759,7 @@ def generate(self, requests_list: List[Request]):
finetuning_losses=finetuning_losses,
)
)
+return results

def set_position_offset(self, offset):
ffc().flexflow_model_set_position_offset(self.handle, offset)
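The one-line flexflow_cffi.py change matters for the serving path above: generate() built its results list but did not return it, so callers that index into its return value, such as the llm.generate(...)[0] call in fastapi_incr.py, would see None. A caller-side sketch, assuming an initialized FlexFlow LLM object like the server's (the chat message and token count are placeholders; result fields other than output_text are not shown in this diff):

    # With the added `return results`, generate() hands back the list of
    # generation results that the FastAPI endpoint indexes into.
    results = llm.generate(
        [{"role": "user", "content": "Hello!"}],  # placeholder chat message
        max_new_tokens=64,
    )
    print(results[0].output_text.decode("utf-8"))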
