
Commit 3fe6dcd

Chatbot demo for K8s version (#48)
* Chatbot demo for K8s version
AyushSawant18588 authored Jan 4, 2024
1 parent c62f659 commit 3fe6dcd
Showing 9 changed files with 1,138 additions and 3 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/lint.yaml
@@ -25,10 +25,14 @@ jobs:
           python-version: 3.11
 
       - name: Install Python dependencies
-        run: pip install --no-cache-dir pytest black pylint torchserve==0.8.2 torch==2.0.1 transformers==4.33.0 -r llm/requirements.txt
+        run: pip install --no-cache-dir pytest black pylint torchserve==0.8.2 torch==2.0.1 transformers==4.33.0 -r llm/requirements.txt -r demo/requirements.txt
 
       - name: Run pylint
-        run: pylint ./llm
+        run: |
+          pylint ./llm
+          pylint ./demo
       - name: Run black
-        run: black ./llm --check
+        run: |
+          black ./llm --check
+          black ./demo --check
5 changes: 5 additions & 0 deletions demo/.streamlit/config.toml
@@ -0,0 +1,5 @@
[theme]
base="dark"

[ui]
hideTopBar = true
25 changes: 25 additions & 0 deletions demo/README.md
@@ -0,0 +1,25 @@
# Chatbot demo

This is a real-time chatbot demo that talks to the deployed model endpoint over the REST API.

## Install Python requirements

    pip install -r requirements.txt

## Deploy models

Download and deploy the following models on K8s cluster as per instructions provided in the [docs](https://opendocs.nutanix.com/gpt-in-a-box/overview/).

llama2-7b-chat

codellama-7b-python

## Run Chatbot app
>**NOTE:**
> Before deploying the Chatbot app, ensure that you have the necessary prerequisites: **kubectl** installed and a valid **KubeConfig** file for the Kubernetes (K8s) cluster where the large language model (LLM) is deployed. If these prerequisites are not present, follow the steps below:
>* Install [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl).
>* Download and set up KubeConfig by following the steps outlined in [Downloading the Kubeconfig](https://portal.nutanix.com/page/documents/details?targetId=Nutanix-Kubernetes-Engine-v2_5:top-download-kubeconfig-t.html) on the Nutanix Support Portal.

Once the inference server is up, run

    streamlit run chat.py
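
For reference, here is a minimal sketch of the request the app sends to the deployed endpoint. It mirrors the KServe v2 payload built in `chat.py`; the host IP, ingress port, and service hostname below are placeholder values that the app normally discovers with `kubectl` at runtime.

    import requests

    # Placeholder values -- chat.py looks these up with kubectl at runtime.
    host_ip = "10.0.0.10"                                # istio-ingressgateway host IP (example)
    ingress_port = "31080"                               # http2 NodePort of the gateway (example)
    service_hostname = "llm-deploy.default.example.com"  # from the InferenceService URL (example)

    payload = {
        "id": "1",
        "inputs": [
            {"name": "input0", "shape": [-1], "datatype": "BYTES", "data": ["Hello!"]}
        ],
    }
    response = requests.post(
        f"http://{host_ip}:{ingress_port}/v2/models/llama2_7b_chat/infer",
        json=payload,
        headers={"Host": service_hostname},
        timeout=120,
    )
    print(response.json()["outputs"][0]["data"][0])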
765 changes: 765 additions & 0 deletions demo/assistant.svg
299 changes: 299 additions & 0 deletions demo/chat.py
@@ -0,0 +1,299 @@
"""
GPT-in-a-Box Streamlit App
This module defines a Streamlit app for interacting with different Large Language models.
"""

import os
import json
import sys
import subprocess
import requests
import streamlit as st

# Add supported models to the list
AVAILABLE_MODELS = ["llama2-7b-chat", "codellama-7b-python"]
# AVAILABLE_MODELS = ["llama2-7b", "mpt-7b" , "falcon-7b"]
ASSISTANT_SVG = "assistant.svg"
USER_SVG = "user.svg"
LOGO_SVG = "nutanix.svg"

LLM_MODE = "chat"
LLM_HISTORY = "off"

if not os.path.exists(ASSISTANT_SVG):
    ASSISTANT_AVATAR = None
else:
    ASSISTANT_AVATAR = ASSISTANT_SVG

if not os.path.exists(USER_SVG):
    USER_AVATAR = None
else:
    USER_AVATAR = USER_SVG

# App title
st.title("Hola Nutanix")


def clear_chat_history():
    """
    Clears the chat history by resetting the session state messages.
    """
    st.session_state.messages = [
        {"role": "assistant", "content": "How may I assist you today?"}
    ]


with st.sidebar:
    if os.path.exists(LOGO_SVG):
        _, col2, _, _ = st.columns(4)
        with col2:
            st.image(LOGO_SVG, width=150)

    st.title("GPT-in-a-Box")
    st.markdown(
        "GPT-in-a-Box is a turnkey AI solution for organizations wanting to implement GPT "
        "capabilities while maintaining control of their data and applications. Read the "
        "[announcement]"
        "(https://www.nutanix.com/blog/nutanix-simplifies-your-ai-innovation-learning-curve)"
    )

    st.subheader("Models")
    selected_model = st.sidebar.selectbox(
        "Choose a model", AVAILABLE_MODELS, key="selected_model"
    )
    if selected_model == "llama2-7b":
        LLM = "llama2_7b"
        st.markdown(
            "Llama2 is a state-of-the-art foundational large language model which was "
            "pretrained on publicly available online data sources. This chat model "
            "leverages publicly available instruction datasets and over 1 "
            "million human annotations."
        )
    elif selected_model == "mpt-7b":
        LLM = "mpt_7b"
        st.markdown(
            "MPT-7B is a decoder-style transformer with 6.7B parameters. It was trained "
            "on 1T tokens of text and code that was curated by MosaicML’s data team. "
            "This base model includes FlashAttention for fast training and inference and "
            "ALiBi for finetuning and extrapolation to long context lengths."
        )
    elif selected_model == "falcon-7b":
        LLM = "falcon_7b"
        st.markdown(
            "Falcon-7B is a 7B parameters causal decoder-only model built by TII and "
            "trained on 1,500B tokens of RefinedWeb enhanced with curated corpora."
        )
    elif selected_model == "codellama-7b-python":
        LLM = "codellama_7b_python"
        LLM_MODE = "code"
        st.markdown(
            "Code Llama is a large language model that can use text prompts to generate "
            "and discuss code. It has the potential to make workflows faster and more "
            "efficient for developers and lower the barrier to entry for people who are "
            "learning to code."
        )
    elif selected_model == "llama2-7b-chat":
        LLM = "llama2_7b_chat"
        LLM_HISTORY = "on"
        st.markdown(
            "Llama2 is a state-of-the-art foundational large language model which was "
            "pretrained on publicly available online data sources. This chat model "
            "leverages publicly available instruction datasets and over 1 million "
            "human annotations."
        )
    else:
        sys.exit()

if "model" in st.session_state and st.session_state["model"] != LLM:
    clear_chat_history()

st.session_state["model"] = LLM

# Store LLM generated responses
if "messages" not in st.session_state.keys():
st.session_state.messages = [
{"role": "assistant", "content": "How may I assist you today?"}
]


def add_message(chatmessage):
    """
    Displays a chat message with the appropriate avatar.
    Parameters:
    - chatmessage (dict): A dictionary containing the role ("assistant" or "user")
      and the content of the message.
    """

    if chatmessage["role"] == "assistant":
        avatar = ASSISTANT_AVATAR
    else:
        avatar = USER_AVATAR
    if LLM_MODE == "code":
        with st.chat_message(chatmessage["role"], avatar=avatar):
            st.code(chatmessage["content"], language="python")
    else:
        with st.chat_message(chatmessage["role"], avatar=avatar):
            st.write(chatmessage["content"])


# Display or clear chat messages
for message in st.session_state.messages:
    add_message(message)

st.sidebar.button("Clear Chat History", on_click=clear_chat_history)


def generate_response(input_text):
    """
    Generates a response from the LLM based on the given prompt.
    Parameters:
    - input_text (str): The input prompt for generating a response.
    Returns:
    - str: The generated response.
    """
    try:
        kubectl_command = (
            "kubectl get po -l istio=ingressgateway "
            "-n istio-system -o jsonpath='{.items[0].status.hostIP}'"
        )
        host_ip = subprocess.check_output(
            kubectl_command, shell=True, text=True
        ).strip()

        kubectl_command = (
            "kubectl -n istio-system get service istio-ingressgateway "
            "-o jsonpath='{.spec.ports[?(@.name==\"http2\")].nodePort}'"
        )
        ingress_port = subprocess.check_output(
            kubectl_command, shell=True, text=True
        ).strip()

        kubectl_command = (
            "kubectl get inferenceservice llm-deploy -o jsonpath='{.status.url}'"
        )
        service_url = subprocess.check_output(
            kubectl_command, shell=True, text=True
        ).strip()
        service_hostname = service_url.split("/")[2]
    except subprocess.CalledProcessError:
        print("Inference backend is unavailable.")
        return ""

    input_prompt = get_json_format_prompt(input_text)
    url = f"http://{host_ip}:{ingress_port}/v2/models/{LLM}/infer"
    headers = {
        "Content-Type": "application/json; charset=utf-8",
        "Host": service_hostname,
    }
    try:
        response = requests.post(url, json=input_prompt, timeout=120, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print("Error in requests: ", url)
        return ""
    output_dict = json.loads(response.text)
    output = output_dict["outputs"][0]["data"][0]
    return output
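

# Illustrative note (assumption): generate_response() above expects the standard
# KServe v2 inference response shape, roughly
#   {"id": "1", "model_name": "<model>",
#    "outputs": [{"name": "output0", "shape": [1], "datatype": "BYTES",
#                 "data": ["<generated text>"]}]}
# which is why the generated string is read from outputs[0]["data"][0].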


def generate_chat_response(input_prompt):
    """
    Generates a chat-based response by including the chat history in the input prompt.
    Parameters:
    - input_prompt (str): The user-provided prompt.
    Returns:
    - str: The generated chat-based response.
    """
    # [INST] and <<SYS>> tags are used in the input prompts for LLAMA 2 models.
    # They indicate different types of input within the conversation.
    # "INST" stands for "instruction" and is used to provide user queries to the model.
    # "<<SYS>>" signifies system-related instructions and is used to prime the
    # model with context, instructions, or other information relevant to the use case.

    string_dialogue = (
        "[INST] <<SYS>> You are a helpful assistant. "
        " You answer the question asked by 'User' once"
        " as 'Assistant'. <</SYS>>[/INST]" + "\n\n"
    )

    for dict_message in st.session_state.messages[:-1]:
        if dict_message["role"] == "user":
            string_dialogue += "User: " + dict_message["content"] + "[/INST]" + "\n\n"
        else:
            string_dialogue += (
                "Assistant: " + dict_message["content"] + " [INST]" + "\n\n"
            )
    string_dialogue += "User: " + f"{input_prompt}" + "\n\n"
    input_text = f"{string_dialogue}" + "\n\n" + "Assistant: [/INST]"
    output = generate_response(input_text)
    # Generation failed
    if len(output) <= len(input_text):
        return ""
    response = output[len(input_text) :]
    return response
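

# For illustration, with a chat history of
#   assistant: "How may I assist you today?", user: "Hi", assistant: "Hello!"
# and a new prompt "What is KServe?", the assembled input_text looks roughly like:
#
#   [INST] <<SYS>> You are a helpful assistant. ... <</SYS>>[/INST]
#
#   Assistant: How may I assist you today? [INST]
#
#   User: Hi[/INST]
#
#   Assistant: Hello! [INST]
#
#   User: What is KServe?
#
#   Assistant: [/INST]
#
# The backend is expected to echo the prompt, so generate_chat_response() recovers
# the new reply by slicing off the first len(input_text) characters of the output.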


# User-provided prompt
if prompt := st.chat_input("Ask your query"):
    message = {"role": "user", "content": prompt}
    st.session_state.messages.append(message)
    add_message(message)


def get_json_format_prompt(prompt_input):
    """
    Converts the input prompt into the JSON format expected by the LLM.
    Parameters:
    - prompt_input (str): The input prompt.
    Returns:
    - dict: The prompt in JSON format.
    """
    data = [prompt_input]
    data_dict = {
        "id": "1",
        "inputs": [
            {"name": "input0", "shape": [-1], "datatype": "BYTES", "data": data}
        ],
    }
    return data_dict


# Generate a new response if last message is not from assistant
def add_assistant_response():
    """
    Adds the assistant's response to the chat history and displays
    it to the user.
    """
    if st.session_state.messages[-1]["role"] != "assistant":
        with st.chat_message("assistant", avatar=ASSISTANT_AVATAR):
            with st.spinner("Thinking..."):
                if LLM_HISTORY == "on":
                    response = generate_chat_response(prompt)
                else:
                    response = generate_response(prompt)
                if not response:
                    st.markdown(
                        "<p style='color:red'>Inference backend is unavailable. "
                        "Please verify if the inference server is running</p>",
                        unsafe_allow_html=True,
                    )
                    return
                if LLM_MODE == "code":
                    st.code(response, language="python")
                else:
                    st.write(response)
        chatmessage = {"role": "assistant", "content": response}
        st.session_state.messages.append(chatmessage)


add_assistant_response()
32 changes: 32 additions & 0 deletions demo/nutanix.svg
2 changes: 2 additions & 0 deletions demo/requirements.txt
@@ -0,0 +1,2 @@
streamlit==1.28.1
streamlit-extras==0.3.5
1 change: 1 addition & 0 deletions demo/user.svg
2 changes: 2 additions & 0 deletions llm/utils/marsgen.py
@@ -148,6 +148,8 @@ def generate_mars(
         mar_size_thread.join()
         print(f"## {gen_model.model_name}.mar is generated.\n")
     except subprocess.CalledProcessError as exc:
+        stop_monitoring.set()
+        mar_size_thread.join()
         print("## Creation failed !\n")
         if debug:
             print(f"## {gen_model.model_name} creation failed !, error: {exc}\n")
