diff --git a/chart/model-values.yaml b/chart/model-values.yaml
index b331dcf..2ba0a4b 100644
--- a/chart/model-values.yaml
+++ b/chart/model-values.yaml
@@ -365,16 +365,29 @@ deployments:
       ESRGAN_MODEL_IDS: |-
         RealESRGAN_x2plus
 
-  - name: "common-llms-sealion"
-    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:5"
+  - name: "common-llms-sealion-v2"
+    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:7"
+    limits_gpu: "30Gi"
     limits:
-      memory: "45Gi"
+      memory: "80Gi" # (220 / 80) * 30
       cpu: "2"
     env:
       IMPORTS: |-
         common.llms
       LLM_MODEL_IDS: |-
-        aisingapore/sea-lion-7b-instruct
+        aisingapore/llama3-8b-cpt-sea-lionv2-instruct
+
+  - name: "common-llms-sarvam-2b"
+    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:7"
+    limits_gpu: "6Gi"
+    limits:
+      memory: "16Gi" # (220 / 80) * 6
+      cpu: "2"
+    env:
+      IMPORTS: |-
+        common.llms
+      LLM_MODEL_IDS: |-
+        sarvamai/sarvam-2b-v0.5
 
 ## Dependencies
 nfs-server-provisioner:
diff --git a/common/llms.py b/common/llms.py
index 133eec7..49ec697 100644
--- a/common/llms.py
+++ b/common/llms.py
@@ -6,6 +6,7 @@
 import transformers
 from pydantic import BaseModel
 from transformers import AutoTokenizer
+from transformers.models.auto.tokenization_auto import get_tokenizer_config
 
 import gooey_gpu
 from celeryconfig import app, setup_queues
@@ -14,10 +15,11 @@
 class PipelineInfo(BaseModel):
     model_id: str
     seed: int = None
+    fallback_chat_template_from: str | None
 
 
 class LLMChatInputs(BaseModel):
-    messages: typing.List[dict]
+    text_inputs: typing.List[dict] | str
     max_new_tokens: int
     stop_strings: typing.Optional[typing.List[str]]
     temperature: float = 1
@@ -33,8 +35,15 @@ class LLMChatOutput(BaseModel):
 @gooey_gpu.endpoint
 def llm_chat(pipeline: PipelineInfo, inputs: LLMChatInputs) -> LLMChatOutput:
     pipe = load_pipe(pipeline.model_id)
-    return pipe(
-        inputs.messages,
+
+    if pipeline.fallback_chat_template_from and not pipe.tokenizer.chat_template:
+        # if the tokenizer does not have a chat template, use the provided fallback
+        config = get_tokenizer_config(pipeline.fallback_chat_template_from)
+        pipe.tokenizer.chat_template = config.get("chat_template")
+
+    # for a list of parameters, see https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
+    ret = pipe(
+        inputs.text_inputs,
         max_new_tokens=inputs.max_new_tokens,
         stop_strings=inputs.stop_strings,
         temperature=inputs.temperature,
@@ -44,17 +53,26 @@
         eos_token_id=pipe.tokenizer.eos_token_id,
     )[0]
 
+    # strip stop strings & eos token from final output
+    for s in (inputs.stop_strings or []) + [pipe.tokenizer.eos_token]:
+        ret["generated_text"] = ret["generated_text"].split(s, 1)[0]
+
+    return ret
+
 
 @lru_cache
-def load_pipe(model_id: str):
+def load_pipe(model_id: str) -> transformers.TextGenerationPipeline:
     print(f"Loading llm model {model_id!r}...")
     # this should return a TextGenerationPipeline
-    pipe = transformers.pipeline(
-        "text-generation",
-        model=model_id,
-        device=gooey_gpu.DEVICE_ID,
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
+    pipe = typing.cast(
+        transformers.TextGenerationPipeline,
+        transformers.pipeline(
+            "text-generation",
+            model=model_id,
+            device=gooey_gpu.DEVICE_ID,
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+        ),
     )
     if not pipe.tokenizer:
         pipe.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
diff --git a/scripts/run-dev.sh b/scripts/run-dev.sh
index f3bd08a..4350b50 100755
--- a/scripts/run-dev.sh
+++ b/scripts/run-dev.sh
@@ -74,7 +74,7 @@ docker run \
     RealESRGAN_x2plus
   "\
   -e LLM_MODEL_IDS="
-    aisingapore/sea-lion-7b-instruct
+    aisingapore/llama3-8b-cpt-sea-lionv2-instruct
   "\
   -e C_FORCE_ROOT=1 \
   -e BROKER_URL=${BROKER_URL:-"amqp://"} \
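
Usage sketch (not part of the diff): a minimal client-side call against the updated llm_chat endpoint, showing the renamed text_inputs field and the new fallback_chat_template_from option. The task name, queue name, broker/backend URLs, the fallback-template model ID, and the way gooey_gpu.endpoint receives its kwargs are all illustrative assumptions here; the actual task registration and routing live in celeryconfig.py and gooey_gpu, so adjust accordingly.

# Hypothetical client-side sketch -- NOT part of this diff.
# Assumptions: the task is registered as "llm_chat", each model is consumed
# from its own queue, and the endpoint accepts pipeline/inputs kwargs as plain
# dicts. Check celeryconfig.py / gooey_gpu for the real names and routing.
import celery

app = celery.Celery(broker="amqp://", backend="rpc://")

result = app.send_task(
    "llm_chat",
    kwargs=dict(
        pipeline=dict(
            # PipelineInfo: borrow a chat template when the model's own
            # tokenizer ships without one (fallback model ID is illustrative)
            model_id="sarvamai/sarvam-2b-v0.5",
            fallback_chat_template_from="meta-llama/Meta-Llama-3-8B-Instruct",
        ),
        inputs=dict(
            # LLMChatInputs.text_inputs accepts chat messages or a raw prompt string
            text_inputs=[{"role": "user", "content": "Write a haiku about GPUs."}],
            max_new_tokens=128,
            stop_strings=["</s>"],
            temperature=0.7,
        ),
    ),
    queue="gooey-gpu/sarvamai/sarvam-2b-v0.5",  # assumed queue naming; adjust to your deployment
)
out = result.get()  # a text-generation result dict, with stop strings / eos already stripped
print(out["generated_text"])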