From fadc9105208855a781ed62dd912cc33950fc26f8 Mon Sep 17 00:00:00 2001 From: Dev Aggarwal Date: Thu, 5 Dec 2024 21:01:39 +0530 Subject: [PATCH] add new whisper model dmatekenya/whisper-large-v3-chichewa --- chart/model-values.yaml | 136 ++++++++++++++++++++++++---------------- common/whisper.py | 22 ++++--- scripts/run-dev.sh | 7 ++- 3 files changed, 101 insertions(+), 64 deletions(-) diff --git a/chart/model-values.yaml b/chart/model-values.yaml index 2ceb3d0..19283ed 100644 --- a/chart/model-values.yaml +++ b/chart/model-values.yaml @@ -36,13 +36,14 @@ controlnetModelIds: &controlnetModelIds |- ioclab/control_v1p_sd15_brightness monster-labs/control_v1p_sd15_qrcode_monster/v2 -commonImg: &commonImg "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:3" +commonImgOld: &commonImgOld "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:3" +commonImg: &commonImg "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:8" retroImg: &retroImg "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:6" deforumImg: &deforumImg "crgooeyprodwestus1.azurecr.io/gooey-gpu-deforum_sd:1" deployments: - name: "common-diffusion-dreamshaper" - image: *commonImg + image: *commonImgOld limits: memory: "35Gi" env: @@ -53,7 +54,7 @@ deployments: SD_MODEL_IDS: |- Lykon/DreamShaper - name: "common-diffusion-stable-diffusion-2-1" - image: *commonImg + image: *commonImgOld limits: memory: "35Gi" env: @@ -64,7 +65,7 @@ deployments: SD_MODEL_IDS: |- stabilityai/stable-diffusion-2-1 - name: "common-diffusion-dreamlike-photoreal-2" - image: *commonImg + image: *commonImgOld limits: memory: "35Gi" env: @@ -75,7 +76,7 @@ deployments: SD_MODEL_IDS: |- dreamlike-art/dreamlike-photoreal-2.0 - name: "common-diffusion-stable-diffusion-v1-5" - image: *commonImg + image: *commonImgOld limits: memory: "35Gi" env: @@ -87,7 +88,7 @@ deployments: runwayml/stable-diffusion-v1-5 - name: "common-diffusion-on-demand" - image: *commonImg + image: *commonImgOld limits: memory: "50Gi" env: @@ -102,7 +103,7 @@ deployments: darkstorm2150/Protogen_v5.3_Official_Release - name: "common-diffusion-inpaint" - image: *commonImg + image: *commonImgOld limits: memory: "20Gi" env: @@ -142,7 +143,7 @@ deployments: epicdream.safetensors - name: "common-whisper-en-short" - image: *commonImg + image: *commonImgOld limits: memory: "20Gi" env: @@ -153,7 +154,7 @@ deployments: openai/whisper-large-v2 - name: "common-whisper-te-short" - image: *commonImg + image: *commonImgOld limits: memory: "20Gi" env: @@ -163,6 +164,69 @@ deployments: WHISPER_MODEL_IDS: |- vasista22/whisper-telugu-large-v2 + - name: "common-whisper-en-te-long" + image: *commonImgOld + limits: + memory: "40Gi" + env: + QUEUE_PREFIX: "gooey-gpu/long" + IMPORTS: |- + common.whisper + WHISPER_MODEL_IDS: |- + openai/whisper-large-v2 + vasista22/whisper-telugu-large-v2 + + - name: "common-whisper-hi-bho-short" + image: *commonImgOld + limits: + memory: "10Gi" + env: + QUEUE_PREFIX: "gooey-gpu/short" + IMPORTS: |- + common.whisper + WHISPER_MODEL_IDS: |- + vasista22/whisper-hindi-large-v2 + Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60 + + - name: "common-whisper-hi-bho-long" + image: *commonImgOld + limits: + memory: "40Gi" + env: + QUEUE_PREFIX: "gooey-gpu/long" + IMPORTS: |- + common.whisper + WHISPER_MODEL_IDS: |- + vasista22/whisper-hindi-large-v2 + Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60 + + - name: "common-whisper-chichewa-short" + image: *commonImg + limits_gpu: "10Gi" + limits: + memory: "28Gi" # (220 / 80) * 10 + env: + QUEUE_PREFIX: "gooey-gpu/short" + IMPORTS: |- + common.whisper + 
WHISPER_MODEL_IDS: |- + dmatekenya/whisper-large-v3-chichewa + WHISPER_TOKENIZER_FROM: openai/whisper-large-v3 + - name: "common-whisper-chichewa-long" + image: *commonImg + autoscaling: + minReplicaCount: 0 + limits_gpu: "10Gi" + limits: + memory: "28Gi" # (220 / 80) * 10 + env: + QUEUE_PREFIX: "gooey-gpu/long" + IMPORTS: |- + common.whisper + WHISPER_MODEL_IDS: |- + dmatekenya/whisper-large-v3-chichewa + WHISPER_TOKENIZER_FROM: openai/whisper-large-v3 + - name: "retro-nemo-asr" image: *retroImg limits: @@ -175,7 +239,7 @@ deployments: https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/stt_hi_conformer_ctc_large_v2.nemo - name: "common-audio-ldm-bark" - image: *commonImg + image: *commonImgOld limits: memory: "20Gi" env: @@ -190,7 +254,7 @@ deployments: bark - name: "common-seamless" - image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:6" + image: *commonImg limits_gpu: "10Gi" limits: memory: "28Gi" # (220 / 80) * 10 @@ -201,7 +265,7 @@ deployments: facebook/seamless-m4t-v2-large - name: "common-diffusion-instruct-pix2pix" - image: *commonImg + image: *commonImgOld limits: memory: "12Gi" env: @@ -211,7 +275,7 @@ deployments: timbrooks/instruct-pix2pix - name: "common-diffusion-upscale" - image: *commonImg + image: *commonImgOld limits: memory: "52Gi" env: @@ -221,7 +285,7 @@ deployments: stabilityai/stable-diffusion-x4-upscaler - name: "common-mms" - image: *commonImg + image: *commonImgOld limits: memory: "25Gi" env: @@ -230,44 +294,8 @@ deployments: MMS_MODEL_IDS: |- facebook/mms-1b-all - - name: "common-whisper-en-te-long" - image: *commonImg - limits: - memory: "40Gi" - env: - QUEUE_PREFIX: "gooey-gpu/long" - IMPORTS: |- - common.whisper - WHISPER_MODEL_IDS: |- - openai/whisper-large-v2 - vasista22/whisper-telugu-large-v2 - - - name: "common-whisper-hi-bho-long" - image: *commonImg - limits: - memory: "40Gi" - env: - QUEUE_PREFIX: "gooey-gpu/long" - IMPORTS: |- - common.whisper - WHISPER_MODEL_IDS: |- - vasista22/whisper-hindi-large-v2 - Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60 - - - name: "common-whisper-hi-bho-short" - image: *commonImg - limits: - memory: "10Gi" - env: - QUEUE_PREFIX: "gooey-gpu/short" - IMPORTS: |- - common.whisper - WHISPER_MODEL_IDS: |- - vasista22/whisper-hindi-large-v2 - Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60 - - name: "common-embeddings-1" - image: *commonImg + image: *commonImgOld autoscaling: queueLength: 20 limits: @@ -344,7 +372,7 @@ deployments: RealESRGAN_x2plus - name: "common-llms-afrollama-v1" - image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:7" + image: *commonImg limits_gpu: "30Gi" limits: memory: "80Gi" # (220 / 80) * 30 @@ -355,7 +383,7 @@ deployments: Jacaranda/AfroLlama_V1 - name: "common-llms-sealion-v2-1" - image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:7" + image: *commonImg limits_gpu: "30Gi" limits: memory: "80Gi" # (220 / 80) * 30 @@ -366,7 +394,7 @@ deployments: aisingapore/llama3-8b-cpt-sea-lionv2.1-instruct - name: "common-llms-sarvam-2b" - image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:7" + image: *commonImg limits_gpu: "6Gi" limits: memory: "16Gi" # (220 / 80) * 6 diff --git a/common/whisper.py b/common/whisper.py index 5df5f4d..81ced96 100644 --- a/common/whisper.py +++ b/common/whisper.py @@ -1,10 +1,12 @@ import os +import typing from functools import lru_cache import numpy as np import requests import torch import transformers +from transformers import WhisperTokenizer import gooey_gpu from api import PipelineInfo, WhisperInputs, AsrOutput @@ -20,12 +22,14 @@ 
def whisper(pipeline: PipelineInfo, inputs: WhisperInputs) -> AsrOutput: kwargs = {} if inputs.return_timestamps: kwargs["return_timestamps"] = True + generate_kwargs = {} if inputs.language: - kwargs["generate_kwargs"] = dict( - forced_decoder_ids=pipe.tokenizer.get_decoder_prompt_ids( - task=inputs.task, language=inputs.language - ) - ) + generate_kwargs["language"] = inputs.language + if inputs.task: + generate_kwargs["task"] = inputs.task + if generate_kwargs: + kwargs["generate_kwargs"] = generate_kwargs + # see https://github.com/huggingface/transformers/issues/24707 old_postprocess = pipe.postprocess if inputs.decoder_kwargs: @@ -58,15 +62,19 @@ def postprocess(model_outputs): @lru_cache -def load_pipe(model_id: str): +def load_pipe(model_id: str) -> transformers.AutomaticSpeechRecognitionPipeline: print(f"Loading asr model {model_id!r}...") + kwargs = {} + if tokenizer_from := os.environ.get("WHISPER_TOKENIZER_FROM"): + kwargs["tokenizer"] = WhisperTokenizer.from_pretrained(tokenizer_from.strip()) pipe = transformers.pipeline( "automatic-speech-recognition", model=model_id, device=gooey_gpu.DEVICE_ID, torch_dtype=torch.float16, + **kwargs, ) - return pipe + return typing.cast(transformers.AutomaticSpeechRecognitionPipeline, pipe) setup_queues( diff --git a/scripts/run-dev.sh b/scripts/run-dev.sh index 4350b50..5bfaa62 100755 --- a/scripts/run-dev.sh +++ b/scripts/run-dev.sh @@ -34,10 +34,11 @@ docker run \ facebook/mms-1b-all "\ -e WHISPER_MODEL_IDS=" - openai/whisper-large-v2 - vasista22/whisper-telugu-large-v2 - vasista22/whisper-hindi-large-v2 + dmatekenya/whisper-large-v3-chichewa " \ + -e WHISPER_TOKENIZER_FROM=" + openai/whisper-large-v3 + "\ -e SD_MODEL_IDS=" stabilityai/stable-diffusion-2-inpainting runwayml/stable-diffusion-inpainting
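
Usage sketch (reviewer note, not part of the patch): the chichewa checkpoint is assumed to ship without its own tokenizer files, so the new WHISPER_TOKENIZER_FROM env var lets load_pipe() borrow the tokenizer from openai/whisper-large-v3, and language/task now travel through generate_kwargs instead of forced_decoder_ids. A minimal standalone reproduction of that path, with "sample.wav" as a hypothetical input file:

import os

import torch
import transformers
from transformers import WhisperTokenizer

# mirror the chart env for the common-whisper-chichewa-* deployments
os.environ.setdefault("WHISPER_TOKENIZER_FROM", "openai/whisper-large-v3")

kwargs = {}
# same override as the patched load_pipe(): borrow a tokenizer when the
# fine-tuned repo does not provide one
if tokenizer_from := os.environ.get("WHISPER_TOKENIZER_FROM"):
    kwargs["tokenizer"] = WhisperTokenizer.from_pretrained(tokenizer_from.strip())

pipe = transformers.pipeline(
    "automatic-speech-recognition",
    model="dmatekenya/whisper-large-v3-chichewa",
    torch_dtype=torch.float16,
    device=0,  # stands in for gooey_gpu.DEVICE_ID in the worker
    **kwargs,
)

# language/task are passed via generate_kwargs rather than forced_decoder_ids
out = pipe(
    "sample.wav",  # hypothetical local audio file
    return_timestamps=True,
    generate_kwargs={"task": "transcribe"},
)
print(out["text"])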