diff --git a/daras_ai_v2/asr.py b/daras_ai_v2/asr.py index 0dc03b00a..1ed8e94bf 100644 --- a/daras_ai_v2/asr.py +++ b/daras_ai_v2/asr.py @@ -384,19 +384,15 @@ def run_asr( if result.alternatives ) elif "nemo" in selected_model.name: - r = requests.post( - str(GpuEndpoints.nemo_asr), - json={ - "pipeline": dict( - model_id=asr_model_ids[selected_model], - ), - "inputs": dict( - audio=audio_url, - ), - }, + data = call_celery_task( + "nemo_asr", + pipeline=dict( + model_id=asr_model_ids[selected_model], + ), + inputs=dict( + audio=audio_url, + ), ) - r.raise_for_status() - data = r.json() # check if we should use the fast queue # call one of the self-hosted models else: diff --git a/daras_ai_v2/enum_selector_widget.py b/daras_ai_v2/enum_selector_widget.py index dcbbd586a..3b9fcb905 100644 --- a/daras_ai_v2/enum_selector_widget.py +++ b/daras_ai_v2/enum_selector_widget.py @@ -14,6 +14,12 @@ def enum_multiselect( checkboxes=True, allow_none=True, ): + try: + deprecated = enum_cls._deprecated() + except AttributeError: + deprecated = set() + enums = [e for e in enum_cls if not e in deprecated] + if checkboxes: if label: st.write(label) @@ -31,13 +37,13 @@ def render(e): else: selected.discard(e.name) - grid_layout(2, enum_cls, render, separator=False) + grid_layout(2, enums, render, separator=False) st.session_state[key] = list(selected) return selected else: return st.multiselect( - options=[e.name for e in enum_cls], + options=[e.name for e in enums], format_func=lambda k: enum_cls[k].value, label=label, key=key, @@ -53,8 +59,13 @@ def enum_selector( exclude: list[E] | None = None, **kwargs, ) -> str: + try: + deprecated = enum_cls._deprecated() + except AttributeError: + deprecated = set() + enums = [e for e in enum_cls if not e in deprecated] label = label or enum_cls.__name__ - options = [e.name for e in enum_cls] + options = [e.name for e in enums] if exclude: options = [o for o in options if o not in exclude] if allow_none: diff --git a/daras_ai_v2/face_restoration.py b/daras_ai_v2/face_restoration.py index 09d6cfa00..cfd2d1ec0 100644 --- a/daras_ai_v2/face_restoration.py +++ b/daras_ai_v2/face_restoration.py @@ -75,11 +75,10 @@ def gfpgan(img: str, scale: int = 1) -> bytes: elif scale != 2: scale *= 2 - return call_gpu_server_b64( - endpoint=GpuEndpoints.gfpgan, - input_data={ - "img": img, - "version": "v1.4", - "scale": scale, - }, - )[0] + # https://replicate.com/nightmareai/real-esrgan/versions/42fed1c4974146d4d2414e2be2c5277c7fcf05fcc3a73abf41610695738c1d7b#output-schema + model = replicate.models.get("tencentarc/gfpgan") + version = model.versions.get( + "9283608cc6b7be6b65a8e44983db012355fde4132009bf99d976b2f0896856a3" + ) + img = version.predict(img=img, version="v1.4", scale=scale) + return requests.get(img).content diff --git a/daras_ai_v2/gpu_server.py b/daras_ai_v2/gpu_server.py index d56f2a9d0..d4c47a139 100644 --- a/daras_ai_v2/gpu_server.py +++ b/daras_ai_v2/gpu_server.py @@ -11,34 +11,6 @@ class GpuEndpoints: - wav2lip = settings.GPU_SERVER_1.copy().set(port=5001) - glid_3_xl_stable = settings.GPU_SERVER_1.copy().set(port=5002) - gfpgan = settings.GPU_SERVER_1.copy().set(port=5003) - dichotomous_image_segmentation = settings.GPU_SERVER_1.copy().set(port=5004) - # flan_t5 = f"{settings.GPU_SERVER_2}:5005" - # runway_ml_inpainting = f"{settings.GPU_SERVER_2}:5006" - u2net = settings.GPU_SERVER_1.copy().set(port=5007) - # deforum_sd = f"{settings.GPU_SERVER_2}:5008" - sd_2 = settings.GPU_SERVER_1.copy().set(port=5011) - # sd_multi = 
settings.GPU_SERVER_1.copy().set(port=5012) - # real_esrgan = settings.GPU_SERVER_1furl().set(port=5013) - # defourm_sd = settings.GPU_SERVER_2.copy().set(port=5014) / "deforum" - - lavis = settings.GPU_SERVER_1.copy().set(port=5015) - vqa = lavis / "vqa" - image_captioning = lavis / "image-captioning" - - _asr = settings.GPU_SERVER_1.copy().set(port=5016) - whisper = _asr / "whisper" - nemo_asr = _asr / "nemo/asr" - - _asr_fast = settings.GPU_SERVER_1.copy().set(port=5019) - whisper_fast = _asr_fast / "whisper" - nemo_asr_fast = _asr_fast / "nemo/asr" - - audio_ldm = settings.GPU_SERVER_1.copy().set(port=5017) / "audio_ldm" - bark = settings.GPU_SERVER_1.copy().set(port=5017) / "bark" - deepfloyd_if = settings.GPU_SERVER_1.copy().set(port=5018) / "deepfloyd_if" diff --git a/daras_ai_v2/image_segmentation.py b/daras_ai_v2/image_segmentation.py index 2db811e88..30a2055bc 100644 --- a/daras_ai_v2/image_segmentation.py +++ b/daras_ai_v2/image_segmentation.py @@ -1,7 +1,10 @@ from enum import Enum -from daras_ai.image_input import bytes_to_cv2_img, cv2_img_to_bytes -from daras_ai_v2.gpu_server import call_gpu_server_b64, GpuEndpoints +import requests + +from daras_ai_v2.gpu_server import ( + call_celery_task_outfile, +) class ImageSegmentationModels(Enum): @@ -10,18 +13,26 @@ class ImageSegmentationModels(Enum): def u2net(input_image: str) -> bytes: - return call_gpu_server_b64( - endpoint=GpuEndpoints.u2net, - input_data={ - "image": input_image, - }, + url = call_celery_task_outfile( + "u2net", + pipeline=dict(model_id="u2net"), + inputs={"images": [input_image]}, + content_type="image/png", + filename="u2net.png", )[0] + r = requests.get(url) + r.raise_for_status() + return r.content def dis(input_image: str) -> bytes: - return call_gpu_server_b64( - endpoint=GpuEndpoints.dichotomous_image_segmentation, - input_data={ - "input_image": input_image, - }, + url = call_celery_task_outfile( + "dis", + pipeline=dict(model_id="isnet-general-use.pth"), + inputs={"images": [input_image]}, + content_type="image/png", + filename="dis.png", )[0] + r = requests.get(url) + r.raise_for_status() + return r.content diff --git a/daras_ai_v2/img_model_settings_widgets.py b/daras_ai_v2/img_model_settings_widgets.py index c17bc5e2a..fea120051 100644 --- a/daras_ai_v2/img_model_settings_widgets.py +++ b/daras_ai_v2/img_model_settings_widgets.py @@ -81,7 +81,7 @@ def model_selector( col1, col2 = st.columns(2) with col1: selected_model = enum_selector( - Img2ImgModels, + models_enum, label=""" ### 🤖 Choose your preferred AI Model """, @@ -101,7 +101,7 @@ def model_selector( ): if "selected_controlnet_model" in st.session_state: st.session_state["selected_controlnet_model"] = None - else: + elif models_enum is Img2ImgModels: enum_multiselect( ControlNetModels, label=controlnet_explanation, @@ -109,12 +109,12 @@ def model_selector( checkboxes=False, allow_none=not require_controlnet, ) - with col2: - controlnet_settings( - extra_explanations=extra_explanations, - low_explanation=low_explanation, - high_explanation=high_explanation, - ) + with col2: + controlnet_settings( + extra_explanations=extra_explanations, + low_explanation=low_explanation, + high_explanation=high_explanation, + ) return selected_model @@ -410,7 +410,7 @@ def prompt_strength_setting(selected_model: str = None): def negative_prompt_setting(selected_model: str = None): - if selected_model in [Text2ImgModels.dall_e.name, InpaintingModels.runway_ml.name]: + if selected_model in [Text2ImgModels.dall_e.name]: return st.text_area( diff --git 
a/daras_ai_v2/language_model.py b/daras_ai_v2/language_model.py index 408e6bd51..754fc0c01 100644 --- a/daras_ai_v2/language_model.py +++ b/daras_ai_v2/language_model.py @@ -54,6 +54,10 @@ class LargeLanguageModels(Enum): code_davinci_002 = "Codex [Deprecated] (openai)" + @classmethod + def _deprecated(cls): + return {cls.code_davinci_002} + def is_chat_model(self) -> bool: return self in [ LargeLanguageModels.gpt_4, diff --git a/daras_ai_v2/settings.py b/daras_ai_v2/settings.py index 5046ee5f5..48df72c54 100644 --- a/daras_ai_v2/settings.py +++ b/daras_ai_v2/settings.py @@ -226,7 +226,6 @@ EXPLORE_URL = furl(APP_BASE_URL).add(path="explore").url GPU_SERVER_1 = furl(config("GPU_SERVER_1", "http://gpu-1.gooey.ai")) -GPU_SERVER_2 = furl(config("GPU_SERVER_2", "http://gpu-2.gooey.ai")) SERPER_API_KEY = config("SERPER_API_KEY", None) @@ -285,3 +284,5 @@ AZURE_FORM_RECOGNIZER_KEY = config("AZURE_FORM_RECOGNIZER_KEY", "") DEEPGRAM_API_KEY = config("DEEPGRAM_API_KEY", "") + +ELEVEN_LABS_API_KEY = config("ELEVEN_LABS_API_KEY", "") diff --git a/daras_ai_v2/stable_diffusion.py b/daras_ai_v2/stable_diffusion.py index e5bfdb250..a78e962bd 100644 --- a/daras_ai_v2/stable_diffusion.py +++ b/daras_ai_v2/stable_diffusion.py @@ -2,7 +2,6 @@ import typing from enum import Enum -import replicate import requests from PIL import Image from django.db import models @@ -16,8 +15,6 @@ ) from daras_ai_v2.extract_face import rgb_img_to_rgba from daras_ai_v2.gpu_server import ( - call_gpu_server_b64, - GpuEndpoints, b64_img_decode, call_sd_multi, ) @@ -28,9 +25,20 @@ class InpaintingModels(Enum): sd_2 = "Stable Diffusion v2.1 (stability.ai)" runway_ml = "Stable Diffusion v1.5 (RunwayML)" - jack_qiao = "Stable Diffusion v1.4 (Jack Qiao)" dall_e = "Dall-E (OpenAI)" + jack_qiao = "Stable Diffusion v1.4 [Deprecated] (Jack Qiao)" + + @classmethod + def _deprecated(cls): + return {cls.jack_qiao} + + +inpaint_model_ids = { + InpaintingModels.sd_2: "stabilityai/stable-diffusion-2-inpainting", + InpaintingModels.runway_ml: "runwayml/stable-diffusion-inpainting", +} + class Text2ImgModels(Enum): # sd_1_4 = "SD v1.4 (RunwayML)" # Host this too? 
@@ -42,27 +50,26 @@ class Text2ImgModels(Enum): analog_diffusion = "Analog Diffusion (wavymulder)" protogen_5_3 = "Protogen v5.3 (darkstorm2150)" dreamlike_2 = "Dreamlike Photoreal 2.0 (dreamlike.art)" - rodent_diffusion_1_5 = "Rodent Diffusion 1.5 (NerdyRodent)" - jack_qiao = "Stable Diffusion v1.4 (Jack Qiao)" dall_e = "Dall-E (OpenAI)" - deepfloyd_if = "DeepFloyd IF (stability.ai)" + + jack_qiao = "Stable Diffusion v1.4 [Deprecated] (Jack Qiao)" + deepfloyd_if = "DeepFloyd IF [Deprecated] (stability.ai)" + rodent_diffusion_1_5 = "Rodent Diffusion 1.5 [Deprecated] (NerdyRodent)" + + @classmethod + def _deprecated(cls): + return {cls.jack_qiao, cls.deepfloyd_if, cls.rodent_diffusion_1_5} text2img_model_ids = { - Text2ImgModels.sd_2: "stabilityai/stable-diffusion-2-1", Text2ImgModels.sd_1_5: "runwayml/stable-diffusion-v1-5", + Text2ImgModels.sd_2: "stabilityai/stable-diffusion-2-1", + Text2ImgModels.dream_shaper: "Lykon/DreamShaper", + Text2ImgModels.analog_diffusion: "wavymulder/Analog-Diffusion", Text2ImgModels.openjourney: "prompthero/openjourney", Text2ImgModels.openjourney_2: "prompthero/openjourney-v2", - Text2ImgModels.analog_diffusion: "wavymulder/Analog-Diffusion", - Text2ImgModels.protogen_5_3: "darkstorm2150/Protogen_v5.3_Official_Release", Text2ImgModels.dreamlike_2: "dreamlike-art/dreamlike-photoreal-2.0", - Text2ImgModels.rodent_diffusion_1_5: "devxpy/rodent-diffusion-1-5", - Text2ImgModels.dream_shaper: "Lykon/DreamShaper", - Text2ImgModels.deepfloyd_if: [ - "DeepFloyd/IF-I-XL-v1.0", - "DeepFloyd/IF-II-L-v1.0", - "stabilityai/stable-diffusion-x4-upscaler", - ], + Text2ImgModels.protogen_5_3: "darkstorm2150/Protogen_v5.3_Official_Release", } @@ -77,10 +84,15 @@ class Img2ImgModels(Enum): analog_diffusion = "Analog Diffusion (wavymulder)" protogen_5_3 = "Protogen v5.3 (darkstorm2150)" dreamlike_2 = "Dreamlike Photoreal 2.0 (dreamlike.art)" - rodent_diffusion_1_5 = "Rodent Diffusion 1.5 (NerdyRodent)" - jack_qiao = "Stable Diffusion v1.4 (Jack Qiao)" dall_e = "Dall-E (OpenAI)" + jack_qiao = "Stable Diffusion v1.4 [Deprecated] (Jack Qiao)" + rodent_diffusion_1_5 = "Rodent Diffusion 1.5 [Deprecated] (NerdyRodent)" + + @classmethod + def _deprecated(cls): + return {cls.jack_qiao, cls.rodent_diffusion_1_5} + img2img_model_ids = { Img2ImgModels.sd_2: "stabilityai/stable-diffusion-2-1", @@ -91,7 +103,6 @@ class Img2ImgModels(Enum): Img2ImgModels.analog_diffusion: "wavymulder/Analog-Diffusion", Img2ImgModels.protogen_5_3: "darkstorm2150/Protogen_v5.3_Official_Release", Img2ImgModels.dreamlike_2: "dreamlike-art/dreamlike-photoreal-2.0", - Img2ImgModels.rodent_diffusion_1_5: "devxpy/rodent-diffusion-1-5", } @@ -257,18 +268,6 @@ def text2img( _resolution_check(width, height, max_size=(1024, 1024)) match selected_model: - case Text2ImgModels.jack_qiao.name: - out_imgs = call_gpu_server_b64( - endpoint=GpuEndpoints.glid_3_xl_stable, - input_data={ - "prompt": prompt, - "num_inference_steps": num_inference_steps, - "num_outputs": num_outputs, - "negative_prompt": negative_prompt or "", - "width": width, - "height": height, - }, - ) case Text2ImgModels.dall_e.name: import openai @@ -338,24 +337,6 @@ def img2img( _resolution_check(width, height) match selected_model: - case Img2ImgModels.jack_qiao.name: - out_imgs = call_gpu_server_b64( - endpoint=GpuEndpoints.glid_3_xl_stable, - input_data={ - "prompt": prompt, - "num_inference_steps": num_inference_steps, - "init_image": init_image, - # "edit_image": edit_image, - # "mask": mask, - "num_outputs": num_outputs, - "negative_prompt": 
negative_prompt or "", - # "outpaint": "expand", - "skip_timesteps": int(num_inference_steps * (1 - prompt_strength)), - "width": width, - "height": height, - "seed": seed, - }, - ) case Img2ImgModels.dall_e.name: import openai @@ -475,42 +456,6 @@ def inpainting( _resolution_check(width, height) match selected_model: - case InpaintingModels.sd_2.name: - if num_inference_steps == 110: - num_inference_steps = 100 - out_imgs = call_gpu_server_b64( - endpoint=GpuEndpoints.sd_2, - input_data={ - "prompt": prompt, - "width": width, - "height": height, - "num_outputs": num_outputs, - "num_inference_steps": num_inference_steps, - "edit_image": edit_image, - "mask_image": mask, - "guidance_scale": guidance_scale, - "negative_prompt": negative_prompt or "", - "seed": seed, - }, - ) - case InpaintingModels.runway_ml.name: - model = replicate.models.get("andreasjansson/stable-diffusion-inpainting") - version = model.versions.get( - "8eb2da8345bee796efcd925573f077e36ed5fb4ea3ba240ef70c23cf33f0d848" - ) - out_imgs = [ - requests.get(img).content - for img in version.predict( - prompt=prompt, - image=edit_image, - mask=mask, - invert_mask=True, - num_outputs=num_outputs, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - seed=seed, - ) - ] case InpaintingModels.dall_e.name: import openai @@ -522,31 +467,41 @@ def inpainting( response = openai.Image.create_edit( prompt=prompt, image=image, - mask=None, n=num_outputs, size=f"{edge}x{edge}", response_format="b64_json", ) out_imgs = [b64_img_decode(part["b64_json"]) for part in response["data"]] - case _: - out_imgs = call_gpu_server_b64( - endpoint=GpuEndpoints.glid_3_xl_stable, - input_data={ - "prompt": prompt, - "num_inference_steps": num_inference_steps, - # "init_image": "string", - "edit_image": edit_image, - "mask": mask, - "num_outputs": num_outputs, - "negative_prompt": negative_prompt or "", - # "negative_prompt": "string", - # "outpaint": "expand", - # "skip_timesteps": 0, - "width": width, - "height": height, + + case InpaintingModels.sd_2.name | InpaintingModels.runway_ml.name: + out_imgs_urls = call_sd_multi( + "diffusion.inpaint", + pipeline={ + "model_id": inpaint_model_ids[InpaintingModels[selected_model]], "seed": seed, + # "scheduler": Schedulers[scheduler].label + # if scheduler + # else "UniPCMultistepScheduler", + "disable_safety_checker": True, + }, + inputs={ + "prompt": [prompt], + "negative_prompt": [negative_prompt] if negative_prompt else None, + "num_images_per_prompt": num_outputs, + "num_inference_steps": num_inference_steps, + "guidance_scale": guidance_scale, + "image": [edit_image], + "mask_image": [mask], }, ) + out_imgs = [] + for url in out_imgs_urls: + r = requests.get(url) + r.raise_for_status() + out_imgs.append(r.content) + + case _: + raise ValueError(f"Invalid model {selected_model}") out_imgs = _recomposite_inpainting_outputs(out_imgs, edit_image_bytes, mask_bytes) diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py index 0fc45a609..153422c9a 100644 --- a/daras_ai_v2/text_to_speech_settings_widgets.py +++ b/daras_ai_v2/text_to_speech_settings_widgets.py @@ -22,10 +22,92 @@ class TextToSpeechProviders(Enum): GOOGLE_TTS = "Google Cloud Text-to-Speech" + ELEVEN_LABS = "ElevenLabs" UBERDUCK = "uberduck.ai" BARK = "Bark (suno-ai)" +# Mapping from Eleven Labs Voice Name -> Voice ID +ELEVEN_LABS_VOICES = { + "Rachel": "21m00Tcm4TlvDq8ikWAM", + "Clyde": "2EiwWnXFnvU5JabPnv8n", + "Domi": "AZnzlk1XvdvUeBnXmlld", + "Dave": 
"CYw3kZ02Hs0563khs1Fj", + "Fin": "D38z5RcWu1voky8WS1ja", + "Bella": "EXAVITQu4vr4xnSDxMaL", + "Antoni": "ErXwobaYiN019PkySvjV", + "Thomas": "GBv7mTt0atIp3Br8iCZE", + "Charlie": "IKne3meq5aSn9XLyUdCD", + "Emily": "LcfcDJNUP1GQjkzn1xUU", + "Elli": "MF3mGyEYCl7XYWbV9V6O", + "Callum": "N2lVS1w4EtoT3dr4eOWO", + "Patrick": "ODq5zmih8GrVes37Dizd", + "Harry": "SOYHLrjzK2X1ezoPC6cr", + "Liam": "TX3LPaxmHKxFdv7VOQHJ", + "Dorothy": "ThT5KcBeYPX3keUQqHPh", + "Josh": "TxGEqnHWrfWFTfGW9XjX", + "Arnold": "VR6AewLTigWG4xSOukaG", + "Charlotte": "XB0fDUnXU5powFXDhCwa", + "Matilda": "XrExE9yKIg1WjnnlVkGX", + "Matthew": "Yko7PKHZNXotIFUBG7I9", + "James": "ZQe5CZNOzWyzPSCn5a3c", + "Joseph": "Zlb1dXrM653N07WRdFW3", + "Jeremy": "bVMeCyTHy58xNoL34h3p", + "Michael": "flq6f7yk4E4fJM5XTYuZ", + "Ethan": "g5CIjZEefAph4nQFvHAz", + "Gigi": "jBpfuIE2acCO8z3wKNLl", + "Freya": "jsCqWAovK2LkecY7zXl4", + "Grace": "oWAxZDx7w5VEj9dCyTzz", + "Daniel": "onwK4e9ZLuTAKqWW03F9", + "Serena": "pMsXgVXv3BLzUgSXRplE", + "Adam": "pNInz6obpgDQGcFmaJgB", + "Nicole": "piTKgcLEGmPE4e6mEKli", + "Jessie": "t0jbNlBVZ17f02VDIeMI", + "Ryan": "wViXBPUzp2ZZixB1xQuM", + "Sam": "yoZ06aMxZJJ28mfd3POQ", + "Glinda": "z9fAnlkpzviPz146aGWa", + "Giovanni": "zcAOhNBS3c14rBihAFp1", + "Mimi": "zrHiDhphv9ZnVXBqCLjz", +} + +# Mapping from Model ID -> Title in UI +ELEVEN_LABS_MODELS = { + "eleven_multilingual_v2": "Multilingual V2", + "eleven_monolingual_v1": "English V1 - Low latency English TTS", +} + +ELEVEN_LABS_SUPPORTED_LANGS = [ + "English", + "Chinese", + "Spanish", + "Hindi", + "Portuguese", + "French", + "German", + "Japanese", + "Arabic", + "Korean", + "Indonesian", + "Italian", + "Dutch", + "Turkish", + "Polish", + "Swedish", + "Filipino", + "Malay", + "Romanian", + "Ukrainian", + "Greek", + "Czech", + "Danish", + "Finnish", + "Bulgarian", + "Croatian", + "Slovak", + "Tamil", +] + + BARK_SUPPORTED_LANGS = [ ("English", "en"), ("German", "de"), @@ -142,6 +224,63 @@ def text_to_speech_settings(): key="uberduck_speaking_rate", ) + case TextToSpeechProviders.ELEVEN_LABS.name: + with col2: + st.selectbox( + """ + ###### Voice name (ElevenLabs) + """, + key="elevenlabs_voice_name", + format_func=str, + options=ELEVEN_LABS_VOICES.keys(), + ) + st.selectbox( + """ + ###### Voice Model + """, + key="elevenlabs_model", + format_func=ELEVEN_LABS_MODELS.__getitem__, + options=ELEVEN_LABS_MODELS.keys(), + ) + + col1, col2 = st.columns(2) + with col1: + st.slider( + """ + ###### Stability + *A lower stability provides a broader emotional range. + A value lower than 0.3 can lead to too much instability. + [Read more](https://docs.elevenlabs.io/speech-synthesis/voice-settings#stability).* + """, + min_value=0, + max_value=1.0, + step=0.05, + key="elevenlabs_stability", + ) + with col2: + st.slider( + """ + ###### Similarity Boost + *Dictates how hard the model should try to replicate the original voice. 
+ [Read more](https://docs.elevenlabs.io/speech-synthesis/voice-settings#similarity).* + """, + min_value=0, + max_value=1.0, + step=0.05, + key="elevenlabs_similarity_boost", + ) + + with st.expander( + "Eleven Labs Supported Languages", + style={"fontSize": "0.9rem", "textDecoration": "underline"}, + ): + st.caption( + "With Multilingual V2 voice model", style={"fontSize": "0.8rem"} + ) + st.caption( + ", ".join(ELEVEN_LABS_SUPPORTED_LANGS), style={"fontSize": "0.8rem"} + ) + @st.cache_data() def google_tts_voices() -> dict[texttospeech.Voice, str]: diff --git a/gooey_ui/components.py b/gooey_ui/components.py index ea3bbedda..f6ea3b6d5 100644 --- a/gooey_ui/components.py +++ b/gooey_ui/components.py @@ -131,7 +131,8 @@ def success(body: str, icon: str = "✅", *, unsafe_allow_html=False): def caption(body: str, **props): - markdown(body, style={"fontSize": "0.9rem"}, className="text-muted", **props) + style = props.setdefault("style", {"fontSize": "0.9rem"}) + markdown(body, className="text-muted", **props) def option_menu(*args, options, **kwargs): @@ -391,12 +392,13 @@ def button( form_submit_button = button -def expander(label: str, *, expanded: bool = False): +def expander(label: str, *, expanded: bool = False, **props): node = state.RenderTreeNode( name="expander", props=dict( label=dedent(label), open=expanded, + **props, ), ) node.mount() diff --git a/gooey_ui/pubsub.py b/gooey_ui/pubsub.py index 34d8ba852..2741c10af 100644 --- a/gooey_ui/pubsub.py +++ b/gooey_ui/pubsub.py @@ -2,7 +2,6 @@ import json import threading import typing -from functools import lru_cache from time import time import redis @@ -10,16 +9,11 @@ from daras_ai_v2 import settings -threadlocal = threading.local() - - -@lru_cache -def get_redis(): - return redis.Redis.from_url(settings.REDIS_URL) - - T = typing.TypeVar("T") +threadlocal = threading.local() +r = redis.Redis.from_url(settings.REDIS_URL) + def realtime_clear_subs(): threadlocal.channels = [] @@ -36,7 +30,6 @@ def get_subscriptions() -> list[str]: def realtime_pull(channels: list[str]) -> list[typing.Any]: channels = [f"gooey-gui/state/{channel}" for channel in channels] threadlocal.channels = channels - r = get_redis() out = [ json.loads(value) if (value := r.get(channel)) else None for channel in channels ] @@ -45,7 +38,6 @@ def realtime_pull(channels: list[str]) -> list[typing.Any]: def realtime_push(channel: str, value: typing.Any = "ping"): channel = f"gooey-gui/state/{channel}" - r = get_redis() msg = json.dumps(jsonable_encoder(value)) r.set(channel, msg) r.publish(channel, json.dumps(time())) diff --git a/recipes/LipsyncTTS.py b/recipes/LipsyncTTS.py index cfd15e2db..d31171596 100644 --- a/recipes/LipsyncTTS.py +++ b/recipes/LipsyncTTS.py @@ -6,7 +6,7 @@ from bots.models import Workflow from recipes.DeforumSD import safety_checker from recipes.Lipsync import LipsyncPage -from recipes.TextToSpeech import TextToSpeechPage +from recipes.TextToSpeech import TextToSpeechPage, TextToSpeechProviders from daras_ai_v2.loom_video_widget import youtube_video DEFAULT_LIPSYNC_TTS_META_IMG = "https://storage.googleapis.com/dara-c1b52.appspot.com/daras_ai/media/assets/lipsync_meta_img.gif" @@ -17,6 +17,13 @@ class LipsyncTTSPage(LipsyncPage, TextToSpeechPage): workflow = Workflow.LIPSYNC_TTS slug_versions = ["LipsyncTTS", "lipsync-maker"] + sane_defaults = { + "elevenlabs_voice_name": "Rachel", + "elevenlabs_model": "eleven_multilingual_v2", + "elevenlabs_stability": 0.5, + "elevenlabs_similarity_boost": 0.75, + } + class RequestModel(BaseModel): input_face: 
str input_audio: str | None @@ -37,6 +44,13 @@ class RequestModel(BaseModel): google_speaking_rate: float | None google_pitch: float | None + bark_history_prompt: str | None + + elevenlabs_voice_name: str | None + elevenlabs_model: str | None + elevenlabs_stability: float | None + elevenlabs_similarity_boost: float | None + class ResponseModel(BaseModel): output_video: str @@ -144,5 +158,26 @@ def render_example(self, state: dict): def render_output(self): self.render_example(st.session_state) + def get_raw_price(self, state: dict): + # _get_tts_provider comes from TextToSpeechPage + if self._get_tts_provider(state) == TextToSpeechProviders.ELEVEN_LABS: + return LipsyncPage.get_raw_price( + self, state + ) + TextToSpeechPage.get_raw_price(self, state) + else: + return LipsyncPage.get_raw_price(self, state) + + def additional_notes(self): + lipsync_notes = LipsyncPage.additional_notes(self) + if tts_notes := TextToSpeechPage.additional_notes(self): + notes = f""" + - *Lipsync* {lipsync_notes.strip()} + - *TTS* {tts_notes.strip()} + """ + else: + notes = lipsync_notes + + return notes + def render_usage_guide(self): youtube_video("RRmwQR-IytI") diff --git a/recipes/ObjectInpainting.py b/recipes/ObjectInpainting.py index 8afd229ca..ff4c81fde 100644 --- a/recipes/ObjectInpainting.py +++ b/recipes/ObjectInpainting.py @@ -19,7 +19,6 @@ ) from daras_ai_v2.loom_video_widget import youtube_video from daras_ai_v2.repositioning import ( - reposition_object, reposition_object_img_bytes, repositioning_preview_widget, ) diff --git a/recipes/Text2Audio.py b/recipes/Text2Audio.py index 54435751f..074462257 100644 --- a/recipes/Text2Audio.py +++ b/recipes/Text2Audio.py @@ -1,16 +1,13 @@ -import datetime import typing from enum import Enum -import requests -import gooey_ui as st from pydantic import BaseModel +import gooey_ui as st from bots.models import Workflow -from daras_ai.image_input import storage_blob_for from daras_ai_v2.base import BasePage from daras_ai_v2.enum_selector_widget import enum_multiselect -from daras_ai_v2.gpu_server import GpuEndpoints +from daras_ai_v2.gpu_server import call_celery_task_outfile from daras_ai_v2.img_model_settings_widgets import ( negative_prompt_setting, guidance_scale_setting, @@ -103,44 +100,31 @@ def run(self, state: dict) -> typing.Iterator[str | None]: state["output_audios"] = output_audios = {} for selected_model in request.selected_models: - yield f"Running {Text2AudioModels[selected_model].value}..." - - blobs = [ - storage_blob_for(f"gooey.ai - {request.text_prompt} ({i + 1}).wav") - for i in range(request.num_outputs) - ] - r = requests.post( - str(GpuEndpoints.audio_ldm), - json={ - "pipeline": { - "model_id": "cvssp/audioldm", - "upload_urls": [ - blob.generate_signed_url( - version="v4", - # This URL is valid for 15 minutes - expiration=datetime.timedelta(minutes=30), - # Allow PUT requests using this URL. - method="PUT", - content_type="audio/wav", - ) - for blob in blobs - ], - "seed": request.seed, - }, - "inputs": { - "prompt": [request.text_prompt], - "negative_prompt": [request.negative_prompt] - if request.negative_prompt - else None, - "num_waveforms_per_prompt": request.num_outputs, - "num_inference_steps": request.quality, - "guidance_scale": request.guidance_scale, - "audio_length_in_s": request.duration_sec, - }, - }, + model = Text2AudioModels[selected_model] + model_id = text2audio_model_ids[model] + + yield f"Running {model.value}..." 
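+            # call_celery_task_outfile() queues the task on the GPU worker and
+            # returns the public URLs of the uploaded output files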
+ + output_audios[selected_model] = call_celery_task_outfile( + "audio_ldm", + pipeline=dict( + model_id=model_id, + seed=request.seed, + ), + inputs=dict( + prompt=[request.text_prompt], + negative_prompt=[request.negative_prompt] + if request.negative_prompt + else None, + num_waveforms_per_prompt=request.num_outputs, + num_inference_steps=request.quality, + guidance_scale=request.guidance_scale, + audio_length_in_s=request.duration_sec, + ), + filename=f"gooey.ai - {request.text_prompt}.wav", + content_type="audio/wav", + num_outputs=request.num_outputs, ) - r.raise_for_status() - output_audios[selected_model] = [blob.public_url for blob in blobs] def render_output(self): _render_output(st.session_state) diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py index a6a85582e..778aba1d4 100644 --- a/recipes/TextToSpeech.py +++ b/recipes/TextToSpeech.py @@ -12,10 +12,12 @@ from daras_ai.image_input import upload_file_from_bytes, storage_blob_for from daras_ai_v2 import settings from daras_ai_v2.base import BasePage -from daras_ai_v2.gpu_server import GpuEndpoints +from daras_ai_v2.gpu_server import GpuEndpoints, call_celery_task_outfile from daras_ai_v2.loom_video_widget import youtube_video from daras_ai_v2.text_to_speech_settings_widgets import ( UBERDUCK_VOICES, + ELEVEN_LABS_VOICES, + ELEVEN_LABS_MODELS, text_to_speech_settings, TextToSpeechProviders, ) @@ -40,6 +42,10 @@ class TextToSpeechPage(BasePage): "google_speaking_rate": 1.0, "uberduck_voice_name": "Aiden Botha", "uberduck_speaking_rate": 1.0, + "elevenlabs_voice_name": "Rachel", + "elevenlabs_model": "eleven_multilingual_v2", + "elevenlabs_stability": 0.5, + "elevenlabs_similarity_boost": 0.75, } class RequestModel(BaseModel): @@ -58,6 +64,11 @@ class RequestModel(BaseModel): bark_history_prompt: str | None + elevenlabs_voice_name: str | None + elevenlabs_model: str | None + elevenlabs_stability: float | None + elevenlabs_similarity_boost: float | None + class ResponseModel(BaseModel): audio_url: str @@ -95,6 +106,14 @@ def validate_form_v2(self): def render_settings(self): text_to_speech_settings() + def get_raw_price(self, state: dict): + tts_provider = self._get_tts_provider(state) + match tts_provider: + case TextToSpeechProviders.ELEVEN_LABS: + return self._get_eleven_labs_price(state) + case _: + return super().get_raw_price(state) + def render_usage_guide(self): youtube_video("aD4N-g9qqhc") # loom_video("2d853b7442874b9cbbf3f27b98594add") @@ -107,41 +126,43 @@ def render_output(self): else: st.div() + def _get_eleven_labs_price(self, state: dict): + text = state.get("text_prompt", "") + # 0.079 credits / character ~ 4 credits / 10 words + return len(text) * 0.079 + + def _get_tts_provider(self, state: dict): + tts_provider = state.get("tts_provider", TextToSpeechProviders.UBERDUCK.name) + # TODO: validate tts_provider before lookup? + return TextToSpeechProviders[tts_provider] + + def additional_notes(self): + tts_provider = st.session_state.get("tts_provider") + if tts_provider == TextToSpeechProviders.ELEVEN_LABS.name: + return """ + *Eleven Labs cost ≈ 4 credits per 10 words* + """ + else: + return "" + def run(self, state: dict): text = state["text_prompt"].strip() - tts_provider = ( - state["tts_provider"] - if "tts_provider" in state - else TextToSpeechProviders.UBERDUCK.name - ) - provider = TextToSpeechProviders[tts_provider] + provider = self._get_tts_provider(state) yield f"Generating audio using {provider.value} ..." 
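+        # each provider branch below generates the audio and stores the
+        # resulting file's public URL in state["audio_url"]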
match provider: case TextToSpeechProviders.BARK: - blob = storage_blob_for(f"bark_tts.wav") - r = requests.post( - str(GpuEndpoints.bark), - json={ - "pipeline": dict( - upload_urls=[ - blob.generate_signed_url( - version="v4", - # This URL is valid for 15 minutes - expiration=datetime.timedelta(minutes=30), - # Allow PUT requests using this URL. - method="PUT", - content_type="audio/wav", - ), - ], - ), - "inputs": dict( - prompt=text.split("---"), - # history_prompt=history_prompt, - ), - }, - ) - r.raise_for_status() - state["audio_url"] = blob.public_url + state["audio_url"] = call_celery_task_outfile( + "bark", + pipeline=dict( + model_id="bark", + ), + inputs=dict( + prompt=text.split("---"), + # history_prompt=history_prompt, + ), + filename="bark_tts.wav", + content_type="audio/wav", + )[0] case TextToSpeechProviders.UBERDUCK: voicemodel_uuid = ( @@ -217,6 +238,47 @@ def run(self, state: dict): f"google_tts_gen.mp3", response.audio_content ) + case TextToSpeechProviders.ELEVEN_LABS: + # default to first in the mapping + default_voice_model = next(iter(ELEVEN_LABS_MODELS)) + default_voice_name = next(iter(ELEVEN_LABS_VOICES)) + + voice_model = state.get("elevenlabs_model", default_voice_model) + voice_name = state.get("elevenlabs_voice_name", default_voice_name) + + # validate voice_model / voice_name + if voice_model not in ELEVEN_LABS_MODELS: + raise ValueError(f"Invalid model: {voice_model}") + if voice_name not in ELEVEN_LABS_VOICES: + raise ValueError(f"Invalid voice_name: {voice_name}") + else: + voice_id = ELEVEN_LABS_VOICES[voice_name] + + stability = state.get("elevenlabs_stability", 0.5) + similarity_boost = state.get("elevenlabs_similarity_boost", 0.75) + + response = requests.post( + f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}", + headers={ + "xi-api-key": settings.ELEVEN_LABS_API_KEY, + "Accept": "audio/mpeg", + }, + json={ + "text": text, + "model_id": voice_model, + "voice_settings": { + "stability": stability, + "similarity_boost": similarity_boost, + }, + }, + ) + response.raise_for_status() + + yield "Uploading Audio file..." 
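+                # the ElevenLabs response body is the raw MP3; upload it and
+                # keep its public URL in the response state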
+ state["audio_url"] = upload_file_from_bytes( + "elevenlabs_gen.mp3", response.content + ) + def related_workflows(self) -> list: from recipes.VideoBots import VideoBotsPage from recipes.LipsyncTTS import LipsyncTTSPage diff --git a/recipes/VideoBots.py b/recipes/VideoBots.py index d5f405ed6..98682aef6 100644 --- a/recipes/VideoBots.py +++ b/recipes/VideoBots.py @@ -162,6 +162,10 @@ class VideoBotsPage(BasePage): "google_speaking_rate": 1.0, "uberduck_voice_name": "Aiden Botha", "uberduck_speaking_rate": 1.0, + "elevenlabs_voice_name": "Rachel", + "elevenlabs_model": "eleven_multilingual_v2", + "elevenlabs_stability": 0.5, + "elevenlabs_similarity_boost": 0.75, # gpt3 "selected_model": LargeLanguageModels.text_davinci_003.name, "avoid_repetition": True, @@ -200,6 +204,11 @@ class RequestModel(BaseModel): google_voice_name: str | None google_speaking_rate: float | None google_pitch: float | None + bark_history_prompt: str | None + elevenlabs_voice_name: str | None + elevenlabs_model: str | None + elevenlabs_stability: float | None + elevenlabs_similarity_boost: float | None # llm settings selected_model: typing.Literal[ @@ -546,6 +555,30 @@ def render_steps(self): st.write(f"**Generated Audio {idx + 1}**") st.audio(audio_url) + def get_raw_price(self, state: dict): + match state.get("tts_provider"): + case TextToSpeechProviders.ELEVEN_LABS.name: + output_text_list = state.get( + "raw_tts_text", state.get("raw_output_text", []) + ) + tts_state = {"text_prompt": "".join(output_text_list)} + return super().get_raw_price(state) + TextToSpeechPage().get_raw_price( + tts_state + ) + case _: + return super().get_raw_price(state) + + def additional_notes(self): + tts_provider = st.session_state.get("tts_provider") + match tts_provider: + case TextToSpeechProviders.ELEVEN_LABS.name: + return f""" + - *Base cost = {super().get_raw_price(st.session_state)} credits* + - *Additional Eleven Labs cost ≈ 4 credits per 10 words of the output* + """ + case _: + return "" + def run(self, state: dict) -> typing.Iterator[str | None]: request: VideoBotsPage.RequestModel = self.RequestModel.parse_obj(state) diff --git a/recipes/embeddings_page.py b/recipes/embeddings_page.py index 17af91e98..f2474f7ea 100644 --- a/recipes/embeddings_page.py +++ b/recipes/embeddings_page.py @@ -27,6 +27,14 @@ class EmbeddingModels(models.TextChoices): "Multilingual E5 Large (Liang Wang)", "intfloat/multilingual-e5-large", ) + gte_large = ( + "General Text Embeddings Large (Dingkun Long)", + "thenlper/gte-large", + ) + gte_base = ( + "General Text Embeddings Base (Dingkun Long)", + "thenlper/gte-base", + ) class EmbeddingsPage(BasePage):