From a2f0265756816d635982bf2f43af090efafcd263 Mon Sep 17 00:00:00 2001
From: Dev Aggarwal
Date: Mon, 9 Oct 2023 15:02:10 +0530
Subject: [PATCH 01/20] share global redis instance

---
 gooey_ui/pubsub.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/gooey_ui/pubsub.py b/gooey_ui/pubsub.py
index 34d8ba852..2741c10af 100644
--- a/gooey_ui/pubsub.py
+++ b/gooey_ui/pubsub.py
@@ -2,7 +2,6 @@
 import json
 import threading
 import typing
-from functools import lru_cache
 from time import time
 
 import redis
@@ -10,16 +9,11 @@
 
 from daras_ai_v2 import settings
 
-threadlocal = threading.local()
-
-
-@lru_cache
-def get_redis():
-    return redis.Redis.from_url(settings.REDIS_URL)
-
-
 T = typing.TypeVar("T")
 
+threadlocal = threading.local()
+r = redis.Redis.from_url(settings.REDIS_URL)
+
 
 def realtime_clear_subs():
     threadlocal.channels = []
@@ -36,7 +30,6 @@ def get_subscriptions() -> list[str]:
 def realtime_pull(channels: list[str]) -> list[typing.Any]:
     channels = [f"gooey-gui/state/{channel}" for channel in channels]
     threadlocal.channels = channels
-    r = get_redis()
     out = [
         json.loads(value) if (value := r.get(channel)) else None for channel in channels
     ]
@@ -45,7 +38,6 @@ def realtime_pull(channels: list[str]) -> list[typing.Any]:
 
 def realtime_push(channel: str, value: typing.Any = "ping"):
     channel = f"gooey-gui/state/{channel}"
-    r = get_redis()
     msg = json.dumps(jsonable_encoder(value))
     r.set(channel, msg)
     r.publish(channel, json.dumps(time()))
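A minimal sketch of the round trip these helpers implement, runnable against a local Redis (the URL below is a stand-in for `settings.REDIS_URL`):

```python
import json
import redis

# redis-py clients are backed by an internal connection pool and are
# thread-safe, which is why one module-level instance can replace the
# @lru_cache getter removed above.
r = redis.Redis.from_url("redis://localhost:6379")  # stand-in for settings.REDIS_URL

channel = "gooey-gui/state/demo"  # same key prefix as the patch
r.set(channel, json.dumps({"status": "running"}))  # realtime_push: store the state...
r.publish(channel, json.dumps(1696843930.0))       # ...then notify any subscribers

value = r.get(channel)                             # realtime_pull: read it back
print(json.loads(value) if value else None)
```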
From 7a9cba566a16ad13d052934c6c80b5523ae93897 Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Tue, 3 Oct 2023 21:01:41 +0530
Subject: [PATCH 02/20] Add ElevenLabs for TTS -- with 5 default voices from VoiceLab

---
 daras_ai_v2/settings.py                |  2 +
 .../text_to_speech_settings_widgets.py | 65 +++++++++++++++++++
 recipes/TextToSpeech.py                | 47 ++++++++++++++
 3 files changed, 114 insertions(+)

diff --git a/daras_ai_v2/settings.py b/daras_ai_v2/settings.py
index fe81fd829..4c762753f 100644
--- a/daras_ai_v2/settings.py
+++ b/daras_ai_v2/settings.py
@@ -284,3 +284,5 @@
 AZURE_FORM_RECOGNIZER_KEY = config("AZURE_FORM_RECOGNIZER_KEY", "")
 
 DEEPGRAM_API_KEY = config("DEEPGRAM_API_KEY", "")
+
+ELEVEN_LABS_API_KEY = config("ELEVEN_LABS_API_KEY", "")

diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py
index 0fc45a609..5fdaa8643 100644
--- a/daras_ai_v2/text_to_speech_settings_widgets.py
+++ b/daras_ai_v2/text_to_speech_settings_widgets.py
@@ -24,6 +24,22 @@ class TextToSpeechProviders(Enum):
     GOOGLE_TTS = "Google Cloud Text-to-Speech"
     UBERDUCK = "uberduck.ai"
     BARK = "Bark (suno-ai)"
+    ELEVEN_LABS = "ElevenLabs"
+
+
+# mapping from title in UI -> voice ID
+ELEVEN_LABS_VOICES = {
+    "Indian Man With A Deep Voice": "syN7Wt0nfLXAqq9LI9R6",
+    "[aaa] Ramakrishnan - INDIAN TEACHER. CEREBRAL AND THOUGHTFUL PRONUNCIATION 🔥": "99XLBDkANYZ0Ww7MKN8d",
+    "[ElevenVoices] Riya - Indian Female Young Adult": "TSh1KthMgVjY3BfYFkwS",
+    "Wise Grandma slow, seductive, pleasant, mature": "dvRCseVM0rT31i8clSVy",
+    "[ElevenVoices] Rahul - Indian Male Young Adult": "prqKmUi0Zo7WBmA81Vy4",
+}
+
+ELEVEN_LABS_MODELS = {
+    "Multilingual V2": "eleven_multilingual_v2",
+    "English V1 - Low latency English TTS": "eleven_monolingual_v1",
+}
 
 
 BARK_SUPPORTED_LANGS = [
@@ -142,6 +158,55 @@ def text_to_speech_settings():
                     key="uberduck_speaking_rate",
                 )
 
+        case TextToSpeechProviders.ELEVEN_LABS.name:
+            with col2:
+                st.selectbox(
+                    """
+                    ###### Voice name (ElevenLabs)
+                    """,
+                    key="elevenlabs_voice_name",
+                    format_func=str,
+                    options=ELEVEN_LABS_VOICES.keys(),
+                )
+
+            col1, col2 = st.columns(2)
+            with col1:
+                st.slider(
+                    """
+                    ###### Stability
+                    *A lower stability provides a broader emotional range.
+                    A value lower than 0.3 can lead to too much instability.
+                    [Read more](https://docs.elevenlabs.io/speech-synthesis/voice-settings#stability).*
+                    """,
+                    min_value=0,
+                    max_value=1.0,
+                    step=0.05,
+                    key="elevenlabs_stability",
+                )
+            with col2:
+                st.slider(
+                    """
+                    ###### Similarity Boost
+                    *Dictates how hard the model should try to replicate the original voice.
+                    [Read more](https://docs.elevenlabs.io/speech-synthesis/voice-settings#similarity).*
+                    """,
+                    min_value=0,
+                    max_value=1.0,
+                    step=0.05,
+                    key="elevenlabs_similarity_boost",
+                )
+
+            col1, _ = st.columns(2)
+            with col1:
+                st.selectbox(
+                    """
+                    ###### Voice Model
+                    """,
+                    key="elevenlabs_model",
+                    options=ELEVEN_LABS_MODELS.keys(),
+                    format_func=str,
+                )
+
 
 @st.cache_data()
 def google_tts_voices() -> dict[texttospeech.Voice, str]:

diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py
index a6a85582e..ea65552d0 100644
--- a/recipes/TextToSpeech.py
+++ b/recipes/TextToSpeech.py
@@ -16,6 +16,8 @@
 from daras_ai_v2.loom_video_widget import youtube_video
 from daras_ai_v2.text_to_speech_settings_widgets import (
     UBERDUCK_VOICES,
+    ELEVEN_LABS_VOICES,
+    ELEVEN_LABS_MODELS,
     text_to_speech_settings,
     TextToSpeechProviders,
 )
@@ -40,6 +42,8 @@ class TextToSpeechPage(BasePage):
         "google_speaking_rate": 1.0,
         "uberduck_voice_name": "Aiden Botha",
         "uberduck_speaking_rate": 1.0,
+        "elevenlabs_stability": 0.5,
+        "elevenlabs_similarity_boost": 0.75,
     }
 
     class RequestModel(BaseModel):
@@ -217,6 +221,49 @@ def run(self, state: dict):
                     f"google_tts_gen.mp3", response.audio_content
                 )
 
+            case TextToSpeechProviders.ELEVEN_LABS:
+                # default to first voice ID in the mapping
+                default_voice_id = next(iter(ELEVEN_LABS_VOICES.values()))
+
+                # default to first model ID in the mapping
+                default_voice_model = next(iter(ELEVEN_LABS_MODELS.values()))
+
+                voice_id = ELEVEN_LABS_VOICES.get(
+                    state.get("elevenlabs_voice_name"),
+                    default_voice_id
+                )
+                voice_model = ELEVEN_LABS_MODELS.get(
+                    state.get("elevenlabs_model"),
+                    default_voice_model,
+                )
+                stability = state.get(
+                    "elevenlabs_stability", 0.5,
+                )
+                similarity_boost = state.get(
+                    "elevenlabs_similarity_boost", 0.75,
+                )
+
+                response = requests.post(
+                    f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
+                    headers={
+                        "xi-api-key": settings.ELEVEN_LABS_API_KEY,
+                        "Accept": "audio/mpeg",
+                    },
+                    json={
+                        "text": text,
+                        "model_id": voice_model,
+                        "voice_settings": {
+                            "stability": stability,
+                            "similarity_boost": similarity_boost,
+                        },
+                    },
+                )
+                response.raise_for_status()
+
+                yield "Uploading Audio file..."
+                state["audio_url"] = upload_file_from_bytes("elevenlabs_gen.mp3", response.content)
+
+
     def related_workflows(self) -> list:
         from recipes.VideoBots import VideoBotsPage
         from recipes.LipsyncTTS import LipsyncTTSPage
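The new provider boils down to a single HTTP call; a standalone sketch of the same request, with only the API key and text as placeholders (endpoint, headers and payload are exactly as wired up in the patch):

```python
import requests

API_KEY = "..."  # settings.ELEVEN_LABS_API_KEY in the app
voice_id = "syN7Wt0nfLXAqq9LI9R6"  # one of the ELEVEN_LABS_VOICES values above

response = requests.post(
    f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
    headers={"xi-api-key": API_KEY, "Accept": "audio/mpeg"},
    json={
        "text": "Hello from Gooey!",
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
    },
)
response.raise_for_status()
with open("elevenlabs_gen.mp3", "wb") as f:
    f.write(response.content)  # raw MP3 bytes -- what the recipe uploads
```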
+ state["audio_url"] = upload_file_from_bytes("elevenlabs_gen.mp3", response.content) + + def related_workflows(self) -> list: from recipes.VideoBots import VideoBotsPage from recipes.LipsyncTTS import LipsyncTTSPage From 3361f725d0a27e440ae9339d7e275de5410a3944 Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Tue, 3 Oct 2023 21:12:24 +0530 Subject: [PATCH 03/20] Reformat with black --- recipes/TextToSpeech.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py index ea65552d0..ac64d8106 100644 --- a/recipes/TextToSpeech.py +++ b/recipes/TextToSpeech.py @@ -229,19 +229,13 @@ def run(self, state: dict): default_voice_model = next(iter(ELEVEN_LABS_MODELS.values())) voice_id = ELEVEN_LABS_VOICES.get( - state.get("elevenlabs_voice_name"), - default_voice_id + state.get("elevenlabs_voice_name"), default_voice_id ) voice_model = ELEVEN_LABS_MODELS.get( - state.get("elevenlabs_model"), - default_voice_model, - ) - stability = state.get( - "elevenlabs_stability", 0.5, - ) - similarity_boost = state.get( - "elevenlabs_similarity_boost", 0.75, + state.get("elevenlabs_model"), default_voice_model ) + stability = state.get("elevenlabs_stability", 0.5) + similarity_boost = state.get("elevenlabs_similarity_boost", 0.75) response = requests.post( f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}", @@ -261,8 +255,9 @@ def run(self, state: dict): response.raise_for_status() yield "Uploading Audio file..." - state["audio_url"] = upload_file_from_bytes("elevenlabs_gen.mp3", response.content) - + state["audio_url"] = upload_file_from_bytes( + "elevenlabs_gen.mp3", response.content + ) def related_workflows(self) -> list: from recipes.VideoBots import VideoBotsPage From c78364a3bb0faa0ccc8490196c53113fb65e16e8 Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Wed, 4 Oct 2023 16:54:05 +0530 Subject: [PATCH 04/20] Implement pricing for Eleven Labs One side effect of this change -- namely of summing up the cost of Lipsync and TTS models in the LipsyncTTS recipe is that now other LipsyncTTS are costlier too. Earlier, LipsyncTTS didn't consider the cost of TTS (which seemed to be a flat 5 credits for all TTS models). This change will need a pricing review and might be modified to only do this summing up for ElevenLabs TTS. 
From c78364a3bb0faa0ccc8490196c53113fb65e16e8 Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Wed, 4 Oct 2023 16:54:05 +0530
Subject: [PATCH 04/20] Implement pricing for Eleven Labs

One side effect of this change -- namely, of summing up the costs of the
Lipsync and TTS models in the LipsyncTTS recipe -- is that all LipsyncTTS
runs are now costlier. Earlier, LipsyncTTS didn't consider the cost of TTS
(which seemed to be a flat 5 credits for all TTS models). This change will
need a pricing review and might be modified to only do this summing up for
ElevenLabs TTS.
---
 recipes/LipsyncTTS.py   | 19 +++++++++++++++++++
 recipes/TextToSpeech.py | 41 +++++++++++++++++++++++++++++++++++------
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/recipes/LipsyncTTS.py b/recipes/LipsyncTTS.py
index cfd15e2db..bb43c39b9 100644
--- a/recipes/LipsyncTTS.py
+++ b/recipes/LipsyncTTS.py
@@ -37,6 +37,11 @@ class RequestModel(BaseModel):
         google_speaking_rate: float | None
         google_pitch: float | None
 
+        elevenlabs_voice_name: str | None
+        elevenlabs_model: str | None
+        elevenlabs_stability: float | None
+        elevenlabs_similarity_boost: float | None
+
     class ResponseModel(BaseModel):
         output_video: str
 
@@ -144,5 +149,19 @@ def render_example(self, state: dict):
     def render_output(self):
         self.render_example(st.session_state)
 
+    def get_raw_price(self, state: dict):
+        return LipsyncPage.get_raw_price(self, state) + \
+            TextToSpeechPage.get_raw_price(self, state)
+
+    def additional_notes(self):
+        notes = f"""
+*Cost = Lipsync Cost + TTS Cost*
+"""
+        if lipsync_notes := LipsyncPage.additional_notes(self):
+            notes += "\n" + f"*Lipsync* {lipsync_notes}"
+        if tts_notes := TextToSpeechPage.additional_notes(self):
+            notes += "\n" + f"*TTS* {tts_notes}"
+        return notes
+
     def render_usage_guide(self):
         youtube_video("RRmwQR-IytI")

diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py
index ac64d8106..592f2ed7f 100644
--- a/recipes/TextToSpeech.py
+++ b/recipes/TextToSpeech.py
@@ -62,6 +62,11 @@ class RequestModel(BaseModel):
 
         bark_history_prompt: str | None
 
+        elevenlabs_voice_name: str | None
+        elevenlabs_model: str | None
+        elevenlabs_stability: float | None
+        elevenlabs_similarity_boost: float | None
+
     class ResponseModel(BaseModel):
         audio_url: str
 
@@ -99,6 +104,14 @@ def validate_form_v2(self):
     def render_settings(self):
         text_to_speech_settings()
 
+    def get_raw_price(self, state: dict):
+        tts_provider = self._get_tts_provider(state)
+        match tts_provider:
+            case TextToSpeechProviders.ELEVEN_LABS:
+                return self._get_eleven_labs_price(state)
+            case _:
+                return super().get_raw_price(state)
+
     def render_usage_guide(self):
         youtube_video("aD4N-g9qqhc")
         # loom_video("2d853b7442874b9cbbf3f27b98594add")
@@ -111,14 +124,30 @@ def render_output(self):
         else:
             st.div()
 
+    def _get_eleven_labs_price(self, state: dict):
+        text = state.get("text_prompt", "")
+        # 4 credits for 10 words ~ 50 chars
+        return (len(text) / 50) * 4
+
+    def _get_tts_provider(self, state: dict):
+        tts_provider = state.get(
+            "tts_provider", TextToSpeechProviders.UBERDUCK.name
+        )
+        # TODO: validate tts_provider before state lookup
+        return TextToSpeechProviders[tts_provider]
+
+    def additional_notes(self):
+        tts_provider = st.session_state.get("tts_provider")
+        if tts_provider == TextToSpeechProviders.ELEVEN_LABS.name:
+            return """
+                *Eleven Labs cost ≈ 4 credits for every 50 characters ≈ 0.4 credits per word*
+            """
+        else:
+            return ""
+
     def run(self, state: dict):
         text = state["text_prompt"].strip()
-        tts_provider = (
-            state["tts_provider"]
-            if "tts_provider" in state
-            else TextToSpeechProviders.UBERDUCK.name
-        )
-        provider = TextToSpeechProviders[tts_provider]
+        provider = self._get_tts_provider(state)
 
         yield f"Generating audio using {provider.value} ..."
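The `LipsyncPage.get_raw_price(self, state)` spelling above is deliberate: `LipsyncTTSPage` inherits from both pages, and `super()` would only reach the first implementation in the MRO. A toy sketch of why the explicit, unbound calls are needed (class names here are illustrative, not the real pages):

```python
class Lipsync:
    def get_raw_price(self, state: dict) -> float:
        return 5.0  # pretend lipsync cost

class TTS:
    def get_raw_price(self, state: dict) -> float:
        return 2.0  # pretend TTS cost

class LipsyncTTS(Lipsync, TTS):
    def get_raw_price(self, state: dict) -> float:
        # super().get_raw_price(state) would resolve to Lipsync only
        # (MRO: LipsyncTTS -> Lipsync -> TTS); calling each base class
        # explicitly lets us sum both prices.
        return Lipsync.get_raw_price(self, state) + TTS.get_raw_price(self, state)

print(LipsyncTTS().get_raw_price({}))  # 7.0
```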
From bff6c17e7cf0ad3abd0f71a32e0665d5fe547a6f Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Wed, 4 Oct 2023 17:26:25 +0530
Subject: [PATCH 05/20] LipsyncTTS: only add TTS cost when TTS provider is 11 labs

---
 recipes/LipsyncTTS.py   | 25 ++++++++++++++++---------
 recipes/TextToSpeech.py |  6 +++---
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/recipes/LipsyncTTS.py b/recipes/LipsyncTTS.py
index bb43c39b9..e0024f2d0 100644
--- a/recipes/LipsyncTTS.py
+++ b/recipes/LipsyncTTS.py
@@ -1,3 +1,4 @@
+import textwrap
 import typing
 
 from pydantic import BaseModel
@@ -6,7 +7,7 @@
 from bots.models import Workflow
 from recipes.DeforumSD import safety_checker
 from recipes.Lipsync import LipsyncPage
-from recipes.TextToSpeech import TextToSpeechPage
+from recipes.TextToSpeech import TextToSpeechPage, TextToSpeechProviders
 from daras_ai_v2.loom_video_widget import youtube_video
 
 DEFAULT_LIPSYNC_TTS_META_IMG = "https://storage.googleapis.com/dara-c1b52.appspot.com/daras_ai/media/assets/lipsync_meta_img.gif"
@@ -150,17 +151,23 @@ def render_output(self):
         self.render_example(st.session_state)
 
     def get_raw_price(self, state: dict):
-        return LipsyncPage.get_raw_price(self, state) + \
-            TextToSpeechPage.get_raw_price(self, state)
+        # _get_tts_provider comes from TextToSpeechPage
+        if self._get_tts_provider(state) == TextToSpeechProviders.ELEVEN_LABS:
+            return LipsyncPage.get_raw_price(self, state) + \
+                TextToSpeechPage.get_raw_price(self, state)
+        else:
+            return LipsyncPage.get_raw_price(self, state)
 
     def additional_notes(self):
-        notes = f"""
-*Cost = Lipsync Cost + TTS Cost*
-"""
-        if lipsync_notes := LipsyncPage.additional_notes(self):
-            notes += "\n" + f"*Lipsync* {lipsync_notes}"
+        lipsync_notes = LipsyncPage.additional_notes(self)
         if tts_notes := TextToSpeechPage.additional_notes(self):
-            notes += "\n" + f"*TTS* {tts_notes}"
+            notes = textwrap.dedent(f"""\
+            - *Lipsync* {lipsync_notes.strip()}
+            - *TTS* {tts_notes.strip()}
+            """)
+        else:
+            notes = lipsync_notes
+
         return notes
 
     def render_usage_guide(self):

diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py
index 592f2ed7f..77773ca3f 100644
--- a/recipes/TextToSpeech.py
+++ b/recipes/TextToSpeech.py
@@ -126,8 +126,8 @@ def render_output(self):
 
     def _get_eleven_labs_price(self, state: dict):
         text = state.get("text_prompt", "")
-        # 4 credits for 10 words ~ 50 chars
-        return (len(text) / 50) * 4
+        # 0.079 credits / character ~ 4 credits / 10 words
+        return len(text) * 0.079
 
     def _get_tts_provider(self, state: dict):
@@ -140,7 +140,7 @@ def additional_notes(self):
         tts_provider = st.session_state.get("tts_provider")
         if tts_provider == TextToSpeechProviders.ELEVEN_LABS.name:
             return """
-                *Eleven Labs cost ≈ 4 credits for every 50 characters ≈ 0.4 credits per word*
+                *Eleven Labs cost ≈ 4 credits per 10 words*
             """
         else:
             return ""

From 0ff0f4b0e3a2f4f1bbd4f3ee3acf646cfcf2444c Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Wed, 4 Oct 2023 17:33:15 +0530
Subject: [PATCH 06/20] Re-format with black

---
 recipes/LipsyncTTS.py   | 10 +++++-----
 recipes/TextToSpeech.py |  4 +---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/recipes/LipsyncTTS.py b/recipes/LipsyncTTS.py
index e0024f2d0..d250f746b 100644
--- a/recipes/LipsyncTTS.py
+++ b/recipes/LipsyncTTS.py
@@ -1,4 +1,3 @@
-import textwrap
 import typing
 
 from pydantic import BaseModel
@@ -153,18 +152,19 @@ def render_output(self):
     def get_raw_price(self, state: dict):
         # _get_tts_provider comes from TextToSpeechPage
         if self._get_tts_provider(state) == TextToSpeechProviders.ELEVEN_LABS:
-            return LipsyncPage.get_raw_price(self, state) + \
-                TextToSpeechPage.get_raw_price(self, state)
+            return LipsyncPage.get_raw_price(
+                self, state
+            ) + TextToSpeechPage.get_raw_price(self, state)
         else:
             return LipsyncPage.get_raw_price(self, state)
 
     def additional_notes(self):
         lipsync_notes = LipsyncPage.additional_notes(self)
         if tts_notes := TextToSpeechPage.additional_notes(self):
-            notes = textwrap.dedent(f"""\
+            notes = f"""
             - *Lipsync* {lipsync_notes.strip()}
             - *TTS* {tts_notes.strip()}
-            """)
+            """
         else:
             notes = lipsync_notes
 
         return notes

diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py
index 77773ca3f..1e5014986 100644
--- a/recipes/TextToSpeech.py
+++ b/recipes/TextToSpeech.py
@@ -130,9 +130,7 @@ def _get_eleven_labs_price(self, state: dict):
         return len(text) * 0.079
 
     def _get_tts_provider(self, state: dict):
-        tts_provider = state.get(
-            "tts_provider", TextToSpeechProviders.UBERDUCK.name
-        )
+        tts_provider = state.get("tts_provider", TextToSpeechProviders.UBERDUCK.name)
         # TODO: validate tts_provider before state lookup
         return TextToSpeechProviders[tts_provider]

From cd800d753b1412e9baa1afb8497bce28e7c33739 Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Wed, 4 Oct 2023 18:20:51 +0530
Subject: [PATCH 07/20] Use format_func and sane_defaults properly

---
 .../text_to_speech_settings_widgets.py | 18 ++++++++---------
 recipes/TextToSpeech.py                | 20 +++++++++++--------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py
index 5fdaa8643..471049ce6 100644
--- a/daras_ai_v2/text_to_speech_settings_widgets.py
+++ b/daras_ai_v2/text_to_speech_settings_widgets.py
@@ -29,16 +29,16 @@ class TextToSpeechProviders(Enum):
 
 # mapping from title in UI -> voice ID
 ELEVEN_LABS_VOICES = {
-    "Indian Man With A Deep Voice": "syN7Wt0nfLXAqq9LI9R6",
-    "[aaa] Ramakrishnan - INDIAN TEACHER. CEREBRAL AND THOUGHTFUL PRONUNCIATION 🔥": "99XLBDkANYZ0Ww7MKN8d",
-    "[ElevenVoices] Riya - Indian Female Young Adult": "TSh1KthMgVjY3BfYFkwS",
-    "Wise Grandma slow, seductive, pleasant, mature": "dvRCseVM0rT31i8clSVy",
-    "[ElevenVoices] Rahul - Indian Male Young Adult": "prqKmUi0Zo7WBmA81Vy4",
+    "syN7Wt0nfLXAqq9LI9R6": "Indian Man With A Deep Voice",
+    "99XLBDkANYZ0Ww7MKN8d": "[aaa] Ramakrishnan - INDIAN TEACHER. CEREBRAL AND THOUGHTFUL PRONUNCIATION 🔥",
+    "TSh1KthMgVjY3BfYFkwS": "[ElevenVoices] Riya - Indian Female Young Adult",
+    "dvRCseVM0rT31i8clSVy": "Wise Grandma slow, seductive, pleasant, mature",
+    "prqKmUi0Zo7WBmA81Vy4": "[ElevenVoices] Rahul - Indian Male Young Adult",
 }
 
 ELEVEN_LABS_MODELS = {
-    "Multilingual V2": "eleven_multilingual_v2",
-    "English V1 - Low latency English TTS": "eleven_monolingual_v1",
+    "eleven_multilingual_v2": "Multilingual V2",
+    "eleven_monolingual_v1": "English V1 - Low latency English TTS",
 }
 
 
@@ -165,7 +165,7 @@ def text_to_speech_settings():
                     ###### Voice name (ElevenLabs)
                     """,
                     key="elevenlabs_voice_name",
-                    format_func=str,
+                    format_func=ELEVEN_LABS_VOICES.__getitem__,
                     options=ELEVEN_LABS_VOICES.keys(),
                 )
 
@@ -203,8 +203,8 @@ def text_to_speech_settings():
                     ###### Voice Model
                     """,
                     key="elevenlabs_model",
+                    format_func=ELEVEN_LABS_MODELS.__getitem__,
                     options=ELEVEN_LABS_MODELS.keys(),
-                    format_func=str,
                 )

diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py
index 1e5014986..de2c86319 100644
--- a/recipes/TextToSpeech.py
+++ b/recipes/TextToSpeech.py
@@ -42,6 +42,8 @@ class TextToSpeechPage(BasePage):
         "google_speaking_rate": 1.0,
         "uberduck_voice_name": "Aiden Botha",
         "uberduck_speaking_rate": 1.0,
+        "elevenlabs_voice_name": "syN7Wt0nfLXAqq9LI9R6",
+        "elevenlabs_model": "eleven_multilingual_v2",
         "elevenlabs_stability": 0.5,
         "elevenlabs_similarity_boost": 0.75,
     }
@@ -250,17 +252,19 @@ def run(self, state: dict):
 
             case TextToSpeechProviders.ELEVEN_LABS:
                 # default to first voice ID in the mapping
-                default_voice_id = next(iter(ELEVEN_LABS_VOICES.values()))
+                default_voice_id = next(iter(ELEVEN_LABS_VOICES))
 
                 # default to first model ID in the mapping
-                default_voice_model = next(iter(ELEVEN_LABS_MODELS.values()))
+                default_voice_model = next(iter(ELEVEN_LABS_MODELS))
+
+                voice_id = state.get("elevenlabs_voice_name", default_voice_id)
+                voice_model = state.get("elevenlabs_model", default_voice_model)
+
+                if voice_id not in ELEVEN_LABS_VOICES:
+                    raise ValueError(f"Invalid voice_name: {voice_id}")
+                if voice_model not in ELEVEN_LABS_MODELS:
+                    raise ValueError(f"Invalid model: {voice_model}")
 
-                voice_id = ELEVEN_LABS_VOICES.get(
-                    state.get("elevenlabs_voice_name"), default_voice_id
-                )
-                voice_model = ELEVEN_LABS_MODELS.get(
-                    state.get("elevenlabs_model"), default_voice_model
-                )
                 stability = state.get("elevenlabs_stability", 0.5)
                 similarity_boost = state.get("elevenlabs_similarity_boost", 0.75)
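The `format_func` switch is the standard selectbox idiom: store stable IDs in state, render human-readable titles. A condensed sketch of what `format_func=ELEVEN_LABS_MODELS.__getitem__` does with an ID -> title mapping:

```python
ELEVEN_LABS_MODELS = {
    "eleven_multilingual_v2": "Multilingual V2",
    "eleven_monolingual_v1": "English V1 - Low latency English TTS",
}

# options are the dict keys (what gets saved in session state / the API),
# and the bound method __getitem__ maps each key to its display title:
format_func = ELEVEN_LABS_MODELS.__getitem__
for option in ELEVEN_LABS_MODELS.keys():
    print(f"{format_func(option)!r} is shown in the UI, {option!r} is stored")
```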
From e02516de30345b60964590d46b627b7b5d9835b0 Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Wed, 4 Oct 2023 18:25:34 +0530
Subject: [PATCH 08/20] Fix comments to reflect updates

---
 daras_ai_v2/text_to_speech_settings_widgets.py | 4 ++--
 recipes/TextToSpeech.py                        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py
index 471049ce6..869069517 100644
--- a/daras_ai_v2/text_to_speech_settings_widgets.py
+++ b/daras_ai_v2/text_to_speech_settings_widgets.py
@@ -27,7 +27,7 @@ class TextToSpeechProviders(Enum):
     ELEVEN_LABS = "ElevenLabs"
 
 
-# mapping from title in UI -> voice ID
+# Mapping from Eleven Labs Voice ID -> Title in UI
 ELEVEN_LABS_VOICES = {
     "syN7Wt0nfLXAqq9LI9R6": "Indian Man With A Deep Voice",
     "99XLBDkANYZ0Ww7MKN8d": "[aaa] Ramakrishnan - INDIAN TEACHER. CEREBRAL AND THOUGHTFUL PRONUNCIATION 🔥",
@@ -36,12 +36,12 @@ class TextToSpeechProviders(Enum):
     "prqKmUi0Zo7WBmA81Vy4": "[ElevenVoices] Rahul - Indian Male Young Adult",
 }
 
+# Mapping from Model ID -> Title in UI
 ELEVEN_LABS_MODELS = {
     "eleven_multilingual_v2": "Multilingual V2",
     "eleven_monolingual_v1": "English V1 - Low latency English TTS",
 }
 
-
 BARK_SUPPORTED_LANGS = [
     ("English", "en"),
     ("German", "de"),

diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py
index de2c86319..fc97ad36e 100644
--- a/recipes/TextToSpeech.py
+++ b/recipes/TextToSpeech.py
@@ -133,14 +133,14 @@ def _get_eleven_labs_price(self, state: dict):
 
     def _get_tts_provider(self, state: dict):
         tts_provider = state.get("tts_provider", TextToSpeechProviders.UBERDUCK.name)
-        # TODO: validate tts_provider before state lookup
+        # TODO: validate tts_provider before lookup?
         return TextToSpeechProviders[tts_provider]
 
     def additional_notes(self):
         tts_provider = st.session_state.get("tts_provider")
         if tts_provider == TextToSpeechProviders.ELEVEN_LABS.name:
             return """
-                *Eleven Labs cost ≈ 4 credits per 10 words*
+            *Eleven Labs cost ≈ 4 credits per 10 words*
             """
         else:
             return ""

From 24a0d81df921fc84337999f3572fb3056da2e7e0 Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Fri, 6 Oct 2023 21:32:53 +0530
Subject: [PATCH 09/20] update eleven labs to have premade voices by default

earlier, generated voices from the voice labs were displayed.

another auxiliary change is made in this commit:
- the API for eleven labs now accepts elevenlabs_voice_name rather than
  elevenlabs_voice_id. this makes the API easier to use.
---
 .../text_to_speech_settings_widgets.py | 48 ++++++++++++++++---
 recipes/TextToSpeech.py                | 17 +++----
 2 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py
index 869069517..5a73db7f1 100644
--- a/daras_ai_v2/text_to_speech_settings_widgets.py
+++ b/daras_ai_v2/text_to_speech_settings_widgets.py
@@ -27,13 +27,47 @@ class TextToSpeechProviders(Enum):
     ELEVEN_LABS = "ElevenLabs"
 
 
-# Mapping from Eleven Labs Voice ID -> Title in UI
+# Mapping from Eleven Labs Voice Name -> Voice ID
 ELEVEN_LABS_VOICES = {
-    "syN7Wt0nfLXAqq9LI9R6": "Indian Man With A Deep Voice",
-    "99XLBDkANYZ0Ww7MKN8d": "[aaa] Ramakrishnan - INDIAN TEACHER. CEREBRAL AND THOUGHTFUL PRONUNCIATION 🔥",
-    "TSh1KthMgVjY3BfYFkwS": "[ElevenVoices] Riya - Indian Female Young Adult",
-    "dvRCseVM0rT31i8clSVy": "Wise Grandma slow, seductive, pleasant, mature",
-    "prqKmUi0Zo7WBmA81Vy4": "[ElevenVoices] Rahul - Indian Male Young Adult",
+  "Rachel": "21m00Tcm4TlvDq8ikWAM",
+  "Clyde": "2EiwWnXFnvU5JabPnv8n",
+  "Domi": "AZnzlk1XvdvUeBnXmlld",
+  "Dave": "CYw3kZ02Hs0563khs1Fj",
+  "Fin": "D38z5RcWu1voky8WS1ja",
+  "Bella": "EXAVITQu4vr4xnSDxMaL",
+  "Antoni": "ErXwobaYiN019PkySvjV",
+  "Thomas": "GBv7mTt0atIp3Br8iCZE",
+  "Charlie": "IKne3meq5aSn9XLyUdCD",
+  "Emily": "LcfcDJNUP1GQjkzn1xUU",
+  "Elli": "MF3mGyEYCl7XYWbV9V6O",
+  "Callum": "N2lVS1w4EtoT3dr4eOWO",
+  "Patrick": "ODq5zmih8GrVes37Dizd",
+  "Harry": "SOYHLrjzK2X1ezoPC6cr",
+  "Liam": "TX3LPaxmHKxFdv7VOQHJ",
+  "Dorothy": "ThT5KcBeYPX3keUQqHPh",
+  "Josh": "TxGEqnHWrfWFTfGW9XjX",
+  "Arnold": "VR6AewLTigWG4xSOukaG",
+  "Charlotte": "XB0fDUnXU5powFXDhCwa",
+  "Matilda": "XrExE9yKIg1WjnnlVkGX",
+  "Matthew": "Yko7PKHZNXotIFUBG7I9",
+  "James": "ZQe5CZNOzWyzPSCn5a3c",
+  "Joseph": "Zlb1dXrM653N07WRdFW3",
+  "Jeremy": "bVMeCyTHy58xNoL34h3p",
+  "Michael": "flq6f7yk4E4fJM5XTYuZ",
+  "Ethan": "g5CIjZEefAph4nQFvHAz",
+  "Gigi": "jBpfuIE2acCO8z3wKNLl",
+  "Freya": "jsCqWAovK2LkecY7zXl4",
+  "Grace": "oWAxZDx7w5VEj9dCyTzz",
+  "Daniel": "onwK4e9ZLuTAKqWW03F9",
+  "Serena": "pMsXgVXv3BLzUgSXRplE",
+  "Adam": "pNInz6obpgDQGcFmaJgB",
+  "Nicole": "piTKgcLEGmPE4e6mEKli",
+  "Jessie": "t0jbNlBVZ17f02VDIeMI",
+  "Ryan": "wViXBPUzp2ZZixB1xQuM",
+  "Sam": "yoZ06aMxZJJ28mfd3POQ",
+  "Glinda": "z9fAnlkpzviPz146aGWa",
+  "Giovanni": "zcAOhNBS3c14rBihAFp1",
+  "Mimi": "zrHiDhphv9ZnVXBqCLjz",
 }
 
 # Mapping from Model ID -> Title in UI
@@ -165,7 +199,7 @@ def text_to_speech_settings():
                     ###### Voice name (ElevenLabs)
                     """,
                     key="elevenlabs_voice_name",
-                    format_func=ELEVEN_LABS_VOICES.__getitem__,
+                    format_func=str,
                     options=ELEVEN_LABS_VOICES.keys(),
                 )

diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py
index fc97ad36e..ae534a3e1 100644
--- a/recipes/TextToSpeech.py
+++ b/recipes/TextToSpeech.py
@@ -42,7 +42,7 @@ class TextToSpeechPage(BasePage):
         "google_speaking_rate": 1.0,
         "uberduck_voice_name": "Aiden Botha",
         "uberduck_speaking_rate": 1.0,
-        "elevenlabs_voice_name": "syN7Wt0nfLXAqq9LI9R6",
+        "elevenlabs_voice_name": "Rachel",
         "elevenlabs_model": "eleven_multilingual_v2",
         "elevenlabs_stability": 0.5,
         "elevenlabs_similarity_boost": 0.75,
@@ -251,19 +251,20 @@ def run(self, state: dict):
 
             case TextToSpeechProviders.ELEVEN_LABS:
-                # default to first voice ID in the mapping
-                default_voice_id = next(iter(ELEVEN_LABS_VOICES))
-
-                # default to first model ID in the mapping
+                # default to first in the mapping
                 default_voice_model = next(iter(ELEVEN_LABS_MODELS))
+                default_voice_name = next(iter(ELEVEN_LABS_VOICES))
 
-                voice_id = state.get("elevenlabs_voice_name", default_voice_id)
                 voice_model = state.get("elevenlabs_model", default_voice_model)
+                voice_name = state.get("elevenlabs_voice_name", default_voice_name)
 
-                if voice_id not in ELEVEN_LABS_VOICES:
-                    raise ValueError(f"Invalid voice_name: {voice_id}")
+                # validate voice_model / voice_name
                 if voice_model not in ELEVEN_LABS_MODELS:
                     raise ValueError(f"Invalid model: {voice_model}")
+                if voice_name not in ELEVEN_LABS_VOICES:
+                    raise ValueError(f"Invalid voice_name: {voice_name}")
+                else:
+                    voice_id = ELEVEN_LABS_VOICES[voice_name]
 
                 stability = state.get("elevenlabs_stability", 0.5)
                 similarity_boost = state.get("elevenlabs_similarity_boost", 0.75)
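With the public API now keyed on `elevenlabs_voice_name`, the default-and-validate logic in `run()` is worth isolating; the same steps in a condensed, runnable form (using a throwaway two-entry mapping):

```python
ELEVEN_LABS_VOICES = {
    "Rachel": "21m00Tcm4TlvDq8ikWAM",
    "Clyde": "2EiwWnXFnvU5JabPnv8n",
}

def resolve_voice_id(state: dict) -> str:
    # dicts preserve insertion order, so next(iter(...)) is the first entry
    default_voice_name = next(iter(ELEVEN_LABS_VOICES))
    voice_name = state.get("elevenlabs_voice_name", default_voice_name)
    if voice_name not in ELEVEN_LABS_VOICES:
        raise ValueError(f"Invalid voice_name: {voice_name}")
    return ELEVEN_LABS_VOICES[voice_name]

print(resolve_voice_id({}))                                  # Rachel's voice ID
print(resolve_voice_id({"elevenlabs_voice_name": "Clyde"}))  # Clyde's voice ID
```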
From d3c330062958c1b259e617a1fa706ca3b1db1d89 Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Fri, 6 Oct 2023 22:42:32 +0530
Subject: [PATCH 10/20] Show Eleven Labs as second TTS provider in the list

---
 daras_ai_v2/text_to_speech_settings_widgets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py
index 5a73db7f1..b435ee462 100644
--- a/daras_ai_v2/text_to_speech_settings_widgets.py
+++ b/daras_ai_v2/text_to_speech_settings_widgets.py
@@ -22,9 +22,9 @@ class TextToSpeechProviders(Enum):
     GOOGLE_TTS = "Google Cloud Text-to-Speech"
+    ELEVEN_LABS = "ElevenLabs"
     UBERDUCK = "uberduck.ai"
     BARK = "Bark (suno-ai)"
-    ELEVEN_LABS = "ElevenLabs"

From 47483112ef01a381c8b269c3566d0bf98a9a1954 Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Fri, 6 Oct 2023 22:42:55 +0530
Subject: [PATCH 11/20] Move voice_model to below voice_name for Eleven Labs

---
 daras_ai_v2/text_to_speech_settings_widgets.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py
index b435ee462..d40b2ab35 100644
--- a/daras_ai_v2/text_to_speech_settings_widgets.py
+++ b/daras_ai_v2/text_to_speech_settings_widgets.py
@@ -202,6 +202,14 @@ def text_to_speech_settings():
                     format_func=str,
                     options=ELEVEN_LABS_VOICES.keys(),
                 )
+                st.selectbox(
+                    """
+                    ###### Voice Model
+                    """,
+                    key="elevenlabs_model",
+                    format_func=ELEVEN_LABS_MODELS.__getitem__,
+                    options=ELEVEN_LABS_MODELS.keys(),
+                )
From 48e58af9ea974dcea41797bdd6174ba9dcf4b65b Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Fri, 6 Oct 2023 22:43:15 +0530
Subject: [PATCH 12/20] Show Eleven Labs supported languages

---
 daras_ai_v2/text_to_speech_settings_widgets.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py
index d40b2ab35..55df22607 100644
--- a/daras_ai_v2/text_to_speech_settings_widgets.py
+++ b/daras_ai_v2/text_to_speech_settings_widgets.py
@@ -76,6 +76,9 @@ class TextToSpeechProviders(Enum):
     "eleven_monolingual_v1": "English V1 - Low latency English TTS",
 }
 
+ELEVEN_LABS_SUPPORTED_LANGS = ["English", "Chinese", "Spanish", "Hindi", "Portuguese", "French", "German", "Japanese", "Arabic", "Korean", "Indonesian", "Italian", "Dutch", "Turkish", "Polish", "Swedish", "Filipino", "Malay", "Romanian", "Ukrainian", "Greek", "Czech", "Danish", "Finnish", "Bulgarian", "Croatian", "Slovak", "Tamil"]
+
+
 BARK_SUPPORTED_LANGS = [
     ("English", "en"),
     ("German", "de"),
@@ -238,16 +241,9 @@ def text_to_speech_settings():
                     key="elevenlabs_similarity_boost",
                 )
 
-            col1, _ = st.columns(2)
-            with col1:
-                st.selectbox(
-                    """
-                    ###### Voice Model
-                    """,
-                    key="elevenlabs_model",
-                    format_func=ELEVEN_LABS_MODELS.__getitem__,
-                    options=ELEVEN_LABS_MODELS.keys(),
-                )
+            with st.expander("Eleven Labs Supported Languages"):
+                st.caption("With Multilingual V2 voice model")
+                st.caption(", ".join(ELEVEN_LABS_SUPPORTED_LANGS))
 
 
 @st.cache_data()
"Korean", "Indonesian", "Italian", "Dutch", "Turkish", "Polish", "Swedish", "Filipino", "Malay", "Romanian", "Ukrainian", "Greek", "Czech", "Danish", "Finnish", "Bulgarian", "Croatian", "Slovak", "Tamil"] +ELEVEN_LABS_SUPPORTED_LANGS = [ + "English", + "Chinese", + "Spanish", + "Hindi", + "Portuguese", + "French", + "German", + "Japanese", + "Arabic", + "Korean", + "Indonesian", + "Italian", + "Dutch", + "Turkish", + "Polish", + "Swedish", + "Filipino", + "Malay", + "Romanian", + "Ukrainian", + "Greek", + "Czech", + "Danish", + "Finnish", + "Bulgarian", + "Croatian", + "Slovak", + "Tamil", +] BARK_SUPPORTED_LANGS = [ From f2b744ef2259be0d33a6f06ca900196c1d47ea25 Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Fri, 6 Oct 2023 22:57:34 +0530 Subject: [PATCH 14/20] Make text in "Eleven Labs Supported Languages" section smaller This commit also introduces another chagne in gooey_ui that accepts props for `caption` and `expander` as kwargs. --- daras_ai_v2/text_to_speech_settings_widgets.py | 13 ++++++++++--- gooey_ui/components.py | 6 ++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py index dd05f6d5f..153422c9a 100644 --- a/daras_ai_v2/text_to_speech_settings_widgets.py +++ b/daras_ai_v2/text_to_speech_settings_widgets.py @@ -270,9 +270,16 @@ def text_to_speech_settings(): key="elevenlabs_similarity_boost", ) - with st.expander("Eleven Labs Supported Languages"): - st.caption("With Multilingual V2 voice model") - st.caption(", ".join(ELEVEN_LABS_SUPPORTED_LANGS)) + with st.expander( + "Eleven Labs Supported Languages", + style={"fontSize": "0.9rem", "textDecoration": "underline"}, + ): + st.caption( + "With Multilingual V2 voice model", style={"fontSize": "0.8rem"} + ) + st.caption( + ", ".join(ELEVEN_LABS_SUPPORTED_LANGS), style={"fontSize": "0.8rem"} + ) @st.cache_data() diff --git a/gooey_ui/components.py b/gooey_ui/components.py index ea3bbedda..f6ea3b6d5 100644 --- a/gooey_ui/components.py +++ b/gooey_ui/components.py @@ -131,7 +131,8 @@ def success(body: str, icon: str = "✅", *, unsafe_allow_html=False): def caption(body: str, **props): - markdown(body, style={"fontSize": "0.9rem"}, className="text-muted", **props) + style = props.setdefault("style", {"fontSize": "0.9rem"}) + markdown(body, className="text-muted", **props) def option_menu(*args, options, **kwargs): @@ -391,12 +392,13 @@ def button( form_submit_button = button -def expander(label: str, *, expanded: bool = False): +def expander(label: str, *, expanded: bool = False, **props): node = state.RenderTreeNode( name="expander", props=dict( label=dedent(label), open=expanded, + **props, ), ) node.mount() From bcc55c35c34d5f9208fda9029a87f1e9438573c9 Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:23:37 +0530 Subject: [PATCH 15/20] Include 11labs pricing for /copilot recipe --- recipes/VideoBots.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/recipes/VideoBots.py b/recipes/VideoBots.py index d5f405ed6..a54e55d50 100644 --- a/recipes/VideoBots.py +++ b/recipes/VideoBots.py @@ -546,6 +546,30 @@ def render_steps(self): st.write(f"**Generated Audio {idx + 1}**") st.audio(audio_url) + def get_raw_price(self, state: dict): + match state.get("tts_provider"): + case TextToSpeechProviders.ELEVEN_LABS.name: + output_text_list = state.get( + "raw_tts_text", 
state.get("raw_output_text", []) + ) + tts_state = {"text_prompt": "".join(output_text_list)} + return super().get_raw_price(state) + TextToSpeechPage().get_raw_price( + tts_state + ) + case _: + return super().get_raw_price(state) + + def additional_notes(self): + tts_provider = st.session_state.get("tts_provider") + match tts_provider: + case TextToSpeechProviders.ELEVEN_LABS.name: + return f""" + - *Base cost = {super().get_raw_price(st.session_state)} credits* + - *Additional Eleven Labs cost ≈ 4 credits per 10 words of the output* + """ + case _: + return "" + def run(self, state: dict) -> typing.Iterator[str | None]: request: VideoBotsPage.RequestModel = self.RequestModel.parse_obj(state) From 0f1d98a081516820e1e0beab8d3f8c988fc1cd49 Mon Sep 17 00:00:00 2001 From: Dev Aggarwal Date: Mon, 9 Oct 2023 20:08:25 +0530 Subject: [PATCH 16/20] add eleven labs defaults fix eleven labs on videobots --- recipes/LipsyncTTS.py | 9 +++++++++ recipes/VideoBots.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/recipes/LipsyncTTS.py b/recipes/LipsyncTTS.py index d250f746b..d31171596 100644 --- a/recipes/LipsyncTTS.py +++ b/recipes/LipsyncTTS.py @@ -17,6 +17,13 @@ class LipsyncTTSPage(LipsyncPage, TextToSpeechPage): workflow = Workflow.LIPSYNC_TTS slug_versions = ["LipsyncTTS", "lipsync-maker"] + sane_defaults = { + "elevenlabs_voice_name": "Rachel", + "elevenlabs_model": "eleven_multilingual_v2", + "elevenlabs_stability": 0.5, + "elevenlabs_similarity_boost": 0.75, + } + class RequestModel(BaseModel): input_face: str input_audio: str | None @@ -37,6 +44,8 @@ class RequestModel(BaseModel): google_speaking_rate: float | None google_pitch: float | None + bark_history_prompt: str | None + elevenlabs_voice_name: str | None elevenlabs_model: str | None elevenlabs_stability: float | None diff --git a/recipes/VideoBots.py b/recipes/VideoBots.py index a54e55d50..98682aef6 100644 --- a/recipes/VideoBots.py +++ b/recipes/VideoBots.py @@ -162,6 +162,10 @@ class VideoBotsPage(BasePage): "google_speaking_rate": 1.0, "uberduck_voice_name": "Aiden Botha", "uberduck_speaking_rate": 1.0, + "elevenlabs_voice_name": "Rachel", + "elevenlabs_model": "eleven_multilingual_v2", + "elevenlabs_stability": 0.5, + "elevenlabs_similarity_boost": 0.75, # gpt3 "selected_model": LargeLanguageModels.text_davinci_003.name, "avoid_repetition": True, @@ -200,6 +204,11 @@ class RequestModel(BaseModel): google_voice_name: str | None google_speaking_rate: float | None google_pitch: float | None + bark_history_prompt: str | None + elevenlabs_voice_name: str | None + elevenlabs_model: str | None + elevenlabs_stability: float | None + elevenlabs_similarity_boost: float | None # llm settings selected_model: typing.Literal[ From 3f03aeb629a1c783ecf5a05f4ae2fab84bd037cb Mon Sep 17 00:00:00 2001 From: Dev Aggarwal Date: Thu, 5 Oct 2023 19:14:47 +0530 Subject: [PATCH 17/20] move audio-ldm, bark, dis, u2net, nemo_asr to k8s use replicate for gfpgan add gte models --- daras_ai_v2/asr.py | 20 ++++----- daras_ai_v2/face_restoration.py | 15 ++++--- daras_ai_v2/gpu_server.py | 32 ++------------- daras_ai_v2/image_segmentation.py | 35 ++++++++++------ daras_ai_v2/settings.py | 1 - daras_ai_v2/stable_diffusion.py | 8 ++-- recipes/Text2Audio.py | 68 ++++++++++++------------------- recipes/TextToSpeech.py | 38 ++++++----------- recipes/embeddings_page.py | 8 ++++ 9 files changed, 92 insertions(+), 133 deletions(-) diff --git a/daras_ai_v2/asr.py b/daras_ai_v2/asr.py index 0dc03b00a..1ed8e94bf 100644 --- a/daras_ai_v2/asr.py +++ 
From bcc55c35c34d5f9208fda9029a87f1e9438573c9 Mon Sep 17 00:00:00 2001
From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com>
Date: Mon, 9 Oct 2023 18:23:37 +0530
Subject: [PATCH 15/20] Include 11labs pricing for /copilot recipe

---
 recipes/VideoBots.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/recipes/VideoBots.py b/recipes/VideoBots.py
index d5f405ed6..a54e55d50 100644
--- a/recipes/VideoBots.py
+++ b/recipes/VideoBots.py
@@ -546,6 +546,30 @@ def render_steps(self):
                 st.write(f"**Generated Audio {idx + 1}**")
                 st.audio(audio_url)
 
+    def get_raw_price(self, state: dict):
+        match state.get("tts_provider"):
+            case TextToSpeechProviders.ELEVEN_LABS.name:
+                output_text_list = state.get(
+                    "raw_tts_text", state.get("raw_output_text", [])
+                )
+                tts_state = {"text_prompt": "".join(output_text_list)}
+                return super().get_raw_price(state) + TextToSpeechPage().get_raw_price(
+                    tts_state
+                )
+            case _:
+                return super().get_raw_price(state)
+
+    def additional_notes(self):
+        tts_provider = st.session_state.get("tts_provider")
+        match tts_provider:
+            case TextToSpeechProviders.ELEVEN_LABS.name:
+                return f"""
+                    - *Base cost = {super().get_raw_price(st.session_state)} credits*
+                    - *Additional Eleven Labs cost ≈ 4 credits per 10 words of the output*
+                """
+            case _:
+                return ""
+
     def run(self, state: dict) -> typing.Iterator[str | None]:
         request: VideoBotsPage.RequestModel = self.RequestModel.parse_obj(state)
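The `setdefault` trick in `caption` is the crux of this change: the wrapper supplies a default style, but a caller-passed `style` wins. A self-contained sketch of the same kwargs plumbing (the `markdown` sink below is a stand-in for the real render-tree call):

```python
def markdown(body: str, **props) -> None:
    print(body, props)  # stand-in for gooey_ui's real markdown renderer

def caption(body: str, **props):
    # fills in a style only when the caller didn't pass one
    props.setdefault("style", {"fontSize": "0.9rem"})
    markdown(body, className="text-muted", **props)

caption("plain caption")                                  # gets the 0.9rem default
caption("smaller caption", style={"fontSize": "0.8rem"})  # caller's style wins
```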
From 0f1d98a081516820e1e0beab8d3f8c988fc1cd49 Mon Sep 17 00:00:00 2001
From: Dev Aggarwal
Date: Mon, 9 Oct 2023 20:08:25 +0530
Subject: [PATCH 16/20] add eleven labs defaults

fix eleven labs on videobots
---
 recipes/LipsyncTTS.py | 9 +++++++++
 recipes/VideoBots.py  | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/recipes/LipsyncTTS.py b/recipes/LipsyncTTS.py
index d250f746b..d31171596 100644
--- a/recipes/LipsyncTTS.py
+++ b/recipes/LipsyncTTS.py
@@ -17,6 +17,13 @@ class LipsyncTTSPage(LipsyncPage, TextToSpeechPage):
     workflow = Workflow.LIPSYNC_TTS
     slug_versions = ["LipsyncTTS", "lipsync-maker"]
 
+    sane_defaults = {
+        "elevenlabs_voice_name": "Rachel",
+        "elevenlabs_model": "eleven_multilingual_v2",
+        "elevenlabs_stability": 0.5,
+        "elevenlabs_similarity_boost": 0.75,
+    }
+
     class RequestModel(BaseModel):
         input_face: str
         input_audio: str | None
@@ -37,6 +44,8 @@ class RequestModel(BaseModel):
         google_speaking_rate: float | None
         google_pitch: float | None
 
+        bark_history_prompt: str | None
+
         elevenlabs_voice_name: str | None
         elevenlabs_model: str | None
         elevenlabs_stability: float | None

diff --git a/recipes/VideoBots.py b/recipes/VideoBots.py
index a54e55d50..98682aef6 100644
--- a/recipes/VideoBots.py
+++ b/recipes/VideoBots.py
@@ -162,6 +162,10 @@ class VideoBotsPage(BasePage):
         "google_speaking_rate": 1.0,
         "uberduck_voice_name": "Aiden Botha",
         "uberduck_speaking_rate": 1.0,
+        "elevenlabs_voice_name": "Rachel",
+        "elevenlabs_model": "eleven_multilingual_v2",
+        "elevenlabs_stability": 0.5,
+        "elevenlabs_similarity_boost": 0.75,
         # gpt3
         "selected_model": LargeLanguageModels.text_davinci_003.name,
         "avoid_repetition": True,
@@ -200,6 +204,11 @@ class RequestModel(BaseModel):
         google_voice_name: str | None
         google_speaking_rate: float | None
         google_pitch: float | None
+        bark_history_prompt: str | None
+        elevenlabs_voice_name: str | None
+        elevenlabs_model: str | None
+        elevenlabs_stability: float | None
+        elevenlabs_similarity_boost: float | None
 
         # llm settings
         selected_model: typing.Literal[
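For the copilot recipe the billable TTS text is the bot's own output, so the override concatenates the generated messages and reuses the TTS pricing. The cost path in isolation, assuming the ≈0.079 credits/character rate from patch 05 still applies:

```python
def eleven_labs_price(text: str) -> float:
    # 0.079 credits / character ~ 4 credits / 10 words (see patch 05)
    return len(text) * 0.079

def copilot_tts_price(state: dict) -> float:
    # prefer the dedicated TTS text if present, else the raw LLM output
    output_text_list = state.get("raw_tts_text", state.get("raw_output_text", []))
    return eleven_labs_price("".join(output_text_list))

print(copilot_tts_price({"raw_output_text": ["Hello there! ", "How can I help?"]}))
```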
state["output_audios"] = output_audios = {} for selected_model in request.selected_models: - yield f"Running {Text2AudioModels[selected_model].value}..." - - blobs = [ - storage_blob_for(f"gooey.ai - {request.text_prompt} ({i + 1}).wav") - for i in range(request.num_outputs) - ] - r = requests.post( - str(GpuEndpoints.audio_ldm), - json={ - "pipeline": { - "model_id": "cvssp/audioldm", - "upload_urls": [ - blob.generate_signed_url( - version="v4", - # This URL is valid for 15 minutes - expiration=datetime.timedelta(minutes=30), - # Allow PUT requests using this URL. - method="PUT", - content_type="audio/wav", - ) - for blob in blobs - ], - "seed": request.seed, - }, - "inputs": { - "prompt": [request.text_prompt], - "negative_prompt": [request.negative_prompt] - if request.negative_prompt - else None, - "num_waveforms_per_prompt": request.num_outputs, - "num_inference_steps": request.quality, - "guidance_scale": request.guidance_scale, - "audio_length_in_s": request.duration_sec, - }, - }, + model = Text2AudioModels[selected_model] + model_id = text2audio_model_ids[model] + + yield f"Running {model.value}..." + + output_audios[selected_model] = call_celery_task_outfile( + "audio_ldm", + pipeline=dict( + model_id=model_id, + seed=request.seed, + ), + inputs=dict( + prompt=[request.text_prompt], + negative_prompt=[request.negative_prompt] + if request.negative_prompt + else None, + num_waveforms_per_prompt=request.num_outputs, + num_inference_steps=request.quality, + guidance_scale=request.guidance_scale, + audio_length_in_s=request.duration_sec, + ), + filename=f"gooey.ai - {request.text_prompt}.wav", + content_type="audio/wav", + num_outputs=request.num_outputs, ) - r.raise_for_status() - output_audios[selected_model] = [blob.public_url for blob in blobs] def render_output(self): _render_output(st.session_state) diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py index ae534a3e1..778aba1d4 100644 --- a/recipes/TextToSpeech.py +++ b/recipes/TextToSpeech.py @@ -12,7 +12,7 @@ from daras_ai.image_input import upload_file_from_bytes, storage_blob_for from daras_ai_v2 import settings from daras_ai_v2.base import BasePage -from daras_ai_v2.gpu_server import GpuEndpoints +from daras_ai_v2.gpu_server import GpuEndpoints, call_celery_task_outfile from daras_ai_v2.loom_video_widget import youtube_video from daras_ai_v2.text_to_speech_settings_widgets import ( UBERDUCK_VOICES, @@ -151,30 +151,18 @@ def run(self, state: dict): yield f"Generating audio using {provider.value} ..." match provider: case TextToSpeechProviders.BARK: - blob = storage_blob_for(f"bark_tts.wav") - r = requests.post( - str(GpuEndpoints.bark), - json={ - "pipeline": dict( - upload_urls=[ - blob.generate_signed_url( - version="v4", - # This URL is valid for 15 minutes - expiration=datetime.timedelta(minutes=30), - # Allow PUT requests using this URL. 
- method="PUT", - content_type="audio/wav", - ), - ], - ), - "inputs": dict( - prompt=text.split("---"), - # history_prompt=history_prompt, - ), - }, - ) - r.raise_for_status() - state["audio_url"] = blob.public_url + state["audio_url"] = call_celery_task_outfile( + "bark", + pipeline=dict( + model_id="bark", + ), + inputs=dict( + prompt=text.split("---"), + # history_prompt=history_prompt, + ), + filename="bark_tts.wav", + content_type="audio/wav", + )[0] case TextToSpeechProviders.UBERDUCK: voicemodel_uuid = ( diff --git a/recipes/embeddings_page.py b/recipes/embeddings_page.py index 17af91e98..f2474f7ea 100644 --- a/recipes/embeddings_page.py +++ b/recipes/embeddings_page.py @@ -27,6 +27,14 @@ class EmbeddingModels(models.TextChoices): "Multilingual E5 Large (Liang Wang)", "intfloat/multilingual-e5-large", ) + gte_large = ( + "General Text Embeddings Large (Dingkun Long)", + "thenlper/gte-large", + ) + gte_base = ( + "General Text Embeddings Base (Dingkun Long)", + "thenlper/gte-base", + ) class EmbeddingsPage(BasePage): From c31beb8941c2c5f80321a13f0677431ed4195578 Mon Sep 17 00:00:00 2001 From: Dev Aggarwal Date: Mon, 9 Oct 2023 15:07:14 +0530 Subject: [PATCH 18/20] migrate inpaint models deprecate sd 1.4, deepfloyd, rodent diffusion --- daras_ai_v2/enum_selector_widget.py | 17 ++- daras_ai_v2/gpu_server.py | 4 +- daras_ai_v2/image_segmentation.py | 4 +- daras_ai_v2/img_model_settings_widgets.py | 18 ++-- daras_ai_v2/language_model.py | 4 + daras_ai_v2/stable_diffusion.py | 122 ++++++++++------------ recipes/ObjectInpainting.py | 1 - 7 files changed, 86 insertions(+), 84 deletions(-) diff --git a/daras_ai_v2/enum_selector_widget.py b/daras_ai_v2/enum_selector_widget.py index dcbbd586a..3b9fcb905 100644 --- a/daras_ai_v2/enum_selector_widget.py +++ b/daras_ai_v2/enum_selector_widget.py @@ -14,6 +14,12 @@ def enum_multiselect( checkboxes=True, allow_none=True, ): + try: + deprecated = enum_cls._deprecated() + except AttributeError: + deprecated = set() + enums = [e for e in enum_cls if not e in deprecated] + if checkboxes: if label: st.write(label) @@ -31,13 +37,13 @@ def render(e): else: selected.discard(e.name) - grid_layout(2, enum_cls, render, separator=False) + grid_layout(2, enums, render, separator=False) st.session_state[key] = list(selected) return selected else: return st.multiselect( - options=[e.name for e in enum_cls], + options=[e.name for e in enums], format_func=lambda k: enum_cls[k].value, label=label, key=key, @@ -53,8 +59,13 @@ def enum_selector( exclude: list[E] | None = None, **kwargs, ) -> str: + try: + deprecated = enum_cls._deprecated() + except AttributeError: + deprecated = set() + enums = [e for e in enum_cls if not e in deprecated] label = label or enum_cls.__name__ - options = [e.name for e in enum_cls] + options = [e.name for e in enums] if exclude: options = [o for o in options if o not in exclude] if allow_none: diff --git a/daras_ai_v2/gpu_server.py b/daras_ai_v2/gpu_server.py index 87a4fcc66..2e49a7d6b 100644 --- a/daras_ai_v2/gpu_server.py +++ b/daras_ai_v2/gpu_server.py @@ -107,7 +107,7 @@ def call_celery_task_outfile( task_name: str, *, pipeline: dict, - inputs, + inputs: dict, content_type: str, filename: str, num_outputs: int = 1, @@ -147,7 +147,7 @@ def call_celery_task( task_name: str, *, pipeline: dict, - inputs, + inputs: dict, queue_prefix: str = "gooey-gpu", ): queue = os.path.join(queue_prefix, pipeline["model_id"].strip()).strip("/") diff --git a/daras_ai_v2/image_segmentation.py b/daras_ai_v2/image_segmentation.py index 
From c31beb8941c2c5f80321a13f0677431ed4195578 Mon Sep 17 00:00:00 2001
From: Dev Aggarwal
Date: Mon, 9 Oct 2023 15:07:14 +0530
Subject: [PATCH 18/20] migrate inpaint models

deprecate sd 1.4, deepfloyd, rodent diffusion
---
 daras_ai_v2/enum_selector_widget.py       |  17 ++-
 daras_ai_v2/gpu_server.py                 |   4 +-
 daras_ai_v2/image_segmentation.py         |   4 +-
 daras_ai_v2/img_model_settings_widgets.py |  18 ++--
 daras_ai_v2/language_model.py             |   4 +
 daras_ai_v2/stable_diffusion.py           | 122 ++++++++++------------
 recipes/ObjectInpainting.py               |   1 -
 7 files changed, 86 insertions(+), 84 deletions(-)

diff --git a/daras_ai_v2/enum_selector_widget.py b/daras_ai_v2/enum_selector_widget.py
index dcbbd586a..3b9fcb905 100644
--- a/daras_ai_v2/enum_selector_widget.py
+++ b/daras_ai_v2/enum_selector_widget.py
@@ -14,6 +14,12 @@ def enum_multiselect(
     checkboxes=True,
     allow_none=True,
 ):
+    try:
+        deprecated = enum_cls._deprecated()
+    except AttributeError:
+        deprecated = set()
+    enums = [e for e in enum_cls if not e in deprecated]
+
     if checkboxes:
         if label:
             st.write(label)
@@ -31,13 +37,13 @@ def render(e):
             else:
                 selected.discard(e.name)
 
-        grid_layout(2, enum_cls, render, separator=False)
+        grid_layout(2, enums, render, separator=False)
         st.session_state[key] = list(selected)
         return selected
     else:
         return st.multiselect(
-            options=[e.name for e in enum_cls],
+            options=[e.name for e in enums],
             format_func=lambda k: enum_cls[k].value,
             label=label,
             key=key,
@@ -53,8 +59,13 @@ def enum_selector(
     exclude: list[E] | None = None,
     **kwargs,
 ) -> str:
+    try:
+        deprecated = enum_cls._deprecated()
+    except AttributeError:
+        deprecated = set()
+    enums = [e for e in enum_cls if not e in deprecated]
     label = label or enum_cls.__name__
-    options = [e.name for e in enum_cls]
+    options = [e.name for e in enums]
     if exclude:
         options = [o for o in options if o not in exclude]
     if allow_none:

diff --git a/daras_ai_v2/gpu_server.py b/daras_ai_v2/gpu_server.py
index 87a4fcc66..2e49a7d6b 100644
--- a/daras_ai_v2/gpu_server.py
+++ b/daras_ai_v2/gpu_server.py
@@ -107,7 +107,7 @@ def call_celery_task_outfile(
     task_name: str,
     *,
     pipeline: dict,
-    inputs,
+    inputs: dict,
     content_type: str,
     filename: str,
     num_outputs: int = 1,
@@ -147,7 +147,7 @@ def call_celery_task(
     task_name: str,
     *,
     pipeline: dict,
-    inputs,
+    inputs: dict,
     queue_prefix: str = "gooey-gpu",
 ):
     queue = os.path.join(queue_prefix, pipeline["model_id"].strip()).strip("/")

diff --git a/daras_ai_v2/image_segmentation.py b/daras_ai_v2/image_segmentation.py
index 5515149ca..30a2055bc 100644
--- a/daras_ai_v2/image_segmentation.py
+++ b/daras_ai_v2/image_segmentation.py
@@ -16,7 +16,7 @@ def u2net(input_image: str) -> bytes:
     url = call_celery_task_outfile(
         "u2net",
         pipeline=dict(model_id="u2net"),
-        inputs=[input_image],
+        inputs={"images": [input_image]},
         content_type="image/png",
         filename="u2net.png",
     )[0]
@@ -29,7 +29,7 @@ def dis(input_image: str) -> bytes:
     url = call_celery_task_outfile(
         "dis",
         pipeline=dict(model_id="isnet-general-use.pth"),
-        inputs=[input_image],
+        inputs={"images": [input_image]},
         content_type="image/png",
         filename="dis.png",
     )[0]
diff --git a/daras_ai_v2/img_model_settings_widgets.py b/daras_ai_v2/img_model_settings_widgets.py
index c17bc5e2a..fea120051 100644
--- a/daras_ai_v2/img_model_settings_widgets.py
+++ b/daras_ai_v2/img_model_settings_widgets.py
@@ -81,7 +81,7 @@ def model_selector(
     col1, col2 = st.columns(2)
     with col1:
         selected_model = enum_selector(
-            Img2ImgModels,
+            models_enum,
             label="""
             ### 🤖 Choose your preferred AI Model
             """,
@@ -101,7 +101,7 @@ def model_selector(
         ):
             if "selected_controlnet_model" in st.session_state:
                 st.session_state["selected_controlnet_model"] = None
-        else:
+        elif models_enum is Img2ImgModels:
             enum_multiselect(
                 ControlNetModels,
                 label=controlnet_explanation,
                 key="selected_controlnet_model",
                 checkboxes=False,
                 allow_none=not require_controlnet,
             )
-    with col2:
-        controlnet_settings(
-            extra_explanations=extra_explanations,
-            low_explanation=low_explanation,
-            high_explanation=high_explanation,
-        )
+            with col2:
+                controlnet_settings(
+                    extra_explanations=extra_explanations,
+                    low_explanation=low_explanation,
+                    high_explanation=high_explanation,
+                )
 
     return selected_model
@@ -410,7 +410,7 @@ def prompt_strength_setting(selected_model: str = None):
 
 
 def negative_prompt_setting(selected_model: str = None):
-    if selected_model in [Text2ImgModels.dall_e.name, InpaintingModels.runway_ml.name]:
+    if selected_model in [Text2ImgModels.dall_e.name]:
         return
 
     st.text_area(
diff --git a/daras_ai_v2/language_model.py b/daras_ai_v2/language_model.py
index 408e6bd51..754fc0c01 100644
--- a/daras_ai_v2/language_model.py
+++ b/daras_ai_v2/language_model.py
@@ -54,6 +54,10 @@ class LargeLanguageModels(Enum):
 
     code_davinci_002 = "Codex [Deprecated] (openai)"
 
+    @classmethod
+    def _deprecated(cls):
+        return {cls.code_davinci_002}
+
     def is_chat_model(self) -> bool:
         return self in [
             LargeLanguageModels.gpt_4,
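Note that deprecated members are kept on the enum rather than deleted, so old saved runs that reference them by name still deserialize; `_deprecated()` only hides them from new selections. A small sketch of that property (assuming the patched module is importable):

```python
from daras_ai_v2.language_model import LargeLanguageModels

# a run saved before this patch may still carry the old model name
saved_name = "code_davinci_002"

# lookup by name keeps working, so loading the old run doesn't crash...
model = LargeLanguageModels[saved_name]

# ...but the member is hidden from the pickers via the new hook
assert model in LargeLanguageModels._deprecated()
```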
diff --git a/daras_ai_v2/stable_diffusion.py b/daras_ai_v2/stable_diffusion.py
index e641a8c4f..c2f69d826 100644
--- a/daras_ai_v2/stable_diffusion.py
+++ b/daras_ai_v2/stable_diffusion.py
@@ -2,7 +2,6 @@
 import typing
 from enum import Enum
 
-import replicate
 import requests
 from PIL import Image
 from django.db import models
@@ -28,9 +27,20 @@ class InpaintingModels(Enum):
     sd_2 = "Stable Diffusion v2.1 (stability.ai)"
     runway_ml = "Stable Diffusion v1.5 (RunwayML)"
-    jack_qiao = "Stable Diffusion v1.4 (Jack Qiao)"
     dall_e = "Dall-E (OpenAI)"
 
+    jack_qiao = "Stable Diffusion v1.4 [Deprecated] (Jack Qiao)"
+
+    @classmethod
+    def _deprecated(cls):
+        return {cls.jack_qiao}
+
+
+inpaint_model_ids = {
+    InpaintingModels.sd_2: "stabilityai/stable-diffusion-2-inpainting",
+    InpaintingModels.runway_ml: "runwayml/stable-diffusion-inpainting",
+}
+
 
 class Text2ImgModels(Enum):
     # sd_1_4 = "SD v1.4 (RunwayML)" # Host this too?
@@ -42,10 +52,16 @@ class Text2ImgModels(Enum):
     analog_diffusion = "Analog Diffusion (wavymulder)"
     protogen_5_3 = "Protogen v5.3 (darkstorm2150)"
     dreamlike_2 = "Dreamlike Photoreal 2.0 (dreamlike.art)"
-    rodent_diffusion_1_5 = "Rodent Diffusion 1.5 (NerdyRodent)"
-    jack_qiao = "Stable Diffusion v1.4 (Jack Qiao)"
     dall_e = "Dall-E (OpenAI)"
-    deepfloyd_if = "DeepFloyd IF (stability.ai)"
+    wuerstchen = "Wuerstchen"
+
+    jack_qiao = "Stable Diffusion v1.4 [Deprecated] (Jack Qiao)"
+    deepfloyd_if = "DeepFloyd IF [Deprecated] (stability.ai)"
+    rodent_diffusion_1_5 = "Rodent Diffusion 1.5 [Deprecated] (NerdyRodent)"
+
+    @classmethod
+    def _deprecated(cls):
+        return {cls.jack_qiao, cls.deepfloyd_if, cls.rodent_diffusion_1_5}
 
 
 text2img_model_ids = {
@@ -56,13 +72,7 @@ class Text2ImgModels(Enum):
     Text2ImgModels.openjourney: "prompthero/openjourney",
     Text2ImgModels.openjourney_2: "prompthero/openjourney-v2",
     Text2ImgModels.dreamlike_2: "dreamlike-art/dreamlike-photoreal-2.0",
-    Text2ImgModels.rodent_diffusion_1_5: "devxpy/rodent-diffusion-1-5",
     Text2ImgModels.protogen_5_3: "darkstorm2150/Protogen_v5.3_Official_Release",
-    Text2ImgModels.deepfloyd_if: [
-        "DeepFloyd/IF-I-XL-v1.0",
-        "DeepFloyd/IF-II-L-v1.0",
-        "stabilityai/stable-diffusion-x4-upscaler",
-    ],
 }
 
 
@@ -77,10 +87,15 @@ class Img2ImgModels(Enum):
     analog_diffusion = "Analog Diffusion (wavymulder)"
     protogen_5_3 = "Protogen v5.3 (darkstorm2150)"
     dreamlike_2 = "Dreamlike Photoreal 2.0 (dreamlike.art)"
-    rodent_diffusion_1_5 = "Rodent Diffusion 1.5 (NerdyRodent)"
-    jack_qiao = "Stable Diffusion v1.4 (Jack Qiao)"
     dall_e = "Dall-E (OpenAI)"
 
+    jack_qiao = "Stable Diffusion v1.4 [Deprecated] (Jack Qiao)"
+    rodent_diffusion_1_5 = "Rodent Diffusion 1.5 [Deprecated] (NerdyRodent)"
+
+    @classmethod
+    def _deprecated(cls):
+        return {cls.jack_qiao, cls.rodent_diffusion_1_5}
+
 
 img2img_model_ids = {
     Img2ImgModels.sd_2: "stabilityai/stable-diffusion-2-1",
@@ -91,7 +106,6 @@ class Img2ImgModels(Enum):
     Img2ImgModels.analog_diffusion: "wavymulder/Analog-Diffusion",
     Img2ImgModels.protogen_5_3: "darkstorm2150/Protogen_v5.3_Official_Release",
     Img2ImgModels.dreamlike_2: "dreamlike-art/dreamlike-photoreal-2.0",
-    Img2ImgModels.rodent_diffusion_1_5: "devxpy/rodent-diffusion-1-5",
 }
 
 
@@ -475,42 +489,6 @@ def inpainting(
     _resolution_check(width, height)
 
     match selected_model:
-        case InpaintingModels.sd_2.name:
-            if num_inference_steps == 110:
-                num_inference_steps = 100
-            out_imgs = call_gpu_server_b64(
-                endpoint=GpuEndpoints.sd_2,
-                input_data={
-                    "prompt": prompt,
-                    "width": width,
-                    "height": height,
-                    "num_outputs": num_outputs,
-                    "num_inference_steps": num_inference_steps,
-                    "edit_image": edit_image,
-                    "mask_image": mask,
-                    "guidance_scale": guidance_scale,
-                    "negative_prompt": negative_prompt or "",
-                    "seed": seed,
-                },
-            )
-        case InpaintingModels.runway_ml.name:
-            model = replicate.models.get("andreasjansson/stable-diffusion-inpainting")
-            version = model.versions.get(
-                "8eb2da8345bee796efcd925573f077e36ed5fb4ea3ba240ef70c23cf33f0d848"
-            )
-            out_imgs = [
-                requests.get(img).content
-                for img in version.predict(
-                    prompt=prompt,
-                    image=edit_image,
-                    mask=mask,
-                    invert_mask=True,
-                    num_outputs=num_outputs,
-                    num_inference_steps=num_inference_steps,
-                    guidance_scale=guidance_scale,
-                    seed=seed,
-                )
-            ]
         case InpaintingModels.dall_e.name:
             import openai
 
@@ -522,31 +500,41 @@
             response = openai.Image.create_edit(
                 prompt=prompt,
                 image=image,
-                mask=None,
                 n=num_outputs,
                 size=f"{edge}x{edge}",
                 response_format="b64_json",
             )
             out_imgs = [b64_img_decode(part["b64_json"]) for part in response["data"]]
-        case _:
-            out_imgs = call_gpu_server_b64(
-                endpoint=GpuEndpoints.glid_3_xl_stable,
-                input_data={
-                    "prompt": prompt,
-                    "num_inference_steps": num_inference_steps,
-                    # "init_image": "string",
-                    "edit_image": edit_image,
-                    "mask": mask,
-                    "num_outputs": num_outputs,
-                    "negative_prompt": negative_prompt or "",
-                    # "negative_prompt": "string",
-                    # "outpaint": "expand",
-                    # "skip_timesteps": 0,
-                    "width": width,
-                    "height": height,
-                    "seed": seed,
-                },
-            )
+
+        case InpaintingModels.sd_2.name | InpaintingModels.runway_ml.name:
+            out_imgs_urls = call_sd_multi(
+                "diffusion.inpaint",
+                pipeline={
+                    "model_id": inpaint_model_ids[InpaintingModels[selected_model]],
+                    "seed": seed,
+                    # "scheduler": Schedulers[scheduler].label
+                    # if scheduler
+                    # else "UniPCMultistepScheduler",
+                    "disable_safety_checker": True,
+                },
+                inputs={
+                    "prompt": [prompt],
+                    "negative_prompt": [negative_prompt] if negative_prompt else None,
+                    "num_images_per_prompt": num_outputs,
+                    "num_inference_steps": num_inference_steps,
+                    "guidance_scale": guidance_scale,
+                    "image": [edit_image],
+                    "mask_image": [mask],
+                },
+            )
+            out_imgs = []
+            for url in out_imgs_urls:
+                r = requests.get(url)
+                r.raise_for_status()
+                out_imgs.append(r.content)
+
+        case _:
+            raise ValueError(f"Invalid model {selected_model}")
 
     out_imgs = _recomposite_inpainting_outputs(out_imgs, edit_image_bytes, mask_bytes)
diff --git a/recipes/ObjectInpainting.py b/recipes/ObjectInpainting.py
index 8afd229ca..ff4c81fde 100644
--- a/recipes/ObjectInpainting.py
+++ b/recipes/ObjectInpainting.py
@@ -19,7 +19,6 @@
 )
 from daras_ai_v2.loom_video_widget import youtube_video
 from daras_ai_v2.repositioning import (
-    reposition_object,
     reposition_object_img_bytes,
     repositioning_preview_widget,
 )
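Both remaining Stable Diffusion inpainting models now route through `call_sd_multi` using the Hugging Face ids in `inpaint_model_ids`, and everything else hits the `case _` guard. A sketch of just the resolution step (assuming the patched module is importable; the error mirrors the new guard):

```python
from daras_ai_v2.stable_diffusion import InpaintingModels, inpaint_model_ids

selected_model = "runway_ml"  # e.g. read from saved recipe state

try:
    # enum lookup by name, then enum -> diffusers model id
    model_id = inpaint_model_ids[InpaintingModels[selected_model]]
except KeyError:
    # deprecated (jack_qiao) or unknown names no longer fall
    # through to a dead GPU endpoint
    raise ValueError(f"Invalid model {selected_model}")

assert model_id == "runwayml/stable-diffusion-inpainting"
```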
From 3b80da293d36307ffe1c0f51303a33268696e057 Mon Sep 17 00:00:00 2001
From: Dev Aggarwal
Date: Mon, 9 Oct 2023 20:42:49 +0530
Subject: [PATCH 19/20] remove dead code

---
 daras_ai_v2/gpu_server.py       |  2 --
 daras_ai_v2/stable_diffusion.py | 32 --------------------------------
 2 files changed, 34 deletions(-)

diff --git a/daras_ai_v2/gpu_server.py b/daras_ai_v2/gpu_server.py
index 2e49a7d6b..d4c47a139 100644
--- a/daras_ai_v2/gpu_server.py
+++ b/daras_ai_v2/gpu_server.py
@@ -11,9 +11,7 @@
 
 
 class GpuEndpoints:
-    glid_3_xl_stable = settings.GPU_SERVER_1.copy().set(port=5002)
     deepfloyd_if = settings.GPU_SERVER_1.copy().set(port=5018) / "deepfloyd_if"
-    sd_2 = settings.GPU_SERVER_1.copy().set(port=5011)
 
 
 def call_gpu_server_b64(*, endpoint: str, input_data: dict) -> list[bytes]:
diff --git a/daras_ai_v2/stable_diffusion.py b/daras_ai_v2/stable_diffusion.py
index c2f69d826..7574b0bd4 100644
--- a/daras_ai_v2/stable_diffusion.py
+++ b/daras_ai_v2/stable_diffusion.py
@@ -15,8 +15,6 @@
 )
 from daras_ai_v2.extract_face import rgb_img_to_rgba
 from daras_ai_v2.gpu_server import (
-    call_gpu_server_b64,
-    GpuEndpoints,
     b64_img_decode,
     call_sd_multi,
 )
@@ -271,18 +269,6 @@ def text2img(
     _resolution_check(width, height, max_size=(1024, 1024))
 
     match selected_model:
-        case Text2ImgModels.jack_qiao.name:
-            out_imgs = call_gpu_server_b64(
-                endpoint=GpuEndpoints.glid_3_xl_stable,
-                input_data={
-                    "prompt": prompt,
-                    "num_inference_steps": num_inference_steps,
-                    "num_outputs": num_outputs,
-                    "negative_prompt": negative_prompt or "",
-                    "width": width,
-                    "height": height,
-                },
-            )
         case Text2ImgModels.dall_e.name:
             import openai
 
@@ -352,24 +338,6 @@ def img2img(
     _resolution_check(width, height)
 
     match selected_model:
-        case Img2ImgModels.jack_qiao.name:
-            out_imgs = call_gpu_server_b64(
-                endpoint=GpuEndpoints.glid_3_xl_stable,
-                input_data={
-                    "prompt": prompt,
-                    "num_inference_steps": num_inference_steps,
-                    "init_image": init_image,
-                    # "edit_image": edit_image,
-                    # "mask": mask,
-                    "num_outputs": num_outputs,
-                    "negative_prompt": negative_prompt or "",
-                    # "outpaint": "expand",
-                    "skip_timesteps": int(num_inference_steps * (1 - prompt_strength)),
-                    "width": width,
-                    "height": height,
-                    "seed": seed,
-                },
-            )
         case Img2ImgModels.dall_e.name:
             import openai
 
From cd316548d0d0d11714383836e12bbf38da6f5bf8 Mon Sep 17 00:00:00 2001
From: Dev Aggarwal
Date: Mon, 9 Oct 2023 20:46:00 +0530
Subject: [PATCH 20/20] remove wuerstchen

---
 daras_ai_v2/stable_diffusion.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/daras_ai_v2/stable_diffusion.py b/daras_ai_v2/stable_diffusion.py
index 7574b0bd4..a78e962bd 100644
--- a/daras_ai_v2/stable_diffusion.py
+++ b/daras_ai_v2/stable_diffusion.py
@@ -51,7 +51,6 @@ class Text2ImgModels(Enum):
     protogen_5_3 = "Protogen v5.3 (darkstorm2150)"
     dreamlike_2 = "Dreamlike Photoreal 2.0 (dreamlike.art)"
     dall_e = "Dall-E (OpenAI)"
-    wuerstchen = "Wuerstchen"
 
     jack_qiao = "Stable Diffusion v1.4 [Deprecated] (Jack Qiao)"
     deepfloyd_if = "DeepFloyd IF [Deprecated] (stability.ai)"
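With patches 19 and 20 applied, the removed endpoint attributes and the short-lived `wuerstchen` member should have no remaining references. A throwaway check, not part of the series (names taken from the deleted lines above; run from the repo root):

```python
import pathlib

removed = ("glid_3_xl_stable", "GpuEndpoints.sd_2", "wuerstchen")

for path in pathlib.Path(".").rglob("*.py"):
    text = path.read_text(errors="ignore")
    for name in removed:
        if name in text:
            print(f"{path}: still references {name!r}")
```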