From 38da111bdc2ad077057d9e05950fd1b58613adf0 Mon Sep 17 00:00:00 2001 From: Alexander Metzger Date: Sun, 24 Sep 2023 01:07:49 -0700 Subject: [PATCH 1/7] works on ASR --- daras_ai_v2/asr.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/daras_ai_v2/asr.py b/daras_ai_v2/asr.py index 5e88fec58..37c488528 100644 --- a/daras_ai_v2/asr.py +++ b/daras_ai_v2/asr.py @@ -23,11 +23,15 @@ TRANSLITERATION_SUPPORTED = {"ar", "bn", " gu", "hi", "ja", "kn", "ru", "ta", "te"} -# below list was found experimentally since the supported languages list by google is actually wrong: +# below CHIRP list was found experimentally since the supported languages list by google is actually wrong: CHIRP_SUPPORTED = {"af-ZA", "sq-AL", "am-ET", "ar-EG", "hy-AM", "as-IN", "ast-ES", "az-AZ", "eu-ES", "be-BY", "bs-BA", "bg-BG", "my-MM", "ca-ES", "ceb-PH", "ckb-IQ", "zh-Hans-CN", "yue-Hant-HK", "hr-HR", "cs-CZ", "da-DK", "nl-NL", "en-AU", "en-IN", "en-GB", "en-US", "et-EE", "fil-PH", "fi-FI", "fr-CA", "fr-FR", "gl-ES", "ka-GE", "de-DE", "el-GR", "gu-IN", "ha-NG", "iw-IL", "hi-IN", "hu-HU", "is-IS", "id-ID", "it-IT", "ja-JP", "jv-ID", "kea-CV", "kam-KE", "kn-IN", "kk-KZ", "km-KH", "ko-KR", "ky-KG", "lo-LA", "lv-LV", "ln-CD", "lt-LT", "luo-KE", "lb-LU", "mk-MK", "ms-MY", "ml-IN", "mt-MT", "mi-NZ", "mr-IN", "mn-MN", "ne-NP", "ny-MW", "oc-FR", "ps-AF", "fa-IR", "pl-PL", "pt-BR", "pa-Guru-IN", "ro-RO", "ru-RU", "nso-ZA", "sr-RS", "sn-ZW", "sd-IN", "si-LK", "sk-SK", "sl-SI", "so-SO", "es-ES", "es-US", "su-ID", "sw", "sv-SE", "tg-TJ", "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-PK", "uz-UZ", "vi-VN", "cy-GB", "wo-SN", "yo-NG", "zu-ZA"} # fmt: skip WHISPER_SUPPORTED = {"af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", 
"tl", "ta", "th", "tr", "uk", "ur", "vi", "cy"} # fmt: skip +# See page 14 of https://scontent-sea1-1.xx.fbcdn.net/v/t39.2365-6/369747868_602316515432698_2401716319310287708_n.pdf?_nc_cat=106&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=_5cpNOcftdYAX8rCrVo&_nc_ht=scontent-sea1-1.xx&oh=00_AfDVkx7XubifELxmB_Un-yEYMJavBHFzPnvTbTlalbd_1Q&oe=65141B39 +# For now, below are listed the languages that support ASR. Note that Seamless only accepts ISO 639-3 codes. +SEAMLESS_SUPPORTED = {"afr", "amh", "arb", "ary", "arz", "asm", "ast", "azj", "bel", "ben", "bos", "bul", "cat", "ceb", "ces", "ckb", "cmn", "cym", "dan", "deu", "ell", "eng", "est", "eus", "fin", "fra", "gaz", "gle", "glg", "guj", "heb", "hin", "hrv", "hun", "hye", "ibo", "ind", "isl", "ita", "jav", "jpn", "kam", "kan", "kat", "kaz", "kea", "khk", "khm", "kir", "kor", "lao", "lit", "ltz", "lug", "luo", "lvs", "mai", "mal", "mar", "mkd", "mlt", "mni", "mya", "nld", "nno", "nob", "npi", "nya", "oci", "ory", "pan", "pbt", "pes", "pol", "por", "ron", "rus", "slk", "slv", "sna", "snd", "som", "spa", "srp", "swe", "swh", "tam", "tel", "tgk", "tgl", "tha", "tur", "ukr", "urd", "uzn", "vie", "xho", "yor", "yue", "zlm", "zul"} # fmt: skip + class AsrModels(Enum): whisper_large_v2 = "Whisper Large v2 (openai)" @@ -38,6 +42,7 @@ class AsrModels(Enum): vakyansh_bhojpuri = "Vakyansh Bhojpuri (Open-Speech-EkStep)" usm = "Chirp / USM (Google)" deepgram = "Deepgram" + seamless = "Seamless M4T (Facebook Research)" asr_model_ids = { @@ -47,6 +52,7 @@ class AsrModels(Enum): AsrModels.vakyansh_bhojpuri: "Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60", AsrModels.nemo_english: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/english_large_data_fixed.nemo", AsrModels.nemo_hindi: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/stt_hi_conformer_ctc_large_v2.nemo", + AsrModels.seamless: "seamlessM4T_large", } forced_asr_languages = { @@ -61,6 +67,7 @@ class AsrModels(Enum): 
AsrModels.whisper_large_v2: WHISPER_SUPPORTED, AsrModels.usm: CHIRP_SUPPORTED, AsrModels.deepgram: WHISPER_SUPPORTED, + AsrModels.seamless: SEAMLESS_SUPPORTED, } @@ -117,7 +124,7 @@ def google_translate_languages() -> dict[str, str]: parent = f"projects/{project}/locations/global" client = translate.TranslationServiceClient() supported_languages = client.get_supported_languages( - parent, display_language_code="en" + parent=parent, display_language_code="en" ) return { lang.language_code: lang.display_name @@ -326,7 +333,19 @@ def run_asr( return "\n".join( f"Speaker {chunk['speaker']}: {chunk['text']}" for chunk in chunks ) - + elif selected_model == AsrModels.seamless: + data = call_celery_task( + "seamless", + pipeline=dict( + model_id=asr_model_ids[AsrModels.seamless], + ), + inputs=dict( + audio=audio_url, + task="ASR", + src_lang=language, + ), + ) + return data["text"] elif selected_model == AsrModels.usm: # note: only us-central1 and a few other regions support chirp recognizers (so global can't be used) location = "us-central1" From add7219550c987710e21fa0abff25c2f07d9ace8 Mon Sep 17 00:00:00 2001 From: Alexander Metzger Date: Sun, 24 Sep 2023 01:53:56 -0700 Subject: [PATCH 2/7] works on tts --- .../text_to_speech_settings_widgets.py | 61 +++++++++++++++++++ recipes/TextToSpeech.py | 25 +++++++- 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py index 0fc45a609..dda976dff 100644 --- a/daras_ai_v2/text_to_speech_settings_widgets.py +++ b/daras_ai_v2/text_to_speech_settings_widgets.py @@ -24,6 +24,7 @@ class TextToSpeechProviders(Enum): GOOGLE_TTS = "Google Cloud Text-to-Speech" UBERDUCK = "uberduck.ai" BARK = "Bark (suno-ai)" + SEAMLESS = "SeamlessM4T T2ST (Facebook Research)" BARK_SUPPORTED_LANGS = [ @@ -51,6 +52,47 @@ class TextToSpeechProviders(Enum): for n in range(10) } +# See page 14 of 
https://scontent-sea1-1.xx.fbcdn.net/v/t39.2365-6/369747868_602316515432698_2401716319310287708_n.pdf?_nc_cat=106&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=_5cpNOcftdYAX8rCrVo&_nc_ht=scontent-sea1-1.xx&oh=00_AfDVkx7XubifELxmB_Un-yEYMJavBHFzPnvTbTlalbd_1Q&oe=65141B39 +# For now, below are listed the languages that support T2ST. Note that Seamless only accepts ISO 639-3 codes. +SEAMLESS_SUPPORTED: dict[str, str] = { + "arb": "Modern Standard Arabic", + "ben": "Bengali", + "cat": "Catalan", + "ces": "Czech", + "cmn": "Mandarin Chinese", + "cym": "Welsch", + "dan": "Danish", + "deu": "German", + "eng": "English", + "est": "Estonian", + "fin": "Finnish", + "fra": "French", + "hin": "Hindi", + "ind": "Indonesian", + "ita": "Italian", + "jpn": "Japanese", + "kor": "Korean", + "mlt": "Maltese", + "nld": "Dutch", + "pes": "Western Persian", + "pol": "Polish", + "por": "Portuguese", + "ron": "Romanian", + "rus": "Russian", + "slk": "Slovak", + "spa": "Spanish", + "swe": "Swedish", + "swh": "Swahili", + "tel": "Telugu", + "tgl": "Tagalog", + "tha": "Thai", + "tur": "Turkish", + "ukr": "Ukrainian", + "urd": "Urdu", + "uzn": "Northern Uzbek", + "vie": "Vietnamese", +} + def text_to_speech_settings(): st.write( @@ -142,6 +184,25 @@ def text_to_speech_settings(): key="uberduck_speaking_rate", ) + case TextToSpeechProviders.SEAMLESS.name: + with col2: + st.selectbox( + label=""" + ###### Seamless Input Language + """, + key="seamless_input_language", + format_func=lambda option: SEAMLESS_SUPPORTED[option], + options=SEAMLESS_SUPPORTED.keys(), + ) + st.selectbox( + label=""" + ###### Seamless Output Language + """, + key="seamless_output_language", + format_func=lambda option: SEAMLESS_SUPPORTED[option], + options=SEAMLESS_SUPPORTED.keys(), + ) + @st.cache_data() def google_tts_voices() -> dict[texttospeech.Voice, str]: diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py index a6a85582e..927abcdb1 100644 --- a/recipes/TextToSpeech.py +++ b/recipes/TextToSpeech.py @@ -12,7 +12,7 @@ 
from daras_ai.image_input import upload_file_from_bytes, storage_blob_for from daras_ai_v2 import settings from daras_ai_v2.base import BasePage -from daras_ai_v2.gpu_server import GpuEndpoints +from daras_ai_v2.gpu_server import GpuEndpoints, call_celery_task from daras_ai_v2.loom_video_widget import youtube_video from daras_ai_v2.text_to_speech_settings_widgets import ( UBERDUCK_VOICES, @@ -40,6 +40,8 @@ class TextToSpeechPage(BasePage): "google_speaking_rate": 1.0, "uberduck_voice_name": "Aiden Botha", "uberduck_speaking_rate": 1.0, + "seamless_input_language": "eng", + "seamless_output_language": "eng", } class RequestModel(BaseModel): @@ -58,6 +60,9 @@ class RequestModel(BaseModel): bark_history_prompt: str | None + seamless_input_language: str | None + seamless_output_language: str | None + class ResponseModel(BaseModel): audio_url: str @@ -179,7 +184,23 @@ def run(self, state: dict): break else: time.sleep(0.1) - + case TextToSpeechProviders.SEAMLESS: + data = call_celery_task( + "seamless", + pipeline=dict( + model_id="seamlessM4T_large", + ), + inputs=dict( + text=text, + task="T2ST", + tgt_lang=state["seamless_output_language"], + src_lang=state["seamless_input_language"], + ), + ) + audio_url = upload_file_from_bytes( + "seamless_gen.wav", data.get("audio") + ) + state["audio_url"] = audio_url case TextToSpeechProviders.GOOGLE_TTS: voice_name = ( state["google_voice_name"] From 77996b349ab42d7caa28f283e5de7d11704449f7 Mon Sep 17 00:00:00 2001 From: Alexander Metzger Date: Tue, 26 Sep 2023 21:47:51 -0700 Subject: [PATCH 3/7] rename seamless enum name --- daras_ai_v2/asr.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/daras_ai_v2/asr.py b/daras_ai_v2/asr.py index 37c488528..d074e44ca 100644 --- a/daras_ai_v2/asr.py +++ b/daras_ai_v2/asr.py @@ -42,7 +42,7 @@ class AsrModels(Enum): vakyansh_bhojpuri = "Vakyansh Bhojpuri (Open-Speech-EkStep)" usm = "Chirp / USM (Google)" deepgram = "Deepgram" - seamless = "Seamless M4T 
(Facebook Research)" + seamless_m4t = "Seamless M4T (Facebook Research)" asr_model_ids = { @@ -52,7 +52,7 @@ class AsrModels(Enum): AsrModels.vakyansh_bhojpuri: "Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60", AsrModels.nemo_english: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/english_large_data_fixed.nemo", AsrModels.nemo_hindi: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/stt_hi_conformer_ctc_large_v2.nemo", - AsrModels.seamless: "seamlessM4T_large", + AsrModels.seamless_m4t: "seamlessM4T_large", } forced_asr_languages = { @@ -67,7 +67,7 @@ class AsrModels(Enum): AsrModels.whisper_large_v2: WHISPER_SUPPORTED, AsrModels.usm: CHIRP_SUPPORTED, AsrModels.deepgram: WHISPER_SUPPORTED, - AsrModels.seamless: SEAMLESS_SUPPORTED, + AsrModels.seamless_m4t: SEAMLESS_SUPPORTED, } @@ -333,11 +333,11 @@ def run_asr( return "\n".join( f"Speaker {chunk['speaker']}: {chunk['text']}" for chunk in chunks ) - elif selected_model == AsrModels.seamless: + elif selected_model == AsrModels.seamless_m4t: data = call_celery_task( "seamless", pipeline=dict( - model_id=asr_model_ids[AsrModels.seamless], + model_id=asr_model_ids[AsrModels.seamless_m4t], ), inputs=dict( audio=audio_url, From d169d4b36ee884d9219ed31d461d7eb9eb472fe3 Mon Sep 17 00:00:00 2001 From: Alexander Metzger Date: Tue, 26 Sep 2023 22:11:48 -0700 Subject: [PATCH 4/7] use call_celery_task_outfile --- recipes/TextToSpeech.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py index 927abcdb1..9b5bdbc51 100644 --- a/recipes/TextToSpeech.py +++ b/recipes/TextToSpeech.py @@ -12,7 +12,7 @@ from daras_ai.image_input import upload_file_from_bytes, storage_blob_for from daras_ai_v2 import settings from daras_ai_v2.base import BasePage -from daras_ai_v2.gpu_server import GpuEndpoints, call_celery_task +from daras_ai_v2.gpu_server import GpuEndpoints, call_celery_task_outfile from 
daras_ai_v2.loom_video_widget import youtube_video from daras_ai_v2.text_to_speech_settings_widgets import ( UBERDUCK_VOICES, @@ -185,7 +185,7 @@ def run(self, state: dict): else: time.sleep(0.1) case TextToSpeechProviders.SEAMLESS: - data = call_celery_task( + data = call_celery_task_outfile( "seamless", pipeline=dict( model_id="seamlessM4T_large", @@ -196,11 +196,10 @@ def run(self, state: dict): tgt_lang=state["seamless_output_language"], src_lang=state["seamless_input_language"], ), + content_type="audio/wav", + filename="seamless_gen.wav", ) - audio_url = upload_file_from_bytes( - "seamless_gen.wav", data.get("audio") - ) - state["audio_url"] = audio_url + state["audio_url"] = data[0] case TextToSpeechProviders.GOOGLE_TTS: voice_name = ( state["google_voice_name"] From e20a9bb638034301ec7b981104b9189e773474ab Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Thu, 19 Oct 2023 16:19:48 +0530 Subject: [PATCH 5/7] remove seamlessm4t from TextToSpeech --- .../text_to_speech_settings_widgets.py | 61 ------------------- recipes/TextToSpeech.py | 22 +------ 2 files changed, 1 insertion(+), 82 deletions(-) diff --git a/daras_ai_v2/text_to_speech_settings_widgets.py b/daras_ai_v2/text_to_speech_settings_widgets.py index 8c3b91fcf..137cdb3c9 100644 --- a/daras_ai_v2/text_to_speech_settings_widgets.py +++ b/daras_ai_v2/text_to_speech_settings_widgets.py @@ -26,7 +26,6 @@ class TextToSpeechProviders(Enum): ELEVEN_LABS = "Eleven Labs (Premium)" UBERDUCK = "uberduck.ai" BARK = "Bark (suno-ai)" - SEAMLESS = "SeamlessM4T T2ST (Facebook Research)" # Mapping from Eleven Labs Voice Name -> Voice ID @@ -135,47 +134,6 @@ class TextToSpeechProviders(Enum): for n in range(10) } -# See page 14 of 
https://scontent-sea1-1.xx.fbcdn.net/v/t39.2365-6/369747868_602316515432698_2401716319310287708_n.pdf?_nc_cat=106&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=_5cpNOcftdYAX8rCrVo&_nc_ht=scontent-sea1-1.xx&oh=00_AfDVkx7XubifELxmB_Un-yEYMJavBHFzPnvTbTlalbd_1Q&oe=65141B39 -# For now, below are listed the languages that support T2ST. Note that Seamless only accepts ISO 639-3 codes. -SEAMLESS_SUPPORTED: dict[str, str] = { - "arb": "Modern Standard Arabic", - "ben": "Bengali", - "cat": "Catalan", - "ces": "Czech", - "cmn": "Mandarin Chinese", - "cym": "Welsch", - "dan": "Danish", - "deu": "German", - "eng": "English", - "est": "Estonian", - "fin": "Finnish", - "fra": "French", - "hin": "Hindi", - "ind": "Indonesian", - "ita": "Italian", - "jpn": "Japanese", - "kor": "Korean", - "mlt": "Maltese", - "nld": "Dutch", - "pes": "Western Persian", - "pol": "Polish", - "por": "Portuguese", - "ron": "Romanian", - "rus": "Russian", - "slk": "Slovak", - "spa": "Spanish", - "swe": "Swedish", - "swh": "Swahili", - "tel": "Telugu", - "tgl": "Tagalog", - "tha": "Thai", - "tur": "Turkish", - "ukr": "Ukrainian", - "urd": "Urdu", - "uzn": "Northern Uzbek", - "vie": "Vietnamese", -} - def text_to_speech_settings(page=None): st.write( @@ -267,25 +225,6 @@ def text_to_speech_settings(page=None): key="uberduck_speaking_rate", ) - case TextToSpeechProviders.SEAMLESS.name: - with col2: - st.selectbox( - label=""" - ###### Seamless Input Language - """, - key="seamless_input_language", - format_func=lambda option: SEAMLESS_SUPPORTED[option], - options=SEAMLESS_SUPPORTED.keys(), - ) - st.selectbox( - label=""" - ###### Seamless Output Language - """, - key="seamless_output_language", - format_func=lambda option: SEAMLESS_SUPPORTED[option], - options=SEAMLESS_SUPPORTED.keys(), - ) - case TextToSpeechProviders.ELEVEN_LABS.name: with col2: if not ( diff --git a/recipes/TextToSpeech.py b/recipes/TextToSpeech.py index 1f24b978f..de8178cee 100644 --- a/recipes/TextToSpeech.py +++ b/recipes/TextToSpeech.py @@ -42,8 
+42,6 @@ class TextToSpeechPage(BasePage): "google_speaking_rate": 1.0, "uberduck_voice_name": "Aiden Botha", "uberduck_speaking_rate": 1.0, - "seamless_input_language": "eng", - "seamless_output_language": "eng", "elevenlabs_voice_name": "Rachel", "elevenlabs_model": "eleven_multilingual_v2", "elevenlabs_stability": 0.5, @@ -66,9 +64,6 @@ class RequestModel(BaseModel): bark_history_prompt: str | None - seamless_input_language: str | None - seamless_output_language: str | None - elevenlabs_voice_name: str | None elevenlabs_model: str | None elevenlabs_stability: float | None @@ -205,22 +200,7 @@ def run(self, state: dict): break else: time.sleep(0.1) - case TextToSpeechProviders.SEAMLESS: - data = call_celery_task_outfile( - "seamless", - pipeline=dict( - model_id="seamlessM4T_large", - ), - inputs=dict( - text=text, - task="T2ST", - tgt_lang=state["seamless_output_language"], - src_lang=state["seamless_input_language"], - ), - content_type="audio/wav", - filename="seamless_gen.wav", - ) - state["audio_url"] = data[0] + case TextToSpeechProviders.GOOGLE_TTS: voice_name = ( state["google_voice_name"] From dd19703ec06a8a1d9882323d1983bb1e8e5fc4b0 Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Mon, 23 Oct 2023 18:45:35 +0530 Subject: [PATCH 6/7] update model id for seamless asr --- daras_ai_v2/asr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/daras_ai_v2/asr.py b/daras_ai_v2/asr.py index 3d3c5b2f5..53747e76a 100644 --- a/daras_ai_v2/asr.py +++ b/daras_ai_v2/asr.py @@ -53,7 +53,7 @@ class AsrModels(Enum): AsrModels.vakyansh_bhojpuri: "Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60", AsrModels.nemo_english: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/english_large_data_fixed.nemo", AsrModels.nemo_hindi: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/stt_hi_conformer_ctc_large_v2.nemo", - AsrModels.seamless_m4t: 
"seamlessM4T_large", + AsrModels.seamless_m4t: "facebook/hf-seamless-m4t-large", } forced_asr_languages = { From 86d167b8299d2834b15ba968a2a6783c72c54a7e Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Tue, 31 Oct 2023 19:24:44 +0530 Subject: [PATCH 7/7] allow returning non-text output formats for seamless --- daras_ai_v2/asr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/daras_ai_v2/asr.py b/daras_ai_v2/asr.py index 53747e76a..4b29982ee 100644 --- a/daras_ai_v2/asr.py +++ b/daras_ai_v2/asr.py @@ -346,7 +346,6 @@ def run_asr( src_lang=language, ), ) - return data["text"] elif selected_model == AsrModels.usm: # note: only us-central1 and a few other regions support chirp recognizers (so global can't be used) location = "us-central1"