Seamless M4T on ASR and TTS #168

Closed · wants to merge 4 commits
25 changes: 22 additions & 3 deletions daras_ai_v2/asr.py
@@ -23,11 +23,15 @@

TRANSLITERATION_SUPPORTED = {"ar", "bn", "gu", "hi", "ja", "kn", "ru", "ta", "te"}

# below list was found experimentally since the supported languages list by google is actually wrong:
# the CHIRP list below was found experimentally, since Google's published list of supported languages is actually wrong:
CHIRP_SUPPORTED = {"af-ZA", "sq-AL", "am-ET", "ar-EG", "hy-AM", "as-IN", "ast-ES", "az-AZ", "eu-ES", "be-BY", "bs-BA", "bg-BG", "my-MM", "ca-ES", "ceb-PH", "ckb-IQ", "zh-Hans-CN", "yue-Hant-HK", "hr-HR", "cs-CZ", "da-DK", "nl-NL", "en-AU", "en-IN", "en-GB", "en-US", "et-EE", "fil-PH", "fi-FI", "fr-CA", "fr-FR", "gl-ES", "ka-GE", "de-DE", "el-GR", "gu-IN", "ha-NG", "iw-IL", "hi-IN", "hu-HU", "is-IS", "id-ID", "it-IT", "ja-JP", "jv-ID", "kea-CV", "kam-KE", "kn-IN", "kk-KZ", "km-KH", "ko-KR", "ky-KG", "lo-LA", "lv-LV", "ln-CD", "lt-LT", "luo-KE", "lb-LU", "mk-MK", "ms-MY", "ml-IN", "mt-MT", "mi-NZ", "mr-IN", "mn-MN", "ne-NP", "ny-MW", "oc-FR", "ps-AF", "fa-IR", "pl-PL", "pt-BR", "pa-Guru-IN", "ro-RO", "ru-RU", "nso-ZA", "sr-RS", "sn-ZW", "sd-IN", "si-LK", "sk-SK", "sl-SI", "so-SO", "es-ES", "es-US", "su-ID", "sw", "sv-SE", "tg-TJ", "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-PK", "uz-UZ", "vi-VN", "cy-GB", "wo-SN", "yo-NG", "zu-ZA"} # fmt: skip

WHISPER_SUPPORTED = {"af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl", "ta", "th", "tr", "uk", "ur", "vi", "cy"} # fmt: skip

# See page 14 of https://scontent-sea1-1.xx.fbcdn.net/v/t39.2365-6/369747868_602316515432698_2401716319310287708_n.pdf?_nc_cat=106&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=_5cpNOcftdYAX8rCrVo&_nc_ht=scontent-sea1-1.xx&oh=00_AfDVkx7XubifELxmB_Un-yEYMJavBHFzPnvTbTlalbd_1Q&oe=65141B39
# For now, only the languages that support ASR are listed below. Note that Seamless accepts only ISO 639-3 codes.
SEAMLESS_SUPPORTED = {"afr", "amh", "arb", "ary", "arz", "asm", "ast", "azj", "bel", "ben", "bos", "bul", "cat", "ceb", "ces", "ckb", "cmn", "cym", "dan", "deu", "ell", "eng", "est", "eus", "fin", "fra", "gaz", "gle", "glg", "guj", "heb", "hin", "hrv", "hun", "hye", "ibo", "ind", "isl", "ita", "jav", "jpn", "kam", "kan", "kat", "kaz", "kea", "khk", "khm", "kir", "kor", "lao", "lit", "ltz", "lug", "luo", "lvs", "mai", "mal", "mar", "mkd", "mlt", "mni", "mya", "nld", "nno", "nob", "npi", "nya", "oci", "ory", "pan", "pbt", "pes", "pol", "por", "ron", "rus", "slk", "slv", "sna", "snd", "som", "spa", "srp", "swe", "swh", "tam", "tel", "tgk", "tgl", "tha", "tur", "ukr", "urd", "uzn", "vie", "xho", "yor", "yue", "zlm", "zul"} # fmt: skip
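# Hedged reviewer sketch, not part of this PR: the rest of asr.py mostly deals in
# ISO 639-1 codes, so a helper along these lines could map them onto the ISO 639-3
# codes Seamless expects. It assumes pycountry is available and deliberately does
# not resolve macrolanguages (e.g. "zh" yields "zho", while Seamless wants "cmn").
import pycountry  # assumed dependency, not added by this diff


def seamless_lang_or_none(code: str) -> str | None:
    lang = pycountry.languages.get(alpha_2=code.split("-")[0].lower())
    if lang and lang.alpha_3 in SEAMLESS_SUPPORTED:
        return lang.alpha_3  # e.g. "hi" -> "hin"
    return None  # caller should fall back to another ASR model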


class AsrModels(Enum):
whisper_large_v2 = "Whisper Large v2 (openai)"
@@ -38,6 +42,7 @@ class AsrModels(Enum):
vakyansh_bhojpuri = "Vakyansh Bhojpuri (Open-Speech-EkStep)"
usm = "Chirp / USM (Google)"
deepgram = "Deepgram"
seamless_m4t = "Seamless M4T (Facebook Research)"


asr_model_ids = {
@@ -47,6 +52,7 @@ class AsrModels(Enum):
AsrModels.vakyansh_bhojpuri: "Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60",
AsrModels.nemo_english: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/english_large_data_fixed.nemo",
AsrModels.nemo_hindi: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/stt_hi_conformer_ctc_large_v2.nemo",
AsrModels.seamless_m4t: "seamlessM4T_large",
}

forced_asr_languages = {
@@ -61,6 +67,7 @@ class AsrModels(Enum):
AsrModels.whisper_large_v2: WHISPER_SUPPORTED,
AsrModels.usm: CHIRP_SUPPORTED,
AsrModels.deepgram: WHISPER_SUPPORTED,
AsrModels.seamless_m4t: SEAMLESS_SUPPORTED,
}


@@ -117,7 +124,7 @@ def google_translate_languages() -> dict[str, str]:
parent = f"projects/{project}/locations/global"
client = translate.TranslationServiceClient()
supported_languages = client.get_supported_languages(
parent, display_language_code="en"
parent=parent, display_language_code="en"
)
return {
lang.language_code: lang.display_name
@@ -326,7 +333,19 @@ def run_asr(
return "\n".join(
f"Speaker {chunk['speaker']}: {chunk['text']}" for chunk in chunks
)

elif selected_model == AsrModels.seamless_m4t:
data = call_celery_task(
"seamless",
pipeline=dict(
model_id=asr_model_ids[AsrModels.seamless_m4t],
),
inputs=dict(
audio=audio_url,
task="ASR",
src_lang=language,
),
)
return data["text"]
elif selected_model == AsrModels.usm:
# note: only us-central1 and a few other regions support chirp recognizers (so global can't be used)
location = "us-central1"
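For context, a hedged usage sketch of the new Seamless ASR branch (reviewer illustration, not part of the diff). The exact run_asr parameter names, and whether selected_model takes the enum member or its .name, are not visible in this hunk, so treat them as assumptions:

from daras_ai_v2.asr import AsrModels, run_asr

transcript = run_asr(
    "https://example.com/sample.wav",  # hypothetical audio URL
    AsrModels.seamless_m4t,            # possibly AsrModels.seamless_m4t.name instead
    language="hin",                    # ISO 639-3, per SEAMLESS_SUPPORTED
)
print(transcript)  # the Seamless branch returns data["text"] verbatim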
61 changes: 61 additions & 0 deletions daras_ai_v2/text_to_speech_settings_widgets.py
@@ -24,6 +24,7 @@ class TextToSpeechProviders(Enum):
GOOGLE_TTS = "Google Cloud Text-to-Speech"
UBERDUCK = "uberduck.ai"
BARK = "Bark (suno-ai)"
SEAMLESS = "SeamlessM4T T2ST (Facebook Research)"


BARK_SUPPORTED_LANGS = [
@@ -51,6 +52,47 @@ class TextToSpeechProviders(Enum):
for n in range(10)
}

# See page 14 of https://scontent-sea1-1.xx.fbcdn.net/v/t39.2365-6/369747868_602316515432698_2401716319310287708_n.pdf?_nc_cat=106&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=_5cpNOcftdYAX8rCrVo&_nc_ht=scontent-sea1-1.xx&oh=00_AfDVkx7XubifELxmB_Un-yEYMJavBHFzPnvTbTlalbd_1Q&oe=65141B39
# For now, only the languages that support T2ST (text-to-speech translation) are listed below. Note that Seamless accepts only ISO 639-3 codes.
SEAMLESS_SUPPORTED: dict[str, str] = {
"arb": "Modern Standard Arabic",
"ben": "Bengali",
"cat": "Catalan",
"ces": "Czech",
"cmn": "Mandarin Chinese",
"cym": "Welsch",
"dan": "Danish",
"deu": "German",
"eng": "English",
"est": "Estonian",
"fin": "Finnish",
"fra": "French",
"hin": "Hindi",
"ind": "Indonesian",
"ita": "Italian",
"jpn": "Japanese",
"kor": "Korean",
"mlt": "Maltese",
"nld": "Dutch",
"pes": "Western Persian",
"pol": "Polish",
"por": "Portuguese",
"ron": "Romanian",
"rus": "Russian",
"slk": "Slovak",
"spa": "Spanish",
"swe": "Swedish",
"swh": "Swahili",
"tel": "Telugu",
"tgl": "Tagalog",
"tha": "Thai",
"tur": "Turkish",
"ukr": "Ukrainian",
"urd": "Urdu",
"uzn": "Northern Uzbek",
"vie": "Vietnamese",
}
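# Hedged reviewer sketch, not part of this PR: a small guard (hypothetical name)
# that the run() step could reuse so a stale or invalid session value fails loudly
# before the GPU task is dispatched.
def validate_seamless_lang(code: str) -> str:
    if code not in SEAMLESS_SUPPORTED:
        raise ValueError(
            f"{code!r} is not a Seamless T2ST language code; "
            f"expected one of: {', '.join(sorted(SEAMLESS_SUPPORTED))}"
        )
    return code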


def text_to_speech_settings():
st.write(
@@ -142,6 +184,25 @@ def text_to_speech_settings():
key="uberduck_speaking_rate",
)

case TextToSpeechProviders.SEAMLESS.name:
with col2:
st.selectbox(
label="""
###### Seamless Input Language
""",
key="seamless_input_language",
format_func=lambda option: SEAMLESS_SUPPORTED[option],
options=SEAMLESS_SUPPORTED.keys(),
)
st.selectbox(
label="""
###### Seamless Output Language
""",
key="seamless_output_language",
format_func=lambda option: SEAMLESS_SUPPORTED[option],
options=SEAMLESS_SUPPORTED.keys(),
)


@st.cache_data()
def google_tts_voices() -> dict[texttospeech.Voice, str]:
24 changes: 22 additions & 2 deletions recipes/TextToSpeech.py
@@ -12,7 +12,7 @@
from daras_ai.image_input import upload_file_from_bytes, storage_blob_for
from daras_ai_v2 import settings
from daras_ai_v2.base import BasePage
from daras_ai_v2.gpu_server import GpuEndpoints
from daras_ai_v2.gpu_server import GpuEndpoints, call_celery_task_outfile
from daras_ai_v2.loom_video_widget import youtube_video
from daras_ai_v2.text_to_speech_settings_widgets import (
UBERDUCK_VOICES,
@@ -40,6 +40,8 @@ class TextToSpeechPage(BasePage):
"google_speaking_rate": 1.0,
"uberduck_voice_name": "Aiden Botha",
"uberduck_speaking_rate": 1.0,
"seamless_input_language": "eng",
"seamless_output_language": "eng",
}

class RequestModel(BaseModel):
@@ -58,6 +60,9 @@ class RequestModel(BaseModel):

bark_history_prompt: str | None

seamless_input_language: str | None
seamless_output_language: str | None

class ResponseModel(BaseModel):
audio_url: str

@@ -179,7 +184,22 @@ def run(self, state: dict):
break
else:
time.sleep(0.1)

case TextToSpeechProviders.SEAMLESS:
data = call_celery_task_outfile(
"seamless",
pipeline=dict(
model_id="seamlessM4T_large",
),
inputs=dict(
text=text,
task="T2ST",
tgt_lang=state["seamless_output_language"],
src_lang=state["seamless_input_language"],
),
content_type="audio/wav",
filename="seamless_gen.wav",
)
state["audio_url"] = data[0]
case TextToSpeechProviders.GOOGLE_TTS:
voice_name = (
state["google_voice_name"]
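A hedged follow-up sketch (reviewer illustration, not part of the diff): the SEAMLESS branch above leaves a WAV URL in state["audio_url"], which can be sanity-checked after a run roughly like this. The URL shown is hypothetical; in practice it comes from call_celery_task_outfile.

import io
import wave

import requests

# In practice this would be state["audio_url"] populated by the run() above.
audio_url = "https://storage.example.com/seamless_gen.wav"  # hypothetical
resp = requests.get(audio_url)
resp.raise_for_status()
with wave.open(io.BytesIO(resp.content)) as wav:
    duration = wav.getnframes() / wav.getframerate()
print(f"Seamless T2ST produced {duration:.1f}s of audio")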