diff --git a/daras_ai_v2/asr.py b/daras_ai_v2/asr.py index dea8fa4a6..f4054d7d0 100644 --- a/daras_ai_v2/asr.py +++ b/daras_ai_v2/asr.py @@ -30,11 +30,15 @@ TRANSLITERATION_SUPPORTED = {"ar", "bn", " gu", "hi", "ja", "kn", "ru", "ta", "te"} -# below list was found experimentally since the supported languages list by google is actually wrong: +# below CHIRP list was found experimentally since the supported languages list by google is actually wrong: CHIRP_SUPPORTED = {"af-ZA", "sq-AL", "am-ET", "ar-EG", "hy-AM", "as-IN", "ast-ES", "az-AZ", "eu-ES", "be-BY", "bs-BA", "bg-BG", "my-MM", "ca-ES", "ceb-PH", "ckb-IQ", "zh-Hans-CN", "yue-Hant-HK", "hr-HR", "cs-CZ", "da-DK", "nl-NL", "en-AU", "en-IN", "en-GB", "en-US", "et-EE", "fil-PH", "fi-FI", "fr-CA", "fr-FR", "gl-ES", "ka-GE", "de-DE", "el-GR", "gu-IN", "ha-NG", "iw-IL", "hi-IN", "hu-HU", "is-IS", "id-ID", "it-IT", "ja-JP", "jv-ID", "kea-CV", "kam-KE", "kn-IN", "kk-KZ", "km-KH", "ko-KR", "ky-KG", "lo-LA", "lv-LV", "ln-CD", "lt-LT", "luo-KE", "lb-LU", "mk-MK", "ms-MY", "ml-IN", "mt-MT", "mi-NZ", "mr-IN", "mn-MN", "ne-NP", "ny-MW", "oc-FR", "ps-AF", "fa-IR", "pl-PL", "pt-BR", "pa-Guru-IN", "ro-RO", "ru-RU", "nso-ZA", "sr-RS", "sn-ZW", "sd-IN", "si-LK", "sk-SK", "sl-SI", "so-SO", "es-ES", "es-US", "su-ID", "sw", "sv-SE", "tg-TJ", "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-PK", "uz-UZ", "vi-VN", "cy-GB", "wo-SN", "yo-NG", "zu-ZA"} # fmt: skip WHISPER_SUPPORTED = {"af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl", "ta", "th", "tr", "uk", "ur", "vi", "cy"} # fmt: skip +# See page 14 of 
https://scontent-sea1-1.xx.fbcdn.net/v/t39.2365-6/369747868_602316515432698_2401716319310287708_n.pdf?_nc_cat=106&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=_5cpNOcftdYAX8rCrVo&_nc_ht=scontent-sea1-1.xx&oh=00_AfDVkx7XubifELxmB_Un-yEYMJavBHFzPnvTbTlalbd_1Q&oe=65141B39 +# For now, only the languages that support ASR are listed below. Note that Seamless only accepts ISO 639-3 codes. +SEAMLESS_SUPPORTED = {"afr", "amh", "arb", "ary", "arz", "asm", "ast", "azj", "bel", "ben", "bos", "bul", "cat", "ceb", "ces", "ckb", "cmn", "cym", "dan", "deu", "ell", "eng", "est", "eus", "fin", "fra", "gaz", "gle", "glg", "guj", "heb", "hin", "hrv", "hun", "hye", "ibo", "ind", "isl", "ita", "jav", "jpn", "kam", "kan", "kat", "kaz", "kea", "khk", "khm", "kir", "kor", "lao", "lit", "ltz", "lug", "luo", "lvs", "mai", "mal", "mar", "mkd", "mlt", "mni", "mya", "nld", "nno", "nob", "npi", "nya", "oci", "ory", "pan", "pbt", "pes", "pol", "por", "ron", "rus", "slk", "slv", "sna", "snd", "som", "spa", "srp", "swe", "swh", "tam", "tel", "tgk", "tgl", "tha", "tur", "ukr", "urd", "uzn", "vie", "xho", "yor", "yue", "zlm", "zul"} # fmt: skip + class AsrModels(Enum): whisper_large_v2 = "Whisper Large v2 (openai)" @@ -45,6 +49,7 @@ class AsrModels(Enum): vakyansh_bhojpuri = "Vakyansh Bhojpuri (Open-Speech-EkStep)" usm = "Chirp / USM (Google)" deepgram = "Deepgram" + seamless_m4t = "Seamless M4T (Facebook Research)" asr_model_ids = { @@ -54,6 +59,7 @@ class AsrModels(Enum): AsrModels.vakyansh_bhojpuri: "Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60", AsrModels.nemo_english: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/english_large_data_fixed.nemo", AsrModels.nemo_hindi: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/stt_hi_conformer_ctc_large_v2.nemo", + AsrModels.seamless_m4t: "facebook/hf-seamless-m4t-large", } forced_asr_languages = { @@ -68,6 +74,7 @@ class AsrModels(Enum): AsrModels.whisper_large_v2: WHISPER_SUPPORTED, AsrModels.usm: CHIRP_SUPPORTED, 
AsrModels.deepgram: WHISPER_SUPPORTED, + AsrModels.seamless_m4t: SEAMLESS_SUPPORTED, } @@ -367,7 +374,18 @@ def run_asr( return "\n".join( f"Speaker {chunk['speaker']}: {chunk['text']}" for chunk in chunks ) - + elif selected_model == AsrModels.seamless_m4t: + data = call_celery_task( + "seamless", + pipeline=dict( + model_id=asr_model_ids[AsrModels.seamless_m4t], + ), + inputs=dict( + audio=audio_url, + task="ASR", + src_lang=language, + ), + ) elif selected_model == AsrModels.usm: location = settings.GCP_REGION