Skip to content

Commit

Permalink
Merge pull request #190 from GooeyAI/seamlessm4t-asr
Browse files Browse the repository at this point in the history
  • Loading branch information
devxpy authored Nov 17, 2023
2 parents a388ab5 + 86d167b commit 97afb22
Showing 1 changed file with 20 additions and 2 deletions.
22 changes: 20 additions & 2 deletions daras_ai_v2/asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,15 @@

# Two-letter language codes for which transliteration is supported (going by
# the name; the code that consumes this set is outside this view).
# Entries must not carry surrounding whitespace, otherwise a membership test
# for the bare code (e.g. "gu") silently fails.
TRANSLITERATION_SUPPORTED = {"ar", "bn", "gu", "hi", "ja", "kn", "ru", "ta", "te"}

# The CHIRP list below was determined experimentally, because the list of supported languages published by Google is actually wrong:
CHIRP_SUPPORTED = {"af-ZA", "sq-AL", "am-ET", "ar-EG", "hy-AM", "as-IN", "ast-ES", "az-AZ", "eu-ES", "be-BY", "bs-BA", "bg-BG", "my-MM", "ca-ES", "ceb-PH", "ckb-IQ", "zh-Hans-CN", "yue-Hant-HK", "hr-HR", "cs-CZ", "da-DK", "nl-NL", "en-AU", "en-IN", "en-GB", "en-US", "et-EE", "fil-PH", "fi-FI", "fr-CA", "fr-FR", "gl-ES", "ka-GE", "de-DE", "el-GR", "gu-IN", "ha-NG", "iw-IL", "hi-IN", "hu-HU", "is-IS", "id-ID", "it-IT", "ja-JP", "jv-ID", "kea-CV", "kam-KE", "kn-IN", "kk-KZ", "km-KH", "ko-KR", "ky-KG", "lo-LA", "lv-LV", "ln-CD", "lt-LT", "luo-KE", "lb-LU", "mk-MK", "ms-MY", "ml-IN", "mt-MT", "mi-NZ", "mr-IN", "mn-MN", "ne-NP", "ny-MW", "oc-FR", "ps-AF", "fa-IR", "pl-PL", "pt-BR", "pa-Guru-IN", "ro-RO", "ru-RU", "nso-ZA", "sr-RS", "sn-ZW", "sd-IN", "si-LK", "sk-SK", "sl-SI", "so-SO", "es-ES", "es-US", "su-ID", "sw", "sv-SE", "tg-TJ", "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-PK", "uz-UZ", "vi-VN", "cy-GB", "wo-SN", "yo-NG", "zu-ZA"} # fmt: skip

# Two-letter language codes listed for Whisper. The per-model supported-languages
# mapping further down this file uses this same set for Deepgram as well.
WHISPER_SUPPORTED = {"af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl", "ta", "th", "tr", "uk", "ur", "vi", "cy"} # fmt: skip

# See page 14 of https://scontent-sea1-1.xx.fbcdn.net/v/t39.2365-6/369747868_602316515432698_2401716319310287708_n.pdf?_nc_cat=106&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=_5cpNOcftdYAX8rCrVo&_nc_ht=scontent-sea1-1.xx&oh=00_AfDVkx7XubifELxmB_Un-yEYMJavBHFzPnvTbTlalbd_1Q&oe=65141B39
# NOTE(review): the link above looks like a time-limited CDN URL and may stop resolving - consider citing a stable source.
# For now, only the languages that support ASR are listed below. Note that Seamless accepts ISO 639-3 codes only.
SEAMLESS_SUPPORTED = {"afr", "amh", "arb", "ary", "arz", "asm", "ast", "azj", "bel", "ben", "bos", "bul", "cat", "ceb", "ces", "ckb", "cmn", "cym", "dan", "deu", "ell", "eng", "est", "eus", "fin", "fra", "gaz", "gle", "glg", "guj", "heb", "hin", "hrv", "hun", "hye", "ibo", "ind", "isl", "ita", "jav", "jpn", "kam", "kan", "kat", "kaz", "kea", "khk", "khm", "kir", "kor", "lao", "lit", "ltz", "lug", "luo", "lvs", "mai", "mal", "mar", "mkd", "mlt", "mni", "mya", "nld", "nno", "nob", "npi", "nya", "oci", "ory", "pan", "pbt", "pes", "pol", "por", "ron", "rus", "slk", "slv", "sna", "snd", "som", "spa", "srp", "swe", "swh", "tam", "tel", "tgk", "tgl", "tha", "tur", "ukr", "urd", "uzn", "vie", "xho", "yor", "yue", "zlm", "zul"} # fmt: skip


class AsrModels(Enum):
whisper_large_v2 = "Whisper Large v2 (openai)"
Expand All @@ -45,6 +49,7 @@ class AsrModels(Enum):
vakyansh_bhojpuri = "Vakyansh Bhojpuri (Open-Speech-EkStep)"
usm = "Chirp / USM (Google)"
deepgram = "Deepgram"
seamless_m4t = "Seamless M4T (Facebook Research)"


asr_model_ids = {
Expand All @@ -54,6 +59,7 @@ class AsrModels(Enum):
AsrModels.vakyansh_bhojpuri: "Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60",
AsrModels.nemo_english: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/english_large_data_fixed.nemo",
AsrModels.nemo_hindi: "https://objectstore.e2enetworks.net/indic-asr-public/checkpoints/conformer/stt_hi_conformer_ctc_large_v2.nemo",
AsrModels.seamless_m4t: "facebook/hf-seamless-m4t-large",
}

forced_asr_languages = {
Expand All @@ -68,6 +74,7 @@ class AsrModels(Enum):
AsrModels.whisper_large_v2: WHISPER_SUPPORTED,
AsrModels.usm: CHIRP_SUPPORTED,
AsrModels.deepgram: WHISPER_SUPPORTED,
AsrModels.seamless_m4t: SEAMLESS_SUPPORTED,
}


Expand Down Expand Up @@ -367,7 +374,18 @@ def run_asr(
return "\n".join(
f"Speaker {chunk['speaker']}: {chunk['text']}" for chunk in chunks
)

elif selected_model == AsrModels.seamless_m4t:
data = call_celery_task(
"seamless",
pipeline=dict(
model_id=asr_model_ids[AsrModels.seamless_m4t],
),
inputs=dict(
audio=audio_url,
task="ASR",
src_lang=language,
),
)
elif selected_model == AsrModels.usm:
location = settings.GCP_REGION

Expand Down

0 comments on commit 97afb22

Please sign in to comment.