diff --git a/app.py b/app.py index 0f2d9ddf..ea094c50 100644 --- a/app.py +++ b/app.py @@ -1,11 +1,13 @@ import os import argparse import gradio as gr +from gradio_i18n import Translate, gettext as _ import yaml from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR, INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH, - UVR_MODELS_DIR) + UVR_MODELS_DIR, I18N_YAML_PATH) +from modules.utils.constants import AUTOMATIC_DETECTION from modules.utils.files_manager import load_yaml from modules.whisper.whisper_factory import WhisperFactory from modules.whisper.faster_whisper_inference import FasterWhisperInference @@ -22,6 +24,7 @@ class App: def __init__(self, args): self.args = args self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600)) + self.i18n = Translate(I18N_YAML_PATH) self.whisper_inf = WhisperFactory.create_whisper_inference( whisper_type=self.args.whisper_type, whisper_model_dir=self.args.whisper_model_dir, @@ -38,8 +41,8 @@ def __init__(self, args): output_dir=os.path.join(self.args.output_dir, "translations") ) self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH) - print(f"Use \"{self.args.whisper_type}\" implementation") - print(f"Device \"{self.whisper_inf.device}\" is detected") + print(f"Use \"{self.args.whisper_type}\" implementation\n" + f"Device \"{self.whisper_inf.device}\" is detected") def create_whisper_parameters(self): whisper_params = self.default_params["whisper"] @@ -49,23 +52,28 @@ def create_whisper_parameters(self): with gr.Row(): dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"], - label="Model") - dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs, - value=whisper_params["lang"], label="Language") - dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format") + label=_("Model")) + dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION], + value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap() + else whisper_params["lang"], label=_("Language")) + dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label=_("File Format")) with gr.Row(): - cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English?", + cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"), interactive=True) with gr.Row(): - cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add a timestamp to the end of the filename", + cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], + label=_("Add a timestamp to the end of the filename"), interactive=True) - with gr.Accordion("Advanced Parameters", open=False): - nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True, + with gr.Accordion(_("Advanced Parameters"), open=False): + nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, + interactive=True, info="Beam size to use for decoding.") - nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=whisper_params["log_prob_threshold"], interactive=True, + nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", + value=whisper_params["log_prob_threshold"], interactive=True, info="If the average log probability over sampled tokens is below this value, 
treat as failed.") - nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], interactive=True, + nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], + interactive=True, info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.") dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True, @@ -75,10 +83,12 @@ def create_whisper_parameters(self): info="Number of candidates when sampling with non-zero temperature.") nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True, info="Beam search patience factor.") - cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=whisper_params["condition_on_previous_text"], + cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", + value=whisper_params["condition_on_previous_text"], interactive=True, info="Condition on previous text during decoding.") - sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=whisper_params["prompt_reset_on_temperature"], + sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", + value=whisper_params["prompt_reset_on_temperature"], minimum=0, maximum=1, step=0.01, interactive=True, info="Resets prompt if temperature is above this value." " Arg has effect only if 'Condition On Previous Text' is True.") @@ -87,7 +97,8 @@ def create_whisper_parameters(self): sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0, step=0.01, maximum=1.0, interactive=True, info="Temperature for sampling. 
It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.") - nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"], + nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", + value=whisper_params["compression_ratio_threshold"], interactive=True, info="If the gzip compression ratio is above this value, treat as failed.") nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"], @@ -96,9 +107,11 @@ def create_whisper_parameters(self): with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)): nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"], info="Exponential length penalty constant.") - nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"], + nb_repetition_penalty = gr.Number(label="Repetition Penalty", + value=whisper_params["repetition_penalty"], info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).") - nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"], + nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", + value=whisper_params["no_repeat_ngram_size"], precision=0, info="Prevent repetitions of n-grams with this size (set 0 to disable).") tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"], @@ -107,48 +120,55 @@ def create_whisper_parameters(self): info="Suppress blank outputs at the beginning of the sampling.") tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"], info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.") - nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"], + nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", + value=whisper_params["max_initial_timestamp"], info="The initial timestamp cannot be later than this.") cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"], info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.") - tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"], + tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", + value=whisper_params["prepend_punctuations"], info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.") - tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"], + tb_append_punctuations = gr.Textbox(label="Append Punctuations", + value=whisper_params["append_punctuations"], info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.") nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"], precision=0, info="Maximum number of new tokens to generate per-chunk. 
If not set, the maximum will be set by the default max_length.") nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)", - value=lambda: whisper_params["hallucination_silence_threshold"], + value=lambda: whisper_params[ + "hallucination_silence_threshold"], info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.") tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"], info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.") - nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"], + nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", + value=lambda: whisper_params[ + "language_detection_threshold"], info="If the maximum probability of the language tokens is higher than this value, the language is detected.") - nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"], + nb_language_detection_segments = gr.Number(label="Language Detection Segments", + value=lambda: whisper_params["language_detection_segments"], precision=0, info="Number of segments to consider for the language detection.") with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)): nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0) - with gr.Accordion("Background Music Remover Filter", open=False): - cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"], + with gr.Accordion(_("Background Music Remover Filter"), open=False): + cb_bgm_separation = gr.Checkbox(label=_("Enable Background Music Remover Filter"), + value=uvr_params["is_separate_bgm"], interactive=True, - info="Enabling this will remove background music by submodel before" - " transcribing ") - dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device, + info=_("Enabling this will remove background music")) + dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device, choices=self.whisper_inf.music_separator.available_devices) - dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"], + dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"], choices=self.whisper_inf.music_separator.available_models) nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0) - cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"]) - cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music", + cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"), value=uvr_params["save_file"]) + cb_uvr_enable_offload = gr.Checkbox(label=_("Offload sub model after removing background music"), value=uvr_params["enable_offload"]) - with gr.Accordion("Voice Detection Filter", open=False): - cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"], + with gr.Accordion(_("Voice Detection Filter"), open=False): + cb_vad_filter = gr.Checkbox(label=_("Enable Silero VAD Filter"), value=vad_params["vad_filter"], interactive=True, - info="Enable this to transcribe only detected voice parts by submodel.") + 
info=_("Enable this to transcribe only detected voice")) sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=vad_params["threshold"], info="Lower it to be more sensitive to small sounds.") @@ -165,15 +185,11 @@ def create_whisper_parameters(self): nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"], info="Final speech chunks are padded by this time each side") - with gr.Accordion("Diarization", open=False): - cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"]) - tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"], - info="This is only needed the first time you download the model. If you already have" - " models, you don't need to enter. To download the model, you must manually go " - "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and " - "\"https://huggingface.co/pyannote/segmentation-3.0\" and agree to" - " their requirement.") - dd_diarization_device = gr.Dropdown(label="Device", + with gr.Accordion(_("Diarization"), open=False): + cb_diarize = gr.Checkbox(label=_("Enable Diarization"), value=diarization_params["is_diarize"]) + tb_hf_token = gr.Text(label=_("HuggingFace Token"), value=diarization_params["hf_token"], + info=_("This is only needed the first time you download the model")) + dd_diarization_device = gr.Dropdown(label=_("Device"), choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device()) @@ -213,179 +229,191 @@ def launch(self): uvr_params = self.default_params["bgm_separation"] with self.app: - with gr.Row(): - with gr.Column(): - gr.Markdown(MARKDOWN, elem_id="md_project") - with gr.Tabs(): - with gr.TabItem("File"): # tab1 + with self.i18n: + with gr.Row(): with gr.Column(): - input_file = gr.Files(type="filepath", label="Upload File here") - tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)", - info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them." 
- " Leave this field empty if you do not wish to use a local path.", - visible=self.args.colab, - value="") - - whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters() - - with gr.Row(): - btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary") - with gr.Row(): - tb_indicator = gr.Textbox(label="Output", scale=5) - files_subtitles = gr.Files(label="Downloadable output file", scale=3, interactive=False) - btn_openfolder = gr.Button('πŸ“‚', scale=1) - - params = [input_file, tb_input_folder, dd_file_format, cb_timestamp] - btn_run.click(fn=self.whisper_inf.transcribe_file, - inputs=params + whisper_params.as_list(), - outputs=[tb_indicator, files_subtitles]) - btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None) - - with gr.TabItem("Youtube"): # tab2 - with gr.Row(): - tb_youtubelink = gr.Textbox(label="Youtube Link") - with gr.Row(equal_height=True): + gr.Markdown(MARKDOWN, elem_id="md_project") + with gr.Tabs(): + with gr.TabItem(_("File")): # tab1 with gr.Column(): - img_thumbnail = gr.Image(label="Youtube Thumbnail") - with gr.Column(): - tb_title = gr.Label(label="Youtube Title") - tb_description = gr.Textbox(label="Youtube Description", max_lines=15) + input_file = gr.Files(type="filepath", label=_("Upload File here")) + tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)", + info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them." + " Leave this field empty if you do not wish to use a local path.", + visible=self.args.colab, + value="") - whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters() + whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters() - with gr.Row(): - btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary") - with gr.Row(): - tb_indicator = gr.Textbox(label="Output", scale=5) - files_subtitles = gr.Files(label="Downloadable output file", scale=3) - btn_openfolder = gr.Button('πŸ“‚', scale=1) + with gr.Row(): + btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary") + with gr.Row(): + tb_indicator = gr.Textbox(label=_("Output"), scale=5) + files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3, interactive=False) + btn_openfolder = gr.Button('πŸ“‚', scale=1) - params = [tb_youtubelink, dd_file_format, cb_timestamp] + params = [input_file, tb_input_folder, dd_file_format, cb_timestamp] + btn_run.click(fn=self.whisper_inf.transcribe_file, + inputs=params + whisper_params.as_list(), + outputs=[tb_indicator, files_subtitles]) + btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None) - btn_run.click(fn=self.whisper_inf.transcribe_youtube, - inputs=params + whisper_params.as_list(), - outputs=[tb_indicator, files_subtitles]) - tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink], - outputs=[img_thumbnail, tb_title, tb_description]) - btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None) + with gr.TabItem(_("Youtube")): # tab2 + with gr.Row(): + tb_youtubelink = gr.Textbox(label=_("Youtube Link")) + with gr.Row(equal_height=True): + with gr.Column(): + img_thumbnail = gr.Image(label=_("Youtube Thumbnail")) + with gr.Column(): + tb_title = gr.Label(label=_("Youtube Title")) + tb_description = gr.Textbox(label=_("Youtube Description"), max_lines=15) - with gr.TabItem("Mic"): # tab3 - with gr.Row(): - mic_input = gr.Microphone(label="Record with Mic", 
type="filepath", interactive=True) + whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters() - whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters() + with gr.Row(): + btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary") + with gr.Row(): + tb_indicator = gr.Textbox(label=_("Output"), scale=5) + files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3) + btn_openfolder = gr.Button('πŸ“‚', scale=1) - with gr.Row(): - btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary") - with gr.Row(): - tb_indicator = gr.Textbox(label="Output", scale=5) - files_subtitles = gr.Files(label="Downloadable output file", scale=3) - btn_openfolder = gr.Button('πŸ“‚', scale=1) + params = [tb_youtubelink, dd_file_format, cb_timestamp] - params = [mic_input, dd_file_format, cb_timestamp] + btn_run.click(fn=self.whisper_inf.transcribe_youtube, + inputs=params + whisper_params.as_list(), + outputs=[tb_indicator, files_subtitles]) + tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink], + outputs=[img_thumbnail, tb_title, tb_description]) + btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None) - btn_run.click(fn=self.whisper_inf.transcribe_mic, - inputs=params + whisper_params.as_list(), - outputs=[tb_indicator, files_subtitles]) - btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None) + with gr.TabItem(_("Mic")): # tab3 + with gr.Row(): + mic_input = gr.Microphone(label=_("Record with Mic"), type="filepath", interactive=True) - with gr.TabItem("T2T Translation"): # tab 4 - with gr.Row(): - file_subs = gr.Files(type="filepath", label="Upload Subtitle Files to translate here") + whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters() - with gr.TabItem("DeepL API"): # sub tab1 - with gr.Row(): - tb_api_key = gr.Textbox(label="Your Auth Key (API KEY)", value=deepl_params["api_key"]) - with gr.Row(): - dd_source_lang = gr.Dropdown(label="Source Language", value=deepl_params["source_lang"], - choices=list(self.deepl_api.available_source_langs.keys())) - dd_target_lang = gr.Dropdown(label="Target Language", value=deepl_params["target_lang"], - choices=list(self.deepl_api.available_target_langs.keys())) with gr.Row(): - cb_is_pro = gr.Checkbox(label="Pro User?", value=deepl_params["is_pro"]) + btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary") with gr.Row(): - cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename", - interactive=True) - with gr.Row(): - btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary") - with gr.Row(): - tb_indicator = gr.Textbox(label="Output", scale=5) - files_subtitles = gr.Files(label="Downloadable output file", scale=3) + tb_indicator = gr.Textbox(label=_("Output"), scale=5) + files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3) btn_openfolder = gr.Button('πŸ“‚', scale=1) - btn_run.click(fn=self.deepl_api.translate_deepl, - inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang, - cb_is_pro, cb_timestamp], - outputs=[tb_indicator, files_subtitles]) + params = [mic_input, dd_file_format, cb_timestamp] - btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")), - inputs=None, - outputs=None) + btn_run.click(fn=self.whisper_inf.transcribe_mic, + inputs=params + whisper_params.as_list(), + outputs=[tb_indicator, files_subtitles]) + 
btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None) - with gr.TabItem("NLLB"): # sub tab2 - with gr.Row(): - dd_model_size = gr.Dropdown(label="Model", value=nllb_params["model_size"], - choices=self.nllb_inf.available_models) - dd_source_lang = gr.Dropdown(label="Source Language", value=nllb_params["source_lang"], - choices=self.nllb_inf.available_source_langs) - dd_target_lang = gr.Dropdown(label="Target Language", value=nllb_params["target_lang"], - choices=self.nllb_inf.available_target_langs) - with gr.Row(): - nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"], - precision=0) - with gr.Row(): - cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename", - interactive=True) + with gr.TabItem(_("T2T Translation")): # tab 4 with gr.Row(): - btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary") - with gr.Row(): - tb_indicator = gr.Textbox(label="Output", scale=5) - files_subtitles = gr.Files(label="Downloadable output file", scale=3) - btn_openfolder = gr.Button('πŸ“‚', scale=1) + file_subs = gr.Files(type="filepath", label=_("Upload Subtitle Files to translate here")) + + with gr.TabItem(_("DeepL API")): # sub tab1 + with gr.Row(): + tb_api_key = gr.Textbox(label=_("Your Auth Key (API KEY)"), + value=deepl_params["api_key"]) + with gr.Row(): + dd_source_lang = gr.Dropdown(label=_("Source Language"), + value=AUTOMATIC_DETECTION if deepl_params["source_lang"] == AUTOMATIC_DETECTION.unwrap() + else deepl_params["source_lang"], + choices=list(self.deepl_api.available_source_langs.keys())) + dd_target_lang = gr.Dropdown(label=_("Target Language"), + value=deepl_params["target_lang"], + choices=list(self.deepl_api.available_target_langs.keys())) + with gr.Row(): + cb_is_pro = gr.Checkbox(label=_("Pro User?"), value=deepl_params["is_pro"]) + with gr.Row(): + cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], + label=_("Add a timestamp to the end of the filename"), + interactive=True) + with gr.Row(): + btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary") + with gr.Row(): + tb_indicator = gr.Textbox(label=_("Output"), scale=5) + files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3) + btn_openfolder = gr.Button('πŸ“‚', scale=1) + + btn_run.click(fn=self.deepl_api.translate_deepl, + inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang, + cb_is_pro, cb_timestamp], + outputs=[tb_indicator, files_subtitles]) + + btn_openfolder.click( + fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")), + inputs=None, + outputs=None) + + with gr.TabItem(_("NLLB")): # sub tab2 + with gr.Row(): + dd_model_size = gr.Dropdown(label=_("Model"), value=nllb_params["model_size"], + choices=self.nllb_inf.available_models) + dd_source_lang = gr.Dropdown(label=_("Source Language"), + value=nllb_params["source_lang"], + choices=self.nllb_inf.available_source_langs) + dd_target_lang = gr.Dropdown(label=_("Target Language"), + value=nllb_params["target_lang"], + choices=self.nllb_inf.available_target_langs) + with gr.Row(): + nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"], + precision=0) + with gr.Row(): + cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], + label=_("Add a timestamp to the end of the filename"), + interactive=True) + with gr.Row(): + btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary") + with gr.Row(): + 
tb_indicator = gr.Textbox(label=_("Output"), scale=5) + files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3) + btn_openfolder = gr.Button('πŸ“‚', scale=1) + with gr.Column(): + md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table") + + btn_run.click(fn=self.nllb_inf.translate_file, + inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang, + nb_max_length, cb_timestamp], + outputs=[tb_indicator, files_subtitles]) + + btn_openfolder.click( + fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")), + inputs=None, + outputs=None) + + with gr.TabItem(_("BGM Separation")): + files_audio = gr.Files(type="filepath", label=_("Upload Audio Files to separate background music")) + dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device, + choices=self.whisper_inf.music_separator.available_devices) + dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"], + choices=self.whisper_inf.music_separator.available_models) + nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], + precision=0) + cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"), + value=True, visible=False) + btn_run = gr.Button(_("SEPARATE BACKGROUND MUSIC"), variant="primary") with gr.Column(): - md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table") - - btn_run.click(fn=self.nllb_inf.translate_file, - inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang, - nb_max_length, cb_timestamp], - outputs=[tb_indicator, files_subtitles]) - - btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")), - inputs=None, - outputs=None) - - with gr.TabItem("BGM Separation"): - files_audio = gr.Files(type="filepath", label="Upload Audio Files to separate background music") - dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device, - choices=self.whisper_inf.music_separator.available_devices) - dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"], - choices=self.whisper_inf.music_separator.available_models) - nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0) - cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", - value=True, visible=False) - btn_run = gr.Button("SEPARATE BACKGROUND MUSIC", variant="primary") - with gr.Column(): - with gr.Row(): - ad_instrumental = gr.Audio(label="Instrumental", scale=8) - btn_open_instrumental_folder = gr.Button('πŸ“‚', scale=1) - with gr.Row(): - ad_vocals = gr.Audio(label="Vocals", scale=8) - btn_open_vocals_folder = gr.Button('πŸ“‚', scale=1) - - btn_run.click(fn=self.whisper_inf.music_separator.separate_files, - inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size, - cb_uvr_save_file], - outputs=[ad_instrumental, ad_vocals]) - btn_open_instrumental_folder.click(inputs=None, - outputs=None, - fn=lambda: self.open_folder(os.path.join( - self.args.output_dir, "UVR", "instrumental" - ))) - btn_open_vocals_folder.click(inputs=None, - outputs=None, - fn=lambda: self.open_folder(os.path.join( - self.args.output_dir, "UVR", "vocals" - ))) + with gr.Row(): + ad_instrumental = gr.Audio(label=_("Instrumental"), scale=8) + btn_open_instrumental_folder = gr.Button('πŸ“‚', scale=1) + with gr.Row(): + ad_vocals = gr.Audio(label=_("Vocals"), scale=8) + btn_open_vocals_folder = gr.Button('πŸ“‚', scale=1) + + 
btn_run.click(fn=self.whisper_inf.music_separator.separate_files, + inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size, + cb_uvr_save_file], + outputs=[ad_instrumental, ad_vocals]) + btn_open_instrumental_folder.click(inputs=None, + outputs=None, + fn=lambda: self.open_folder(os.path.join( + self.args.output_dir, "UVR", "instrumental" + ))) + btn_open_vocals_folder.click(inputs=None, + outputs=None, + fn=lambda: self.open_folder(os.path.join( + self.args.output_dir, "UVR", "vocals" + ))) # Launch the app with optional gradio settings args = self.args @@ -418,10 +446,10 @@ def on_change_models(model_size: str): return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True) -# Create the parser for command-line arguments parser = argparse.ArgumentParser() parser.add_argument('--whisper_type', type=str, default="faster-whisper", - help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]') + choices=["whisper", "faster-whisper", "insanely-fast-whisper"], + help='A type of the whisper implementation (Github repo name)') parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value') parser.add_argument('--server_name', type=str, default=None, help='Gradio server host') parser.add_argument('--server_port', type=int, default=None, help='Gradio server port') @@ -430,8 +458,10 @@ def on_change_models(model_size: str): parser.add_argument('--password', type=str, default=None, help='Gradio authentication password') parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme') parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not') -parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio') -parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not') +parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, + help='Enable api or not in Gradio') +parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, + help='Whether to automatically start Gradio app or not') parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR, help='Directory path of the whisper model') parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR, diff --git a/configs/translation.yaml b/configs/translation.yaml new file mode 100644 index 00000000..8cbedbf7 --- /dev/null +++ b/configs/translation.yaml @@ -0,0 +1,321 @@ +en: # English + Language: Language + File: File + Youtube: Youtube + Mic: Mic + T2T Translation: T2T Translation + BGM Separation: BGM Separation + GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE + Output: Output + Downloadable output file: Downloadable output file + Upload File here: Upload File here + Model: Model + Automatic Detection: Automatic Detection + File Format: File Format + Translate to English?: Translate to English? 
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename + Advanced Parameters: Advanced Parameters + Background Music Remover Filter: Background Music Remover Filter + Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing + Enable Background Music Remover Filter: Enable Background Music Remover Filter + Save separated files to output: Save separated files to output + Offload sub model after removing background music: Offload sub model after removing background music + Voice Detection Filter: Voice Detection Filter + Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel. + Enable Silero VAD Filter: Enable Silero VAD Filter + Diarization: Diarization + Enable Diarization: Enable Diarization + HuggingFace Token: HuggingFace Token + This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement. + Device: Device + Youtube Link: Youtube Link + Youtube Thumbnail: Youtube Thumbnail + Youtube Title: Youtube Title + Youtube Description: Youtube Description + Record with Mic: Record with Mic + Upload Subtitle Files to translate here: Upload Subtitle Files to translate here + Your Auth Key (API KEY): Your Auth Key (API KEY) + Source Language: Source Language + Target Language: Target Language + Pro User?: Pro User? + TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE + Upload Audio Files to separate background music: Upload Audio Files to separate background music + Instrumental: Instrumental + Vocals: Vocals + SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC + +ko: # Korean + Language: μ–Έμ–΄ + File: 파일 + Youtube: 유튜브 + Mic: 마이크 + T2T Translation: T2T μžλ§‰ λ²ˆμ—­ + BGM Separation: λ°°κ²½ μŒμ•… 뢄리 + GENERATE SUBTITLE FILE: μžλ§‰ 파일 생성 + Output: κ²°κ³Όλ¬Ό + Downloadable output file: κ²°κ³Όλ¬Ό 파일 λ‹€μš΄λ‘œλ“œ + Upload File here: νŒŒμΌμ„ μ—…λ‘œλ“œ ν•˜μ„Έμš” + Model: λͺ¨λΈ + Automatic Detection: μžλ™ 감지 + File Format: 파일 ν˜•μ‹ + Translate to English?: μ˜μ–΄λ‘œ λ²ˆμ—­ν•©λ‹ˆκΉŒ? (μœ„μŠ€νΌ λͺ¨λΈ 자체 λ²ˆμ—­ κΈ°λŠ₯) + Add a timestamp to the end of the filename: 파일 이름 끝에 νƒ€μž„μŠ€νƒœν”„ 뢙이기 + Advanced Parameters: κ³ κΈ‰ λ³€μˆ˜ + Background Music Remover Filter: λ°°κ²½ μŒμ•… 제거 ν•„ν„° + Enabling this will remove background music: λ°›μ•„μ“°κΈ° 이전에 λ¨Όμ € λ°°κ²½ μŒμ•… 제거용 μ„œλΈŒ λͺ¨λΈμ„ ν™œμ„±ν™” ν•©λ‹ˆλ‹€. + Enable Background Music Remover Filter: λ°°κ²½ μŒμ•… 제거 ν•„ν„° ν™œμ„±ν™” + Save separated files to output: λΆ„λ¦¬λœ λ°°κ²½ μŒμ•… & μŒμ„± 파일 λ”°λ‘œ 좜λ ₯ 폴더에 μ €μž₯ + Offload sub model after removing background music: λ°°κ²½ μŒμ•… 제거 ν›„ μ„œλΈŒ λͺ¨λΈμ„ λΉ„ν™œμ„±ν™” ν•©λ‹ˆλ‹€. (VRAM 이 λΆ€μ‘±ν•  μ‹œ μ²΄ν¬ν•˜μ„Έμš”.) + Voice Detection Filter: λͺ©μ†Œλ¦¬ 감지 ν•„ν„° + Enable this to transcribe only detected voice: μ„œλΈŒ λͺ¨λΈμ— μ˜ν•΄ λͺ©μ†Œλ¦¬λΌκ³  νŒλ‹¨λœ λΆ€λΆ„λ§Œ λ°›μ•„μ“°κΈ°λ₯Ό μ§„ν–‰ν•©λ‹ˆλ‹€. + Enable Silero VAD Filter: Silero VAD ν•„ν„° ν™œμ„±ν™” + Diarization: ν™”μž ꡬ뢄 + Enable Diarization: ν™”μž ꡬ뢄 ν™œμ„±ν™” + HuggingFace Token: ν—ˆκΉ…νŽ˜μ΄μŠ€ 토큰 + This is only needed the first time you download the model: λͺ¨λΈμ„ 처음 λ‹€μš΄λ°›μ„ λ•Œλ§Œ 토큰이 ν•„μš”ν•©λ‹ˆλ‹€. 
이미 λ‹€μš΄λ‘œλ“œ λ°›μœΌμ‹  μƒνƒœλΌλ©΄ μž…λ ₯ν•˜μ§€ μ•Šμ•„λ„ λ©λ‹ˆλ‹€. λͺ¨λΈμ„ λ‹€μš΄ λ°›κΈ° μœ„ν•΄μ„  "https://huggingface.co/pyannote/speaker-diarization-3.1" 와 "https://huggingface.co/pyannote/segmentation-3.0" μ—μ„œ λ¨Όμ € μ‚¬μš© 지침에 λ™μ˜ν•˜μ…”μ•Ό ν•©λ‹ˆλ‹€. + Device: λ””λ°”μ΄μŠ€ + Youtube Link: 유튜브 링크 + Youtube Thumbnail: 유튜브 썸넀일 + Youtube Title: 유튜브 제λͺ© + Youtube Description: 유튜브 μ„€λͺ… + Record with Mic: 마이크둜 λ…ΉμŒν•˜μ„Έμš” + Upload Subtitle Files to translate here: λ²ˆμ—­ν•  μžλ§‰ νŒŒμΌμ„ μ—…λ‘œλ“œ ν•˜μ„Έμš” + Your Auth Key (API KEY): DeepL API ν‚€ + Source Language: 원본 μ–Έμ–΄ + Target Language: λŒ€μƒ μ–Έμ–΄ + Pro User?: Pro 버전 μ‚¬μš©μž + TRANSLATE SUBTITLE FILE: μžλ§‰ 파일 λ²ˆμ—­ + Upload Audio Files to separate background music: λ°°κ²½ μŒμ•…μ„ 뢄리할 μ˜€λ””μ˜€ νŒŒμΌμ„ μ—…λ‘œλ“œ ν•˜μ„Έμš” + Instrumental: μ•…κΈ° + Vocals: 보컬 + SEPARATE BACKGROUND MUSIC: λ°°κ²½ μŒμ•… 뢄리 + +ja: # Japanese + Language: 言θͺž + File: File + Youtube: Youtube + Mic: Mic + T2T Translation: T2T Translation + BGM Separation: BGM Separation + GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE + Output: Output + Downloadable output file: Downloadable output file + Upload File here: Upload File here + Model: Model + Automatic Detection: Automatic Detection + File Format: File Format + Translate to English?: Translate to English? + Add a timestamp to the end of the filename: Add a timestamp to the end of the filename + Advanced Parameters: Advanced Parameters + Background Music Remover Filter: Background Music Remover Filter + Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing + Enable Background Music Remover Filter: Enable Background Music Remover Filter + Save separated files to output: Save separated files to output + Offload sub model after removing background music: Offload sub model after removing background music + Voice Detection Filter: Voice Detection Filter + Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel. + Enable Silero VAD Filter: Enable Silero VAD Filter + Diarization: Diarization + Enable Diarization: Enable Diarization + HuggingFace Token: HuggingFace Token + This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement. + Device: Device + Youtube Link: Youtube Link + Youtube Thumbnail: Youtube Thumbnail + Youtube Title: Youtube Title + Youtube Description: Youtube Description + Record with Mic: Record with Mic + Upload Subtitle Files to translate here: Upload Subtitle Files to translate here + Your Auth Key (API KEY): Your Auth Key (API KEY) + Source Language: Source Language + Target Language: Target Language + Pro User?: Pro User? 
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE + Upload Audio Files to separate background music: Upload Audio Files to separate background music + Instrumental: Instrumental + Vocals: Vocals + SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC + +es: # Spanish + Language: Idioma + File: File + Youtube: Youtube + Mic: Mic + T2T Translation: T2T Translation + BGM Separation: BGM Separation + GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE + Output: Output + Downloadable output file: Downloadable output file + Upload File here: Upload File here + Model: Model + Automatic Detection: Automatic Detection + File Format: File Format + Translate to English?: Translate to English? + Add a timestamp to the end of the filename: Add a timestamp to the end of the filename + Advanced Parameters: Advanced Parameters + Background Music Remover Filter: Background Music Remover Filter + Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing + Enable Background Music Remover Filter: Enable Background Music Remover Filter + Save separated files to output: Save separated files to output + Offload sub model after removing background music: Offload sub model after removing background music + Voice Detection Filter: Voice Detection Filter + Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel. + Enable Silero VAD Filter: Enable Silero VAD Filter + Diarization: Diarization + Enable Diarization: Enable Diarization + HuggingFace Token: HuggingFace Token + This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement. + Device: Device + Youtube Link: Youtube Link + Youtube Thumbnail: Youtube Thumbnail + Youtube Title: Youtube Title + Youtube Description: Youtube Description + Record with Mic: Record with Mic + Upload Subtitle Files to translate here: Upload Subtitle Files to translate here + Your Auth Key (API KEY): Your Auth Key (API KEY) + Source Language: Source Language + Target Language: Target Language + Pro User?: Pro User? + TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE + Upload Audio Files to separate background music: Upload Audio Files to separate background music + Instrumental: Instrumental + Vocals: Vocals + SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC + +fr: # French + Language: Langue + File: File + Youtube: Youtube + Mic: Mic + T2T Translation: T2T Translation + BGM Separation: BGM Separation + GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE + Output: Output + Downloadable output file: Downloadable output file + Upload File here: Upload File here + Model: Model + Automatic Detection: Automatic Detection + File Format: File Format + Translate to English?: Translate to English? 
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename + Advanced Parameters: Advanced Parameters + Background Music Remover Filter: Background Music Remover Filter + Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing + Enable Background Music Remover Filter: Enable Background Music Remover Filter + Save separated files to output: Save separated files to output + Offload sub model after removing background music: Offload sub model after removing background music + Voice Detection Filter: Voice Detection Filter + Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel. + Enable Silero VAD Filter: Enable Silero VAD Filter + Diarization: Diarization + Enable Diarization: Enable Diarization + HuggingFace Token: HuggingFace Token + This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement. + Device: Device + Youtube Link: Youtube Link + Youtube Thumbnail: Youtube Thumbnail + Youtube Title: Youtube Title + Youtube Description: Youtube Description + Record with Mic: Record with Mic + Upload Subtitle Files to translate here: Upload Subtitle Files to translate here + Your Auth Key (API KEY): Your Auth Key (API KEY) + Source Language: Source Language + Target Language: Target Language + Pro User?: Pro User? + TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE + Upload Audio Files to separate background music: Upload Audio Files to separate background music + Instrumental: Instrumental + Vocals: Vocals + SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC + +de: # German + Language: Sprache + File: File + Youtube: Youtube + Mic: Mic + T2T Translation: T2T Translation + BGM Separation: BGM Separation + GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE + Output: Output + Downloadable output file: Downloadable output file + Upload File here: Upload File here + Model: Model + Automatic Detection: Automatic Detection + File Format: File Format + Translate to English?: Translate to English? + Add a timestamp to the end of the filename: Add a timestamp to the end of the filename + Advanced Parameters: Advanced Parameters + Background Music Remover Filter: Background Music Remover Filter + Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing + Enable Background Music Remover Filter: Enable Background Music Remover Filter + Save separated files to output: Save separated files to output + Offload sub model after removing background music: Offload sub model after removing background music + Voice Detection Filter: Voice Detection Filter + Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel. + Enable Silero VAD Filter: Enable Silero VAD Filter + Diarization: Diarization + Enable Diarization: Enable Diarization + HuggingFace Token: HuggingFace Token + This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. 
To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement. + Device: Device + Youtube Link: Youtube Link + Youtube Thumbnail: Youtube Thumbnail + Youtube Title: Youtube Title + Youtube Description: Youtube Description + Record with Mic: Record with Mic + Upload Subtitle Files to translate here: Upload Subtitle Files to translate here + Your Auth Key (API KEY): Your Auth Key (API KEY) + Source Language: Source Language + Target Language: Target Language + Pro User?: Pro User? + TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE + Upload Audio Files to separate background music: Upload Audio Files to separate background music + Instrumental: Instrumental + Vocals: Vocals + SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC + +zh: # Chinese + Language: 语言 + File: File + Youtube: Youtube + Mic: Mic + T2T Translation: T2T Translation + BGM Separation: BGM Separation + GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE + Output: Output + Downloadable output file: Downloadable output file + Upload File here: Upload File here + Model: Model + Automatic Detection: Automatic Detection + File Format: File Format + Translate to English?: Translate to English? + Add a timestamp to the end of the filename: Add a timestamp to the end of the filename + Advanced Parameters: Advanced Parameters + Background Music Remover Filter: Background Music Remover Filter + Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing + Enable Background Music Remover Filter: Enable Background Music Remover Filter + Save separated files to output: Save separated files to output + Offload sub model after removing background music: Offload sub model after removing background music + Voice Detection Filter: Voice Detection Filter + Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel. + Enable Silero VAD Filter: Enable Silero VAD Filter + Diarization: Diarization + Enable Diarization: Enable Diarization + HuggingFace Token: HuggingFace Token + This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement. + Device: Device + Youtube Link: Youtube Link + Youtube Thumbnail: Youtube Thumbnail + Youtube Title: Youtube Title + Youtube Description: Youtube Description + Record with Mic: Record with Mic + Upload Subtitle Files to translate here: Upload Subtitle Files to translate here + Your Auth Key (API KEY): Your Auth Key (API KEY) + Source Language: Source Language + Target Language: Target Language + Pro User?: Pro User? 
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE + Upload Audio Files to separate background music: Upload Audio Files to separate background music + Instrumental: Instrumental + Vocals: Vocals + SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC \ No newline at end of file diff --git a/modules/translation/deepl_api.py b/modules/translation/deepl_api.py index 385b3a14..35f245bf 100644 --- a/modules/translation/deepl_api.py +++ b/modules/translation/deepl_api.py @@ -5,6 +5,7 @@ import gradio as gr from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH +from modules.utils.constants import AUTOMATIC_DETECTION from modules.utils.subtitle_manager import * from modules.utils.files_manager import load_yaml, save_yaml @@ -50,7 +51,7 @@ } DEEPL_AVAILABLE_SOURCE_LANGS = { - 'Automatic Detection': None, + AUTOMATIC_DETECTION: None, 'Bulgarian': 'BG', 'Czech': 'CS', 'Danish': 'DA', diff --git a/modules/utils/constants.py b/modules/utils/constants.py new file mode 100644 index 00000000..e9309bc3 --- /dev/null +++ b/modules/utils/constants.py @@ -0,0 +1,3 @@ +from gradio_i18n import Translate, gettext as _ + +AUTOMATIC_DETECTION = _("Automatic Detection") diff --git a/modules/utils/paths.py b/modules/utils/paths.py index 630ab40b..45d48b83 100644 --- a/modules/utils/paths.py +++ b/modules/utils/paths.py @@ -10,6 +10,7 @@ UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models") CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs") DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml") +I18N_YAML_PATH = os.path.join(CONFIGS_DIR, "translation.yaml") OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs") TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations") UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR") diff --git a/modules/whisper/whisper_base.py b/modules/whisper/whisper_base.py index 7fd27f3b..51c87ddf 100644 --- a/modules/whisper/whisper_base.py +++ b/modules/whisper/whisper_base.py @@ -14,6 +14,7 @@ from modules.uvr.music_separator import MusicSeparator from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH, UVR_MODELS_DIR) +from modules.utils.constants import AUTOMATIC_DETECTION from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename from modules.utils.youtube_manager import get_ytdata, get_ytaudio from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml @@ -107,7 +108,7 @@ def run(self, if params.lang is None: pass - elif params.lang == "Automatic Detection": + elif params.lang == AUTOMATIC_DETECTION: params.lang = None else: language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()} diff --git a/modules/whisper/whisper_parameter.py b/modules/whisper/whisper_parameter.py index 0aec677b..19115fc2 100644 --- a/modules/whisper/whisper_parameter.py +++ b/modules/whisper/whisper_parameter.py @@ -3,6 +3,8 @@ from typing import Optional, Dict import yaml +from modules.utils.constants import AUTOMATIC_DETECTION + @dataclass class WhisperParameters: @@ -306,7 +308,7 @@ def to_yaml(self) -> Dict: data = { "whisper": { "model_size": self.model_size, - "lang": "Automatic Detection" if self.lang is None else self.lang, + "lang": AUTOMATIC_DETECTION.unwrap() if self.lang is None else self.lang, "is_translate": self.is_translate, "beam_size": self.beam_size, "log_prob_threshold": self.log_prob_threshold, diff --git a/requirements.txt b/requirements.txt index 
afe8b6c7..283a39a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ git+https://github.com/jhj0517/jhj0517-whisper.git faster-whisper==1.0.3 transformers gradio +git+https://github.com/jhj0517/gradio-i18n.git@fix/encoding-error pytubefix ruamel.yaml==0.18.6 pyannote.audio==3.3.1
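
Note on the new i18n plumbing introduced above: UI labels are now wrapped in gettext-style `_()` calls and rendered inside a `Translate` context bound to `configs/translation.yaml`, and the "Automatic Detection" sentinel is routed through the `AUTOMATIC_DETECTION` lazy string so that saved settings keep the raw English key. Below is a minimal sketch of that pattern, assuming the `gradio_i18n` API exactly as it is used in this patch (`Translate` as a context manager, `gettext as _`, and `.unwrap()` returning the untranslated key); the `build_demo` function and its single component are illustrative only, not part of the change set.

```python
import gradio as gr
from gradio_i18n import Translate, gettext as _

from modules.utils.constants import AUTOMATIC_DETECTION
from modules.utils.paths import I18N_YAML_PATH


def build_demo():
    with gr.Blocks() as demo:
        # Components created inside the Translate context look up their labels
        # in configs/translation.yaml, keyed by the visitor's language.
        with Translate(I18N_YAML_PATH):
            gr.Dropdown(label=_("Language"),
                        choices=[AUTOMATIC_DETECTION],  # shown translated in the UI
                        value=AUTOMATIC_DETECTION)
    return demo


# Persisted parameters store the untranslated key, so round-trips against
# default_parameters.yaml compare with .unwrap() (as in whisper_parameter.py),
# and whisper_base.py maps the sentinel back to None so the language is
# auto-detected at transcription time.
saved_lang = AUTOMATIC_DETECTION.unwrap()  # "Automatic Detection"
lang = None if saved_lang == AUTOMATIC_DETECTION.unwrap() else saved_lang
```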