diff --git a/api/server.py b/api/server.py index 474498b..7e8bfa9 100644 --- a/api/server.py +++ b/api/server.py @@ -159,122 +159,122 @@ def detect_language(item: Dict): # Timeout in 20 minutes -@stub.function(gpu=GPU_TYPE, timeout=1200) -@web_endpoint(method="POST") -def generate_seamlessm4t_speech(item: Dict): - """ - Processes the input speech audio, performs voice activity detection, and translates the speech from the source language to the target language. +# @stub.function(gpu=GPU_TYPE, timeout=1200) +# @web_endpoint(method="POST") +# def generate_seamlessm4t_speech(item: Dict): +# """ +# Processes the input speech audio, performs voice activity detection, and translates the speech from the source language to the target language. - Parameters: - - item (Dict): A dictionary containing the base64 encoded audio data, source language, and target language. +# Parameters: +# - item (Dict): A dictionary containing the base64 encoded audio data, source language, and target language. - Returns: - - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. - """ - # import wave - import os - import time +# Returns: +# - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. +# """ +# # import wave +# import os +# import time - import torch - import torchaudio - from pydub import AudioSegment - from seamless_communication.inference import Translator +# import torch +# import torchaudio +# from pydub import AudioSegment +# from seamless_communication.inference import Translator - try: - # print(f"Payload: {item}") - USE_ONNX = False - model, utils = torch.hub.load( - repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX - ) +# try: +# # print(f"Payload: {item}") +# USE_ONNX = False +# model, utils = torch.hub.load( +# repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX +# ) - ( - get_speech_timestamps, - save_audio, - read_audio, - VADIterator, - collect_chunks, - ) = utils +# ( +# get_speech_timestamps, +# save_audio, +# read_audio, +# VADIterator, +# collect_chunks, +# ) = utils - # Decode the base64 audio and convert it for processing - b64 = item["wav_base64"] - # source_lang = item["source"] - # print(f"Target_lang: {item.get('target')}") - target_lang = item["target"] +# # Decode the base64 audio and convert it for processing +# b64 = item["wav_base64"] +# # source_lang = item["source"] +# # print(f"Target_lang: {item.get('target')}") +# target_lang = item["target"] - fname = base64_to_audio_file(b64_contents=b64) - convert_to_mono_16k(fname, "output.wav") +# fname = base64_to_audio_file(b64_contents=b64) +# convert_to_mono_16k(fname, "output.wav") - # Perform voice activity detection on the processed audio - wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) +# # Perform voice activity detection on the processed audio +# wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) - # get speech timestamps from full audio file - speech_timestamps_seconds = get_speech_timestamps( - wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True - ) - print(speech_timestamps_seconds) - # translator = download_models() - start = time.perf_counter() - model_name = "seamlessM4T_v2_large" - vocoder_name = ( - "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs" - ) +# # get speech timestamps from full audio file +# speech_timestamps_seconds = get_speech_timestamps( +# wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True +# ) +# 
print(speech_timestamps_seconds) +# # translator = download_models() +# start = time.perf_counter() +# model_name = "seamlessM4T_v2_large" +# vocoder_name = ( +# "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs" +# ) - translator = Translator( - model_name, - vocoder_name, - device=torch.device("cuda:0"), - dtype=torch.float16, - ) +# translator = Translator( +# model_name, +# vocoder_name, +# device=torch.device("cuda:0"), +# dtype=torch.float16, +# ) - duration = time.perf_counter() - start - print(f"Duration to load model is: {duration}") +# duration = time.perf_counter() - start +# print(f"Duration to load model is: {duration}") - # Replace t1, t2 with VAD time - timestamps_start = [] - timestamps_end = [] - text = [] +# # Replace t1, t2 with VAD time +# timestamps_start = [] +# timestamps_end = [] +# text = [] - async def generate(): - # Logic for VAD based filtering - for item in speech_timestamps_seconds: - s = item["start"] - e = item["end"] +# async def generate(): +# # Logic for VAD based filtering +# for item in speech_timestamps_seconds: +# s = item["start"] +# e = item["end"] - timestamps_start.append(s) - timestamps_end.append(e) - newAudio = AudioSegment.from_wav("output.wav") +# timestamps_start.append(s) +# timestamps_end.append(e) +# newAudio = AudioSegment.from_wav("output.wav") - # time in seconds should be multiplied by 1000.0 for AudioSegment array. So 20s = 20000 - newAudio = newAudio[s * 1000 : e * 1000] - new_audio_name = "new_" + str(s) + ".wav" - newAudio.export(new_audio_name, format="wav") - waveform, sample_rate = torchaudio.load(new_audio_name) - resampler = torchaudio.transforms.Resample( - sample_rate, SAMPLING_RATE, dtype=waveform.dtype - ) - resampled_waveform = resampler(waveform) - torchaudio.save("resampled.wav", resampled_waveform, SAMPLING_RATE) - translated_text, _ = translator.predict( - "resampled.wav", "s2tt", target_lang - ) - # print(translated_text) - text.append(str(translated_text[0])) - os.remove(new_audio_name) - os.remove("resampled.wav") - obj = { - "start": s, - "end": e, - "text": str(translated_text[0]), - } - print(obj) - yield json.dumps(obj) +# # time in seconds should be multiplied by 1000.0 for AudioSegment array. 
So 20s = 20000 +# newAudio = newAudio[s * 1000 : e * 1000] +# new_audio_name = "new_" + str(s) + ".wav" +# newAudio.export(new_audio_name, format="wav") +# waveform, sample_rate = torchaudio.load(new_audio_name) +# resampler = torchaudio.transforms.Resample( +# sample_rate, SAMPLING_RATE, dtype=waveform.dtype +# ) +# resampled_waveform = resampler(waveform) +# torchaudio.save("resampled.wav", resampled_waveform, SAMPLING_RATE) +# translated_text, _ = translator.predict( +# "resampled.wav", "s2tt", target_lang +# ) +# # print(translated_text) +# text.append(str(translated_text[0])) +# os.remove(new_audio_name) +# os.remove("resampled.wav") +# obj = { +# "start": s, +# "end": e, +# "text": str(translated_text[0]), +# } +# print(obj) +# yield json.dumps(obj) - return StreamingResponse(generate(), media_type="text/event-stream") +# return StreamingResponse(generate(), media_type="text/event-stream") - except Exception as e: - print(e) - logging.critical(e, exc_info=True) - return {"message": "Internal server error", "code": 500} +# except Exception as e: +# print(e) +# logging.critical(e, exc_info=True) +# return {"message": "Internal server error", "code": 500} def sliding_window_approch_timestamps(speech_timestamps_seconds): @@ -612,28 +612,26 @@ async def generate(): # Timeout in 20 minutes @stub.function(gpu=GPU_TYPE, timeout=1200) @web_endpoint(method="POST") -def youtube_generate_seamlessm4t_speech(item: Dict): +def seamless_generate_speech(item: Dict): + import os + import time + import torch + import torchaudio + from pytube import YouTube + from pydub import AudioSegment + from seamless_communication.inference import Translator + """ Processes the input speech audio, performs voice activity detection, and translates the speech from the source language to the target language. Parameters: - - item (Dict): A dictionary containing the base64 encoded audio data, source language, and target language. + - item (Dict): A dictionary containing the base64 encoded audio data, source language, target language, and type (youtube or file). Returns: - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. 
""" - # import wave - import os - import time - - import torch - import torchaudio - from pytube import YouTube - from pydub import AudioSegment - from seamless_communication.inference import Translator try: - # print(f"Payload: {item}") USE_ONNX = False model, utils = torch.hub.load( repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX @@ -647,40 +645,36 @@ def youtube_generate_seamlessm4t_speech(item: Dict): collect_chunks, ) = utils - # Decode the base64 audio and convert it for processing - yt_id = item["yt_id"] - # source_lang = item["source"] - # print(f"Target_lang: {item.get('target')}") - target_lang = item["target"] + type = item.get("type") + if type == "youtube": + yt_id = item["yt_id"] + target_lang = item["target"] - # Download YouTube video - youtube_url = f"https://www.youtube.com/watch?v={yt_id}" - youtube = YouTube(youtube_url) - video = youtube.streams.filter(only_audio=True).first() - video.download(filename="temp_video.mp4") + youtube_url = f"https://www.youtube.com/watch?v={yt_id}" + youtube = YouTube(youtube_url) + video = youtube.streams.filter(only_audio=True).first() + video.download(filename="temp_video.mp4") - # Convert video to wav - audio = AudioSegment.from_file("temp_video.mp4", format="mp4") - audio.export("temp_audio.wav", format="wav") + audio = AudioSegment.from_file("temp_video.mp4", format="mp4") + audio.export("temp_audio.wav", format="wav") - # Convert audio to mono channel with 16K frequency - audio = AudioSegment.from_wav("temp_audio.wav") - audio = audio.set_channels(1).set_frame_rate(16000) - audio.export("output.wav", format="wav") + audio = AudioSegment.from_wav("temp_audio.wav") + audio = audio.set_channels(1).set_frame_rate(16000) + audio.export("output.wav", format="wav") - # fname = base64_to_audio_file(b64_contents=b64) - # convert_to_mono_16k(fname, "output.wav") + elif type == "file": + b64 = item["wav_base64"] + target_lang = item["target"] + + fname = base64_to_audio_file(b64_contents=b64) + convert_to_mono_16k(fname, "output.wav") - # Perform voice activity detection on the processed audio wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) - # get speech timestamps from full audio file speech_timestamps_seconds = get_speech_timestamps( wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True ) - print(speech_timestamps_seconds) - # translator = download_models() - start = time.perf_counter() + model_name = "seamlessM4T_v2_large" vocoder_name = ( "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs" @@ -693,10 +687,6 @@ def youtube_generate_seamlessm4t_speech(item: Dict): dtype=torch.float16, ) - duration = time.perf_counter() - start - print(f"Duration to load model is: {duration}") - - # Replace t1, t2 with VAD time timestamps_start = [] timestamps_end = [] text = [] @@ -710,7 +700,6 @@ async def generate(): timestamps_end.append(e) newAudio = AudioSegment.from_wav("output.wav") - # time in seconds should be multiplied by 1000.0 for AudioSegment array. 
So 20s = 20000 newAudio = newAudio[s * 1000 : e * 1000] new_audio_name = "new_" + str(s) + ".wav" newAudio.export(new_audio_name, format="wav") @@ -723,7 +712,6 @@ async def generate(): translated_text, _ = translator.predict( "resampled.wav", "s2tt", target_lang ) - # print(translated_text) text.append(str(translated_text[0])) os.remove(new_audio_name) os.remove("resampled.wav") @@ -964,117 +952,124 @@ async def generate(): return {"message": "Internal server error", "code": 500} -@stub.function(gpu=GPU_TYPE, timeout=600) -@web_endpoint(method="POST") -def youtube_generate_whisperx_speech(item: Dict): - """ - Processes the input speech audio and translates the speech to the target language using faster-whisper. +# @stub.function(gpu=GPU_TYPE, timeout=600) +# @web_endpoint(method="POST") +# def youtube_generate_whisperx_speech(item: Dict): +# """ +# Processes the input speech audio and translates the speech to the target language using faster-whisper. - Parameters: - - item (Dict): A dictionary containing the base64 encoded audio data and target language. +# Parameters: +# - item (Dict): A dictionary containing the base64 encoded audio data and target language. - Returns: - - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. - """ - import torch - import torchaudio - import whisperx - from pytube import YouTube - from pydub import AudioSegment +# Returns: +# - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. +# """ +# import torch +# import torchaudio +# import whisperx +# from pytube import YouTube +# from pydub import AudioSegment +# import os +# import time +# import torch +# import torchaudio +# from pytube import YouTube +# from pydub import AudioSegment +# from seamless_communication.inference import Translator - try: - yt_id = item["yt_id"] - target_lang = item["target"] +# try: +# yt_id = item["yt_id"] +# target_lang = item["target"] - # Download YouTube video - youtube_url = f"https://www.youtube.com/watch?v={yt_id}" - youtube = YouTube(youtube_url) - video = youtube.streams.filter(only_audio=True).first() - video.download(filename="temp_video.mp4") +# # Download YouTube video +# youtube_url = f"https://www.youtube.com/watch?v={yt_id}" +# youtube = YouTube(youtube_url) +# video = youtube.streams.filter(only_audio=True).first() +# video.download(filename="temp_video.mp4") - # Convert video to wav - audio = AudioSegment.from_file("temp_video.mp4", format="mp4") - audio.export("temp_audio.wav", format="wav") +# # Convert video to wav +# audio = AudioSegment.from_file("temp_video.mp4", format="mp4") +# audio.export("temp_audio.wav", format="wav") - # Convert audio to mono channel with 16K frequency - audio = AudioSegment.from_wav("temp_audio.wav") - audio = audio.set_channels(1).set_frame_rate(16000) - audio.export("output.wav", format="wav") +# # Convert audio to mono channel with 16K frequency +# audio = AudioSegment.from_wav("temp_audio.wav") +# audio = audio.set_channels(1).set_frame_rate(16000) +# audio.export("output.wav", format="wav") - USE_ONNX = False - model, utils = torch.hub.load( - repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX - ) +# USE_ONNX = False +# model, utils = torch.hub.load( +# repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX +# ) - ( - get_speech_timestamps, - save_audio, - read_audio, - VADIterator, - collect_chunks, - ) = utils +# ( +# get_speech_timestamps, +# save_audio, +# read_audio, +# VADIterator, +# 
collect_chunks, +# ) = utils - # Perform voice activity detection on the processed audio - wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) +# # Perform voice activity detection on the processed audio +# wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) - # get speech timestamps from full audio file - speech_timestamps_seconds = get_speech_timestamps( - wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True - ) - print(speech_timestamps_seconds) +# # get speech timestamps from full audio file +# speech_timestamps_seconds = get_speech_timestamps( +# wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True +# ) +# print(speech_timestamps_seconds) - grouped_timestamps = sliding_window_approch_timestamps( - speech_timestamps_seconds - ) - print(grouped_timestamps) +# grouped_timestamps = sliding_window_approch_timestamps( +# speech_timestamps_seconds +# ) +# print(grouped_timestamps) - model = whisperx.load_model( - MODEL_SIZE, "cuda", compute_type="float16", language=target_lang - ) +# model = whisperx.load_model( +# MODEL_SIZE, "cuda", compute_type="float16", language=target_lang +# ) - async def generate(): - for segment in grouped_timestamps: - s = segment["start"] - e = segment["end"] +# async def generate(): +# for segment in grouped_timestamps: +# s = segment["start"] +# e = segment["end"] - newAudio = AudioSegment.from_wav("output.wav") - newAudio = newAudio[s * 1000 : e * 1000] - new_audio_name = "new_" + str(s) + ".wav" - newAudio.export(new_audio_name, format="wav") - waveform, sample_rate = torchaudio.load(new_audio_name) - resampler = torchaudio.transforms.Resample( - sample_rate, SAMPLING_RATE, dtype=waveform.dtype - ) - resampled_waveform = resampler(waveform) - torchaudio.save("resampled.wav", resampled_waveform, SAMPLING_RATE) +# newAudio = AudioSegment.from_wav("output.wav") +# newAudio = newAudio[s * 1000 : e * 1000] +# new_audio_name = "new_" + str(s) + ".wav" +# newAudio.export(new_audio_name, format="wav") +# waveform, sample_rate = torchaudio.load(new_audio_name) +# resampler = torchaudio.transforms.Resample( +# sample_rate, SAMPLING_RATE, dtype=waveform.dtype +# ) +# resampled_waveform = resampler(waveform) +# torchaudio.save("resampled.wav", resampled_waveform, SAMPLING_RATE) - audio = whisperx.load_audio("resampled.wav") - result = model.transcribe(audio, batch_size=16) - model_a, metadata = whisperx.load_align_model( - language_code=target_lang, device="cuda" - ) +# audio = whisperx.load_audio("resampled.wav") +# result = model.transcribe(audio, batch_size=16) +# model_a, metadata = whisperx.load_align_model( +# language_code=target_lang, device="cuda" +# ) - result = whisperx.align( - result["segments"], - model_a, - metadata, - audio, - "cuda", - return_char_alignments=False, - ) +# result = whisperx.align( +# result["segments"], +# model_a, +# metadata, +# audio, +# "cuda", +# return_char_alignments=False, +# ) - for segment in result["segments"]: - obj = { - "start": segment["start"] + s, - "end": segment["end"] + s, - "text": segment["text"], - } - print(obj) - yield json.dumps(obj) +# for segment in result["segments"]: +# obj = { +# "start": segment["start"] + s, +# "end": segment["end"] + s, +# "text": segment["text"], +# } +# print(obj) +# yield json.dumps(obj) - return StreamingResponse(generate(), media_type="text/event-stream") +# return StreamingResponse(generate(), media_type="text/event-stream") - except Exception as e: - print(e) - logging.critical(e, exc_info=True) - return {"message": "Internal server error", "code": 
500}
diff --git a/ui/src/audioUtils.js b/ui/src/audioUtils.js
index 5c6d7b3..da851dc 100644
--- a/ui/src/audioUtils.js
+++ b/ui/src/audioUtils.js
@@ -15,7 +15,7 @@ const blobToBase64 = (blob) => {
   });
 };
 
-export const getAudioDetails = (file) => {
+export const getAudioDetails = async (file, requestData) => {
   // Read the audio file
   const reader = new FileReader();
   reader.readAsArrayBuffer(file);
@@ -44,9 +44,20 @@ export const getAudioDetails = (file) => {
 
     // Send data to API
    try {
-      const response = await axios.post("YOUR_API_ENDPOINT", {
-        audio: base64Data,
-      });
-      console.log("API Response:", response.data);
+      const response = await fetch(
+        "https://aldrinjenson--seamless-m4t-speech-detect-language.modal.run",
+        {
+          method: "POST",
+          // body: JSON.stringify(requestData),
+          body: JSON.stringify({
+            wav_base64: base64Data,
+          }),
+          headers: {
+            "Content-Type": "application/json",
+          },
+        }
+      );
+      const data = await response.json();
+      console.log("API Response:", data);
     } catch (error) {
       console.error("Error sending data to API:", error);
diff --git a/ui/src/components/generate/UploadFile.jsx b/ui/src/components/generate/UploadFile.jsx
index b74e491..516011a 100644
--- a/ui/src/components/generate/UploadFile.jsx
+++ b/ui/src/components/generate/UploadFile.jsx
@@ -90,18 +90,22 @@ const UploadFile = ({
       selectedModel
     );
 
-    const audioDetails = getAudioDetails(uploadedFile);
-    console.log(audioDetails);
-    return null;
+    // const audioDetails = await getAudioDetails(uploadedFile, requestData);
+    // console.log(audioDetails);
+    // reset(false);
+    // return null;
 
    const toastId = toast.info("Uploading..");
 
-    fetch(url, {
-      method: "POST",
-      body: JSON.stringify(requestData),
-      headers: {
-        "Content-Type": "application/json",
-      },
-    })
+    fetch(
+      "https://aldrinjenson--seamless-m4t-speech-seamless-generate-speech.modal.run",
+      {
+        method: "POST",
+        body: JSON.stringify({ ...requestData, type: "file" }),
+        headers: {
+          "Content-Type": "application/json",
+        },
+      }
+    )
      .then(async (res) => {
        if (res?.code === 500) throw new Error("Internal Server Error");