diff --git a/api/server.py b/api/server.py index 474498b..7e8bfa9 100644 --- a/api/server.py +++ b/api/server.py @@ -159,122 +159,122 @@ def detect_language(item: Dict): # Timeout in 20 minutes -@stub.function(gpu=GPU_TYPE, timeout=1200) -@web_endpoint(method="POST") -def generate_seamlessm4t_speech(item: Dict): - """ - Processes the input speech audio, performs voice activity detection, and translates the speech from the source language to the target language. +# @stub.function(gpu=GPU_TYPE, timeout=1200) +# @web_endpoint(method="POST") +# def generate_seamlessm4t_speech(item: Dict): +# """ +# Processes the input speech audio, performs voice activity detection, and translates the speech from the source language to the target language. - Parameters: - - item (Dict): A dictionary containing the base64 encoded audio data, source language, and target language. +# Parameters: +# - item (Dict): A dictionary containing the base64 encoded audio data, source language, and target language. - Returns: - - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. - """ - # import wave - import os - import time +# Returns: +# - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. +# """ +# # import wave +# import os +# import time - import torch - import torchaudio - from pydub import AudioSegment - from seamless_communication.inference import Translator +# import torch +# import torchaudio +# from pydub import AudioSegment +# from seamless_communication.inference import Translator - try: - # print(f"Payload: {item}") - USE_ONNX = False - model, utils = torch.hub.load( - repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX - ) +# try: +# # print(f"Payload: {item}") +# USE_ONNX = False +# model, utils = torch.hub.load( +# repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX +# ) - ( - get_speech_timestamps, - save_audio, - read_audio, - VADIterator, - collect_chunks, - ) = utils +# ( +# get_speech_timestamps, +# save_audio, +# read_audio, +# VADIterator, +# collect_chunks, +# ) = utils - # Decode the base64 audio and convert it for processing - b64 = item["wav_base64"] - # source_lang = item["source"] - # print(f"Target_lang: {item.get('target')}") - target_lang = item["target"] +# # Decode the base64 audio and convert it for processing +# b64 = item["wav_base64"] +# # source_lang = item["source"] +# # print(f"Target_lang: {item.get('target')}") +# target_lang = item["target"] - fname = base64_to_audio_file(b64_contents=b64) - convert_to_mono_16k(fname, "output.wav") +# fname = base64_to_audio_file(b64_contents=b64) +# convert_to_mono_16k(fname, "output.wav") - # Perform voice activity detection on the processed audio - wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) +# # Perform voice activity detection on the processed audio +# wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) - # get speech timestamps from full audio file - speech_timestamps_seconds = get_speech_timestamps( - wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True - ) - print(speech_timestamps_seconds) - # translator = download_models() - start = time.perf_counter() - model_name = "seamlessM4T_v2_large" - vocoder_name = ( - "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs" - ) +# # get speech timestamps from full audio file +# speech_timestamps_seconds = get_speech_timestamps( +# wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True +# ) +# 
print(speech_timestamps_seconds) +# # translator = download_models() +# start = time.perf_counter() +# model_name = "seamlessM4T_v2_large" +# vocoder_name = ( +# "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs" +# ) - translator = Translator( - model_name, - vocoder_name, - device=torch.device("cuda:0"), - dtype=torch.float16, - ) +# translator = Translator( +# model_name, +# vocoder_name, +# device=torch.device("cuda:0"), +# dtype=torch.float16, +# ) - duration = time.perf_counter() - start - print(f"Duration to load model is: {duration}") +# duration = time.perf_counter() - start +# print(f"Duration to load model is: {duration}") - # Replace t1, t2 with VAD time - timestamps_start = [] - timestamps_end = [] - text = [] +# # Replace t1, t2 with VAD time +# timestamps_start = [] +# timestamps_end = [] +# text = [] - async def generate(): - # Logic for VAD based filtering - for item in speech_timestamps_seconds: - s = item["start"] - e = item["end"] +# async def generate(): +# # Logic for VAD based filtering +# for item in speech_timestamps_seconds: +# s = item["start"] +# e = item["end"] - timestamps_start.append(s) - timestamps_end.append(e) - newAudio = AudioSegment.from_wav("output.wav") +# timestamps_start.append(s) +# timestamps_end.append(e) +# newAudio = AudioSegment.from_wav("output.wav") - # time in seconds should be multiplied by 1000.0 for AudioSegment array. So 20s = 20000 - newAudio = newAudio[s * 1000 : e * 1000] - new_audio_name = "new_" + str(s) + ".wav" - newAudio.export(new_audio_name, format="wav") - waveform, sample_rate = torchaudio.load(new_audio_name) - resampler = torchaudio.transforms.Resample( - sample_rate, SAMPLING_RATE, dtype=waveform.dtype - ) - resampled_waveform = resampler(waveform) - torchaudio.save("resampled.wav", resampled_waveform, SAMPLING_RATE) - translated_text, _ = translator.predict( - "resampled.wav", "s2tt", target_lang - ) - # print(translated_text) - text.append(str(translated_text[0])) - os.remove(new_audio_name) - os.remove("resampled.wav") - obj = { - "start": s, - "end": e, - "text": str(translated_text[0]), - } - print(obj) - yield json.dumps(obj) +# # time in seconds should be multiplied by 1000.0 for AudioSegment array. 
So 20s = 20000 +# newAudio = newAudio[s * 1000 : e * 1000] +# new_audio_name = "new_" + str(s) + ".wav" +# newAudio.export(new_audio_name, format="wav") +# waveform, sample_rate = torchaudio.load(new_audio_name) +# resampler = torchaudio.transforms.Resample( +# sample_rate, SAMPLING_RATE, dtype=waveform.dtype +# ) +# resampled_waveform = resampler(waveform) +# torchaudio.save("resampled.wav", resampled_waveform, SAMPLING_RATE) +# translated_text, _ = translator.predict( +# "resampled.wav", "s2tt", target_lang +# ) +# # print(translated_text) +# text.append(str(translated_text[0])) +# os.remove(new_audio_name) +# os.remove("resampled.wav") +# obj = { +# "start": s, +# "end": e, +# "text": str(translated_text[0]), +# } +# print(obj) +# yield json.dumps(obj) - return StreamingResponse(generate(), media_type="text/event-stream") +# return StreamingResponse(generate(), media_type="text/event-stream") - except Exception as e: - print(e) - logging.critical(e, exc_info=True) - return {"message": "Internal server error", "code": 500} +# except Exception as e: +# print(e) +# logging.critical(e, exc_info=True) +# return {"message": "Internal server error", "code": 500} def sliding_window_approch_timestamps(speech_timestamps_seconds): @@ -612,28 +612,26 @@ async def generate(): # Timeout in 20 minutes @stub.function(gpu=GPU_TYPE, timeout=1200) @web_endpoint(method="POST") -def youtube_generate_seamlessm4t_speech(item: Dict): +def seamless_generate_speech(item: Dict): + import os + import time + import torch + import torchaudio + from pytube import YouTube + from pydub import AudioSegment + from seamless_communication.inference import Translator + """ Processes the input speech audio, performs voice activity detection, and translates the speech from the source language to the target language. Parameters: - - item (Dict): A dictionary containing the base64 encoded audio data, source language, and target language. + - item (Dict): A dictionary containing the base64 encoded audio data, source language, target language, and type (youtube or file). Returns: - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. 
""" - # import wave - import os - import time - - import torch - import torchaudio - from pytube import YouTube - from pydub import AudioSegment - from seamless_communication.inference import Translator try: - # print(f"Payload: {item}") USE_ONNX = False model, utils = torch.hub.load( repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX @@ -647,40 +645,36 @@ def youtube_generate_seamlessm4t_speech(item: Dict): collect_chunks, ) = utils - # Decode the base64 audio and convert it for processing - yt_id = item["yt_id"] - # source_lang = item["source"] - # print(f"Target_lang: {item.get('target')}") - target_lang = item["target"] + type = item.get("type") + if type == "youtube": + yt_id = item["yt_id"] + target_lang = item["target"] - # Download YouTube video - youtube_url = f"https://www.youtube.com/watch?v={yt_id}" - youtube = YouTube(youtube_url) - video = youtube.streams.filter(only_audio=True).first() - video.download(filename="temp_video.mp4") + youtube_url = f"https://www.youtube.com/watch?v={yt_id}" + youtube = YouTube(youtube_url) + video = youtube.streams.filter(only_audio=True).first() + video.download(filename="temp_video.mp4") - # Convert video to wav - audio = AudioSegment.from_file("temp_video.mp4", format="mp4") - audio.export("temp_audio.wav", format="wav") + audio = AudioSegment.from_file("temp_video.mp4", format="mp4") + audio.export("temp_audio.wav", format="wav") - # Convert audio to mono channel with 16K frequency - audio = AudioSegment.from_wav("temp_audio.wav") - audio = audio.set_channels(1).set_frame_rate(16000) - audio.export("output.wav", format="wav") + audio = AudioSegment.from_wav("temp_audio.wav") + audio = audio.set_channels(1).set_frame_rate(16000) + audio.export("output.wav", format="wav") - # fname = base64_to_audio_file(b64_contents=b64) - # convert_to_mono_16k(fname, "output.wav") + elif type == "file": + b64 = item["wav_base64"] + target_lang = item["target"] + + fname = base64_to_audio_file(b64_contents=b64) + convert_to_mono_16k(fname, "output.wav") - # Perform voice activity detection on the processed audio wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) - # get speech timestamps from full audio file speech_timestamps_seconds = get_speech_timestamps( wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True ) - print(speech_timestamps_seconds) - # translator = download_models() - start = time.perf_counter() + model_name = "seamlessM4T_v2_large" vocoder_name = ( "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs" @@ -693,10 +687,6 @@ def youtube_generate_seamlessm4t_speech(item: Dict): dtype=torch.float16, ) - duration = time.perf_counter() - start - print(f"Duration to load model is: {duration}") - - # Replace t1, t2 with VAD time timestamps_start = [] timestamps_end = [] text = [] @@ -710,7 +700,6 @@ async def generate(): timestamps_end.append(e) newAudio = AudioSegment.from_wav("output.wav") - # time in seconds should be multiplied by 1000.0 for AudioSegment array. 
So 20s = 20000 newAudio = newAudio[s * 1000 : e * 1000] new_audio_name = "new_" + str(s) + ".wav" newAudio.export(new_audio_name, format="wav") @@ -723,7 +712,6 @@ async def generate(): translated_text, _ = translator.predict( "resampled.wav", "s2tt", target_lang ) - # print(translated_text) text.append(str(translated_text[0])) os.remove(new_audio_name) os.remove("resampled.wav") @@ -964,117 +952,124 @@ async def generate(): return {"message": "Internal server error", "code": 500} -@stub.function(gpu=GPU_TYPE, timeout=600) -@web_endpoint(method="POST") -def youtube_generate_whisperx_speech(item: Dict): - """ - Processes the input speech audio and translates the speech to the target language using faster-whisper. +# @stub.function(gpu=GPU_TYPE, timeout=600) +# @web_endpoint(method="POST") +# def youtube_generate_whisperx_speech(item: Dict): +# """ +# Processes the input speech audio and translates the speech to the target language using faster-whisper. - Parameters: - - item (Dict): A dictionary containing the base64 encoded audio data and target language. +# Parameters: +# - item (Dict): A dictionary containing the base64 encoded audio data and target language. - Returns: - - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. - """ - import torch - import torchaudio - import whisperx - from pytube import YouTube - from pydub import AudioSegment +# Returns: +# - Dict: A dictionary containing the status code, message, detected speech chunks, and the translated text. +# """ +# import torch +# import torchaudio +# import whisperx +# from pytube import YouTube +# from pydub import AudioSegment +# import os +# import time +# import torch +# import torchaudio +# from pytube import YouTube +# from pydub import AudioSegment +# from seamless_communication.inference import Translator - try: - yt_id = item["yt_id"] - target_lang = item["target"] +# try: +# yt_id = item["yt_id"] +# target_lang = item["target"] - # Download YouTube video - youtube_url = f"https://www.youtube.com/watch?v={yt_id}" - youtube = YouTube(youtube_url) - video = youtube.streams.filter(only_audio=True).first() - video.download(filename="temp_video.mp4") +# # Download YouTube video +# youtube_url = f"https://www.youtube.com/watch?v={yt_id}" +# youtube = YouTube(youtube_url) +# video = youtube.streams.filter(only_audio=True).first() +# video.download(filename="temp_video.mp4") - # Convert video to wav - audio = AudioSegment.from_file("temp_video.mp4", format="mp4") - audio.export("temp_audio.wav", format="wav") +# # Convert video to wav +# audio = AudioSegment.from_file("temp_video.mp4", format="mp4") +# audio.export("temp_audio.wav", format="wav") - # Convert audio to mono channel with 16K frequency - audio = AudioSegment.from_wav("temp_audio.wav") - audio = audio.set_channels(1).set_frame_rate(16000) - audio.export("output.wav", format="wav") +# # Convert audio to mono channel with 16K frequency +# audio = AudioSegment.from_wav("temp_audio.wav") +# audio = audio.set_channels(1).set_frame_rate(16000) +# audio.export("output.wav", format="wav") - USE_ONNX = False - model, utils = torch.hub.load( - repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX - ) +# USE_ONNX = False +# model, utils = torch.hub.load( +# repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=USE_ONNX +# ) - ( - get_speech_timestamps, - save_audio, - read_audio, - VADIterator, - collect_chunks, - ) = utils +# ( +# get_speech_timestamps, +# save_audio, +# read_audio, +# VADIterator, +# 
collect_chunks, +# ) = utils - # Perform voice activity detection on the processed audio - wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) +# # Perform voice activity detection on the processed audio +# wav = read_audio("output.wav", sampling_rate=SAMPLING_RATE) - # get speech timestamps from full audio file - speech_timestamps_seconds = get_speech_timestamps( - wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True - ) - print(speech_timestamps_seconds) +# # get speech timestamps from full audio file +# speech_timestamps_seconds = get_speech_timestamps( +# wav, model, sampling_rate=SAMPLING_RATE, return_seconds=True +# ) +# print(speech_timestamps_seconds) - grouped_timestamps = sliding_window_approch_timestamps( - speech_timestamps_seconds - ) - print(grouped_timestamps) +# grouped_timestamps = sliding_window_approch_timestamps( +# speech_timestamps_seconds +# ) +# print(grouped_timestamps) - model = whisperx.load_model( - MODEL_SIZE, "cuda", compute_type="float16", language=target_lang - ) +# model = whisperx.load_model( +# MODEL_SIZE, "cuda", compute_type="float16", language=target_lang +# ) - async def generate(): - for segment in grouped_timestamps: - s = segment["start"] - e = segment["end"] +# async def generate(): +# for segment in grouped_timestamps: +# s = segment["start"] +# e = segment["end"] - newAudio = AudioSegment.from_wav("output.wav") - newAudio = newAudio[s * 1000 : e * 1000] - new_audio_name = "new_" + str(s) + ".wav" - newAudio.export(new_audio_name, format="wav") - waveform, sample_rate = torchaudio.load(new_audio_name) - resampler = torchaudio.transforms.Resample( - sample_rate, SAMPLING_RATE, dtype=waveform.dtype - ) - resampled_waveform = resampler(waveform) - torchaudio.save("resampled.wav", resampled_waveform, SAMPLING_RATE) +# newAudio = AudioSegment.from_wav("output.wav") +# newAudio = newAudio[s * 1000 : e * 1000] +# new_audio_name = "new_" + str(s) + ".wav" +# newAudio.export(new_audio_name, format="wav") +# waveform, sample_rate = torchaudio.load(new_audio_name) +# resampler = torchaudio.transforms.Resample( +# sample_rate, SAMPLING_RATE, dtype=waveform.dtype +# ) +# resampled_waveform = resampler(waveform) +# torchaudio.save("resampled.wav", resampled_waveform, SAMPLING_RATE) - audio = whisperx.load_audio("resampled.wav") - result = model.transcribe(audio, batch_size=16) - model_a, metadata = whisperx.load_align_model( - language_code=target_lang, device="cuda" - ) +# audio = whisperx.load_audio("resampled.wav") +# result = model.transcribe(audio, batch_size=16) +# model_a, metadata = whisperx.load_align_model( +# language_code=target_lang, device="cuda" +# ) - result = whisperx.align( - result["segments"], - model_a, - metadata, - audio, - "cuda", - return_char_alignments=False, - ) +# result = whisperx.align( +# result["segments"], +# model_a, +# metadata, +# audio, +# "cuda", +# return_char_alignments=False, +# ) - for segment in result["segments"]: - obj = { - "start": segment["start"] + s, - "end": segment["end"] + s, - "text": segment["text"], - } - print(obj) - yield json.dumps(obj) +# for segment in result["segments"]: +# obj = { +# "start": segment["start"] + s, +# "end": segment["end"] + s, +# "text": segment["text"], +# } +# print(obj) +# yield json.dumps(obj) - return StreamingResponse(generate(), media_type="text/event-stream") +# return StreamingResponse(generate(), media_type="text/event-stream") - except Exception as e: - print(e) - logging.critical(e, exc_info=True) - return {"message": "Internal server error", "code": 
500}
diff --git a/ui/src/audioUtils.js b/ui/src/audioUtils.js
index 5c6d7b3..da851dc 100644
--- a/ui/src/audioUtils.js
+++ b/ui/src/audioUtils.js
@@ -15,7 +15,7 @@ const blobToBase64 = (blob) => {
   });
 };
 
-export const getAudioDetails = (file) => {
+export const getAudioDetails = async (file, requestData) => {
   // Read the audio file
   const reader = new FileReader();
   reader.readAsArrayBuffer(file);
@@ -44,9 +44,20 @@ export const getAudioDetails = (file) => {
 
     // Send data to API
    try {
-      const response = await axios.post("YOUR_API_ENDPOINT", {
-        audio: base64Data,
-      });
-      console.log("API Response:", response.data);
+      const response = await fetch(
+        "https://aldrinjenson--seamless-m4t-speech-detect-language.modal.run",
+        {
+          method: "POST",
+          // body: JSON.stringify(requestData),
+          body: JSON.stringify({
+            wav_base64: base64Data,
+          }),
+          headers: {
+            "Content-Type": "application/json",
+          },
+        }
+      );
+      const data = await response.json();
+      console.log("API Response:", data);
     } catch (error) {
       console.error("Error sending data to API:", error);
diff --git a/ui/src/components/generate/UploadFile.jsx b/ui/src/components/generate/UploadFile.jsx
index b74e491..516011a 100644
--- a/ui/src/components/generate/UploadFile.jsx
+++ b/ui/src/components/generate/UploadFile.jsx
@@ -90,18 +90,22 @@ const UploadFile = ({
       selectedModel
     );
 
-    const audioDetails = getAudioDetails(uploadedFile);
-    console.log(audioDetails);
-    return null;
+    // const audioDetails = await getAudioDetails(uploadedFile, requestData);
+    // console.log(audioDetails);
+    // reset(false);
+    // return null;
 
    const toastId = toast.info("Uploading..");
 
-    fetch(url, {
-      method: "POST",
-      body: JSON.stringify(requestData),
-      headers: {
-        "Content-Type": "application/json",
-      },
-    })
+    fetch(
+      "https://aldrinjenson--seamless-m4t-speech-seamless-generate-speech.modal.run",
+      {
+        method: "POST",
+        body: JSON.stringify({ ...requestData, type: "file" }),
+        headers: {
+          "Content-Type": "application/json",
+        },
+      }
+    )
      .then(async (res) => {
        if (res?.code === 500) throw new Error("Internal Server Error");