diff --git a/app.py b/app.py
index 79234c8..0346bfe 100644
--- a/app.py
+++ b/app.py
@@ -349,147 +349,67 @@
 import os
 import numpy as np
 import streamlit as st
-import torch
-import whisper
 import librosa
 import io
 import openai
-from pytube import YouTube
+# Import configuration and helper functions from the new modules
+from config import openai_api_key, model_id, model_path
+from load_model import load_model
+from transcribe_audio import transcribe_audio
+from extract_entities import extract_entities
+from translate_text import translate_text

+# Load environment variables
 load_dotenv()

-# Set your OpenAI API key
-openai.api_key = os.getenv("OPENAI_KEY")
-
-# Function to extract entities using OpenAI API
-def extract_entities(text):
-    prompt = f"""
-    The following entities are present in Indian Languages.
-    Please extract the following entities from the text.
-    Provide entities for both in English and the original language in a structured format:
-
-    Text: "{text}"
-
-    - Name:
-    - Phone Numbers:
-    - Addresses:
-    - Email:
-    """
-    response = openai.chat.completions.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
-            {"role": "user", "content": prompt}
-        ],
-        max_tokens=200
-    )
-    entities_text = response.choices[0].message.content
-
-    return entities_text
-
-# Function to translate text from Indian languages to English using OpenAI GPT-3.5-turbo
-def translate_text(text, source_language):
-    prompt = f"Translate the following text from {source_language} to English:\n\n{text}"
-    response = openai.chat.completions.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant that translates text from Indian languages to English."},
-            {"role": "user", "content": prompt}
-        ],
-        max_tokens=150
-    )
-    translated_text = response.choices[0].message.content
-
-    return translated_text
-
-# Use the cache decorator from Streamlit
-@st.cache(allow_output_mutation=True)
-def load_model(model_id, model_path):
-    # Define available device (CPU/GPU)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    # Load model on available device
-    model = whisper.load_model(model_id, device=device, download_root=model_path)
-
-    # Display model's parameters in the app's logs
-    print(
-        f"Model will be run on {device}\n"
-        f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
-        f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
-    )
-
-    return model
-
-def download_audio_from_youtube(url):
-    yt = YouTube(url)
-    audio_stream = yt.streams.filter(only_audio=True).first()
-    audio_file = audio_stream.download(filename='audio.mp4')
-    return audio_file
+# Set the OpenAI API key
+openai.api_key = openai_api_key

+# Main function to run the Streamlit app
 def main():
-    # Display title
-    st.title("Whisper - Speech to Text App")
-
-    # Set up environment variables for model ID and path
-    model_id = os.environ.get('MODEL_ID', 'small')  # Use a smaller model for quicker transcription by default
-    model_path = os.environ.get('MODEL_PATH', 'whisper_model')  # Default path if not set
+    st.title("Speech to Text App")

+    # Load the Whisper model
     model = load_model(model_id, model_path)

-    # Add a selectbox for language selection
-    languages = ['hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'pa']  # List of Indian language codes
-    language = st.selectbox("Select the language of the audio file", languages, index=0)
+    # Language selection dropdown (no longer needed: Whisper auto-detects the language)
+    #languages = ['hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'pa']
+    #language = st.selectbox("Select the language of the audio file", languages, index=0)

-    # Option to upload audio file or provide YouTube link
-    st.write("Upload an audio file or provide a YouTube link:")
-    audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
-    youtube_link = st.text_input("Or enter a YouTube link")
-
-    transcript = {"text": "The audio file could not be transcribed :("}
-    options = dict(beam_size=5, best_of=5, language=language)
-    transcribe_options = dict(task="transcribe", **options)
+    # File uploader for audio files
+    st.write("Upload an audio file:")
+    audio_file = st.file_uploader("Select an audio file", type=["mp3", "wav"])

     audio_data = None
     if audio_file:
-        # Read file content
+        # Process the uploaded audio file
+        st.write("We are extracting these entities:\n\n- Name\n- Phone Numbers\n- Addresses\n- Email\n- PIN Code\n- Occupation\n- Gender")
         audio_bytes = audio_file.read()
         st.audio(audio_bytes)
-
-        # Convert bytes to a file-like object using io.BytesIO
         audio_file = io.BytesIO(audio_bytes)
-
-        # Convert to numpy array
-        audio_data, _ = librosa.load(audio_file, sr=16000)  # Load with target sample rate of 16000 for Whisper
-
-    elif youtube_link:
         try:
-            audio_file = download_audio_from_youtube(youtube_link)
-            st.audio(audio_file)
-
-            # Load audio file using librosa
-            audio_data, _ = librosa.load(audio_file, sr=16000)
+            # Load at 16 kHz, the sample rate Whisper expects
+            audio_data, _ = librosa.load(audio_file, sr=16000)
         except Exception as e:
-            st.error(f"Error downloading audio from YouTube: {e}")
+            st.error(f"Error loading audio file: {e}")

-    # Transcribe audio on button click
+    # Perform transcription, entity extraction, and translation on button click
     if audio_data is not None and st.button("Transcribe"):
         with st.spinner("Transcribing audio..."):
-            transcript = model.transcribe(audio_data, **transcribe_options)
-            transcription_text = transcript["text"]
+            transcription_text = transcribe_audio(model, audio_data)
             st.write(transcription_text)

-        # Extract entities from the transcription text
         with st.spinner("Extracting entities..."):
             entities = extract_entities(transcription_text)
             st.write("Extracted Entities:")
             st.write(entities)

-        # Translate transcription to English
         with st.spinner("Translating to English..."):
-            translated_text = translate_text(transcription_text, language)
+            translated_text = translate_text(transcription_text)
             st.write("Translated Text:")
             st.write(translated_text)

+# Entry point of the script
 if __name__ == "__main__":
     main()
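Note: app.py and the new config module (below) read all of their settings from environment variables. For local runs, a minimal .env along these lines should suffice; the key value is a placeholder, and MODEL_ID / MODEL_PATH fall back to the defaults in config.py when omitted:

    OPENAI_KEY=<your-openai-api-key>
    MODEL_ID=large-v3
    MODEL_PATH=whisper_model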
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..640764f
--- /dev/null
+++ b/config.py
@@ -0,0 +1,8 @@
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+openai_api_key = os.getenv("OPENAI_KEY")
+model_id = os.getenv('MODEL_ID', 'large-v3')
+model_path = os.getenv('MODEL_PATH', 'whisper_model')
diff --git a/download_whisper.py b/download_whisper.py
index 87f5933..888246b 100644
--- a/download_whisper.py
+++ b/download_whisper.py
@@ -3,8 +3,7 @@
 model_path = "whisper_model"
 model_id = 'large-v3'
-
 # Ensure the directory exists
 os.makedirs(model_path, exist_ok=True)

 # Download model
diff --git a/extract_entities.py b/extract_entities.py
new file mode 100644
index 0000000..f7b699c
--- /dev/null
+++ b/extract_entities.py
@@ -0,0 +1,32 @@
+import openai
+from config import openai_api_key
+
+openai.api_key = openai_api_key
+
+def extract_entities(text):
+    prompt = f"""
+    The following text is in an Indian language.
+    Please extract the entities listed below from the text.
+    Provide each entity in both English and the original language of the audio, in a structured format:
+
+    Text: "{text}"
+
+    - Name:
+    - Phone Numbers:
+    - Addresses:
+    - Email:
+    - PIN Code:
+    - Occupation:
+    - Gender:
+    """
+    response = openai.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
+            {"role": "user", "content": prompt}
+        ],
+        max_tokens=200
+    )
+    entities_text = response.choices[0].message.content
+
+    return entities_text
diff --git a/load_model.py b/load_model.py
new file mode 100644
index 0000000..f2a3417
--- /dev/null
+++ b/load_model.py
@@ -0,0 +1,17 @@
+import torch
+import whisper
+import numpy as np
+import streamlit as st
+
+@st.cache(allow_output_mutation=True)
+def load_model(model_id, model_path):
+    # Prefer the GPU when one is available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = whisper.load_model(model_id, device=device, download_root=model_path)
+    # Log the model's device, language support, and parameter count
+    print(
+        f"Model will be run on {device}\n"
+        f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
+        f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
+    )
+    return model
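One review note on load_model.py, not part of the diff: st.cache(allow_output_mutation=True) is deprecated on recent Streamlit releases and removed in the newest ones. If the app targets Streamlit >= 1.18, a sketch of the equivalent using st.cache_resource:

    import torch
    import whisper
    import streamlit as st

    @st.cache_resource  # replaces @st.cache(allow_output_mutation=True)
    def load_model(model_id: str, model_path: str):
        # Load Whisper once per process and reuse it across Streamlit reruns
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return whisper.load_model(model_id, device=device, download_root=model_path)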
diff --git a/requirements.txt b/requirements.txt
index 8270890..d2a2ab1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
 openai-whisper
 streamlit
 librosa
 pytest
+openai
+python-dotenv
diff --git a/test_app.py b/test_app.py
index 0cba0e2..657fd53 100644
--- a/test_app.py
+++ b/test_app.py
@@ -1,38 +1,10 @@
 import pytest
-import whisper
-import librosa
-import io
+import os
 import numpy as np
-from pytube import YouTube
 import openai
-from app import extract_entities, load_model, download_audio_from_youtube, translate_text
-import os
-
-# Mocking a simple function to simulate the `load_model` behavior
-# def test_load_model():
-#     model_id = "small"
-#     model_path = "whisper_model"
-#     model = load_model(model_id, model_path)
-#     assert model is not None, "Model should be loaded successfully"
-
-# # Mocking a simple function to simulate the `transcribe` behavior
-# def test_transcribe_audio():
-#     model_id = "small"
-#     model_path = "whisper_model"
-#     model = load_model(model_id, model_path)
-
-#     # Load a sample audio file (you need to provide a valid audio file path here)
-#     audio_file_path = r"C:\Users\Josh\whisper\audio.mp4"
-#     audio_data, _ = librosa.load(audio_file_path, sr=16000)
-
-#     options = dict(beam_size=5, best_of=5, language="en")
-#     transcribe_options = dict(task="transcribe", **options)
-#     transcript = model.transcribe(audio_data, **transcribe_options)
-
-#     assert "text" in transcript, "Transcript should contain 'text' key"
-#     assert transcript["text"] != "", "Transcription text should not be empty"
-
+from extract_entities import extract_entities
+from load_model import load_model
+from translate_text import translate_text

 # Mocking a simple function to simulate the `extract_entities` behavior
 def test_extract_entities():
@@ -44,23 +14,9 @@ def test_extract_entities():
     assert "Email:" in entities, "Extracted entities should contain 'Email'"
     assert "Addresses:" in entities, "Extracted entities should contain 'Addresses'"

-def test_download_audio_from_youtube():
-    youtube_link = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Example YouTube link
-    audio_file = download_audio_from_youtube(youtube_link)
-    assert audio_file is not None, "Audio file should be downloaded successfully"
-    assert os.path.exists(audio_file), "Downloaded audio file should exist"
-
-def test_invalid_youtube_url():
-    invalid_youtube_link = "https://www.youtube.com/watch?v=invalid"  # Example invalid YouTube link
-    try:
-        audio_file = download_audio_from_youtube(invalid_youtube_link)
-        assert False, "Function should raise an exception for invalid YouTube URL"
-    except Exception as e:
-        assert isinstance(e, Exception), "Function should raise an exception for invalid YouTube URL"
-
-def test_transcribe_empty_audio():
+def test_transcribe_empty_audio(tmp_path):
     model_id = "small"
-    model_path = "whisper_model"
+    model_path = tmp_path  # Use a temporary path for testing
     model = load_model(model_id, model_path)

     # Create an empty audio array
@@ -75,40 +31,101 @@ def test_transcribe_empty_audio():
     except Exception as e:
         assert isinstance(e, Exception), "Function should raise an exception for empty audio data"

+# New test cases for audio format and size handling
+def test_audio_format(tmp_path):
+    # Write a mock audio file in .wav format
+    audio_file_wav = os.path.join(tmp_path, "test_audio.wav")
+    with open(audio_file_wav, "wb") as f:
+        f.write(b"Mock audio content in .wav format")
+    # Test the .wav extension
+    assert audio_file_wav.endswith(".wav"), "Audio file should be in .wav format"

-import unittest
-from unittest.mock import patch, MagicMock
-
-class TestWhisperApp(unittest.TestCase):
-
-    @patch('app.openai.chat.completions.create')
-    def test_extract_entities(self, mock_openai):
-        mock_openai.return_value.choices[0].message.content = "Entities: Name: John, Phone Numbers: 1234567890, Addresses: 123 Main St, Email: john@example.com"
-        text = "Sample text for entity extraction."
-
-        entities = extract_entities(text)
-
-        # Assertions
-        self.assertIn("Entities:", entities)
-        self.assertIn("Name: John", entities)
-        self.assertIn("Phone Numbers: 1234567890", entities)
-        self.assertIn("Addresses: 123 Main St", entities)
-        self.assertIn("Email: john@example.com", entities)
-
-    @patch('app.openai.chat.completions.create')
-    def test_translate_text(self, mock_openai):
-        mock_openai.return_value.choices[0].message.content = "Translated text in English."
-        text = "Sample text for translation."
-        source_language = "hi"  # Assuming translating from Hindi
-
-        translated_text = translate_text(text, source_language)
-
-        # Assertions
-        self.assertEqual(translated_text, "Translated text in English.")
-
-# Additional test cases can be added for edge cases, error handling, and performance if needed.
-if __name__ == '__main__':
-    unittest.main()
+def test_audio_file_size(tmp_path):
+    # Write a mock audio file with a known payload
+    audio_file = os.path.join(tmp_path, "test_audio.mp3")
+    mock_audio_size = 1024  # repeat the payload 1024 times
+    with open(audio_file, "wb") as f:
+        f.write(b"Mock audio content with specific size" * mock_audio_size)
+
+    # The written file should be non-empty
+    file_size = os.path.getsize(audio_file)
+    assert file_size > 0, "Audio file size should be greater than 0 bytes"
+
+# New test case for handling OpenAI key expiration
+def test_openai_key_expiration():
+    # Back up the current OpenAI key
+    backup_key = openai.api_key
+    # Set an invalid key to simulate expiration
+    openai.api_key = "invalid_key"
+    try:
+        with pytest.raises(Exception):
+            # translate_text now takes only the text; the source language is detected by the model
+            translate_text("Sample text for translation.")
+    finally:
+        # Restore the original OpenAI key
+        openai.api_key = backup_key
+
+# Ensure an OpenAI key is initialized for the tests
+@pytest.fixture(autouse=True)
+def setup_openai_key():
+    openai.api_key = os.getenv("OPENAI_KEY")  # read the key from the environment
+
+# Expose a temporary directory to class-based tests, if any are added later
+@pytest.fixture(scope="function", autouse=True)
+def temp_directory_for_test(request, tmp_path):
+    if request.cls is not None:
+        request.cls.tmp_path = tmp_path
+
+# If running pytest directly, include this block
+if __name__ == "__main__":
+    pytest.main()
+
+# The previously mocked `load_model` / `transcribe` tests are kept below for reference:
+
+# def test_load_model():
+#     model_id = "small"
+#     model_path = "whisper_model"
+#     model = load_model(model_id, model_path)
+#     assert model is not None, "Model should be loaded successfully"
+
+# def test_transcribe_audio():
+#     model_id = "small"
+#     model_path = "whisper_model"
+#     model = load_model(model_id, model_path)
+
+#     # Load a sample audio file (a valid audio file path must be provided here)
+#     audio_file_path = r"C:\Users\Josh\whisper\audio.mp4"
+#     audio_data, _ = librosa.load(audio_file_path, sr=16000)
+
+#     options = dict(beam_size=5, best_of=5, language="en")
+#     transcribe_options = dict(task="transcribe", **options)
+#     transcript = model.transcribe(audio_data, **transcribe_options)
+
+#     assert "text" in transcript, "Transcript should contain 'text' key"
+#     assert transcript["text"] != "", "Transcription text should not be empty"
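The old @patch-based tests were dropped rather than ported, so test_extract_entities now hits the live API. A possible offline variant, assuming the same response shape the modules already rely on (a sketch, not part of the diff; the test name is illustrative):

    from unittest.mock import MagicMock, patch

    from extract_entities import extract_entities

    def test_extract_entities_offline():
        # Build a fake response matching response.choices[0].message.content
        fake_response = MagicMock()
        fake_response.choices[0].message.content = "- Name: John\n- Email: john@example.com"
        # Patch the OpenAI call as seen from the extract_entities module
        with patch("extract_entities.openai.chat.completions.create", return_value=fake_response):
            entities = extract_entities("Sample text for entity extraction.")
        assert "Name: John" in entities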
diff --git a/transcribe_audio.py b/transcribe_audio.py
new file mode 100644
index 0000000..beb3623
--- /dev/null
+++ b/transcribe_audio.py
@@ -0,0 +1,6 @@
+def transcribe_audio(model, audio_data):
+    # Decode with beam search; the language is auto-detected by Whisper
+    options = dict(beam_size=5, best_of=5)
+    transcribe_options = dict(task="transcribe", **options)
+    transcript = model.transcribe(audio_data, **transcribe_options)
+    return transcript["text"]
diff --git a/translate_text.py b/translate_text.py
new file mode 100644
index 0000000..178cfa7
--- /dev/null
+++ b/translate_text.py
@@ -0,0 +1,17 @@
+import openai
+from config import openai_api_key
+
+openai.api_key = openai_api_key
+
+def translate_text(text):
+    prompt = f"Translate the following text from its original language to English:\n\n{text}"
+    response = openai.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant that translates text from Indian languages to English."},
+            {"role": "user", "content": prompt}
+        ],
+        max_tokens=150
+    )
+    translated_text = response.choices[0].message.content
+    return translated_text
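For reviewers who want to exercise the refactor end to end without the UI, a quick smoke run of the new modules (a sketch, assuming a local sample.wav — the filename is illustrative — and a valid OPENAI_KEY in the environment):

    import librosa

    from config import model_id, model_path
    from load_model import load_model
    from transcribe_audio import transcribe_audio
    from extract_entities import extract_entities
    from translate_text import translate_text

    # Whisper expects 16 kHz mono float32 audio, which librosa produces by default
    audio, _ = librosa.load("sample.wav", sr=16000)

    model = load_model(model_id, model_path)
    text = transcribe_audio(model, audio)
    print("Transcript:", text)
    print("Entities:", extract_entities(text))
    print("Translation:", translate_text(text))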